# Create the database for the selected repeat stations

## Description

This program does the following: 
- It calculates the distance (in km) between every pair of selected repeat stations
- It creates the database file for the selected repeat stations

In [1]:
# Import modules
import mestrado_module as mm
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# All locations and file names come from the project module `mm`; each one is
# wrapped in a Path so it can be joined with the `/` operator further down.

# Repeat station IGRF database info
# (folder + file name of the complete database read in the "Read the data" section)
rs_igrf_folder: Path = Path(mm.path_pipeline_04_igrf_calc)
rs_igrf_file: Path = Path(mm.output_4e_code_complete_rs_igrf_database)

# Folium file
# (CSV listing the selected repeat stations; it is the first file read below)
folium_file_folder: Path = Path(mm.path_pipeline_07_select_rs_geo_distribution)
folium_file: Path = Path(mm.output_7a_code_selected_rs_folium)

# Output file
# NOTE: output_folder is built from the same `mm` attribute as folium_file_folder,
# so this notebook writes its results next to the Folium input file.
output_folder: Path = Path(mm.path_pipeline_07_select_rs_geo_distribution)
# Filtered database containing only the selected stations
selected_rs_database_file: Path = Path(mm.output_7b_code_selected_rs_db)
# Reference table with a single row per selected station
selected_database_table_ref: Path = Path(mm.output_7b_code_selected_rs_table)
# Station-to-station distance table
distances_file: Path = Path(mm.output_7b_code_distance_file)

## Read the data

In [3]:
# Load the Folium file with the selected repeat stations, discard its
# "geometry" column and put the rows in ascending order of the Code column.
# reset_index() is called without drop=True, so the pre-sort row labels are
# kept as an extra "index" column in the result.
df_folium = (
    pd.read_csv(folium_file_folder.joinpath(folium_file))
    .drop(columns=["geometry"])
    .sort_values(by="Code", ascending=True)
    .reset_index()
)
df_folium.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     50 non-null     int64  
 1   Code                      50 non-null     object 
 2   Lat_dd                    50 non-null     float64
 3   Lon_dd                    50 non-null     float64
 4   Alt_m                     50 non-null     float64
 5   Time_dy                   50 non-null     float64
 6   D_dd                      50 non-null     float64
 7   IGRF_D_dd                 50 non-null     float64
 8   I_dd                      50 non-null     float64
 9   IGRF_I_dd                 50 non-null     float64
 10  F_nT                      50 non-null     float64
 11  IGRF_F_nT                 50 non-null     float64
 12  H_nT                      50 non-null     float64
 13  IGRF_H_nT                 50 non-null     float64
 14  X_nT        

In [4]:
# Load the complete repeat station IGRF database and summarise its columns
df_rs_igrf = pd.read_csv(rs_igrf_folder.joinpath(rs_igrf_file))
df_rs_igrf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082 entries, 0 to 1081
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Code                      1082 non-null   object 
 1   Lat_dd                    1082 non-null   float64
 2   Lon_dd                    1082 non-null   float64
 3   Alt_m                     1082 non-null   float64
 4   Time_dy                   1082 non-null   float64
 5   D_dd                      1082 non-null   float64
 6   IGRF_D_dd                 1082 non-null   float64
 7   I_dd                      1082 non-null   float64
 8   IGRF_I_dd                 1082 non-null   float64
 9   F_nT                      1082 non-null   float64
 10  IGRF_F_nT                 1082 non-null   float64
 11  H_nT                      1082 non-null   float64
 12  IGRF_H_nT                 1082 non-null   float64
 13  X_nT                      1082 non-null   float64
 14  IGRF_X_n

## Calculate the distances (km) between each selected repeat station

In [5]:
# Work on an independent copy so nothing done here can leak back into df_folium
calc_distances = df_folium.copy()

# Station codes, in the row order of the selection, and how many there are
list_stations = calc_distances.Code.unique()
list_stations_size = len(list_stations)
print(list_stations_size)

# The table below pairs row label i with list_stations[i]. That pairing is only
# valid when every row is a distinct station, so stop with a clear message
# instead of writing a misaligned distance table.
if list_stations_size != len(calc_distances):
    raise ValueError(
        f"Expected one row per station code: found {len(calc_distances)} rows "
        f"for {list_stations_size} unique codes"
    )

# Empty square table: one column per station code, one row per station.
# The codes are passed directly; wrapping them in a list ([list_stations])
# makes pandas build a one-level MultiIndex whose labels are tuples.
rs_distances = pd.DataFrame(columns=list_stations, index=range(list_stations_size))

# Fill the table one column at a time: column `station_code` holds the distance
# from that station to every station in calc_distances (result in km, as noted
# by the original author; arguments are passed as lon, lat, lon, lat).
# TODO: replace the loop with a single vectorised call if mm.haversine_array
# supports broadcasting.
for i, station_code in enumerate(list_stations):
    rs_distances[station_code] = mm.haversine_array(
        calc_distances["Lon_dd"],
        calc_distances["Lat_dd"],
        calc_distances.at[i, "Lon_dd"],
        calc_distances.at[i, "Lat_dd"],
    )

# Add a leading column with the station codes to serve as a visual row index
rs_distances.insert(loc=0, column="Stations (distances in km)", value=list_stations)

# Save it to a file
rs_distances.to_csv(output_folder / distances_file, index=False, float_format="%.3f", na_rep="NaN")

50


In [6]:
# Rank the stations by their distance to one reference station, nearest first.
# - chosen_station: one-item list holding the code of the reference station
#   ([0] picks the first station/column of the distance table).
# - rows: how many stations to keep in the ranking (here: all of them).
chosen_station = [list_stations[0]]
rows = list_stations_size

# nsmallest() sorts by column labels, so hand it the labels of the sub-frame
# selected by chosen_station.
lowest_dist = rs_distances.nsmallest(
    n=rows,
    columns=list(rs_distances[chosen_station].columns),
)

## Create a list with the selected repeat stations to use in the database creation

In [7]:
# Codes of the selected repeat stations, in the row order of df_folium
selected_rs_list = df_folium.Code.to_list()
selected_rs_list_size = len(selected_rs_list)

## Create the dataframe to hold the selected repeat stations database

In [8]:
# Keep only the database rows whose Code belongs to a selected station and
# renumber them from 0 (the old row labels are discarded).
is_selected = df_rs_igrf["Code"].isin(selected_rs_list)
selected_df = df_rs_igrf.loc[is_selected].reset_index(drop=True)

# Write the filtered database to disk
selected_df.to_csv(
    output_folder.joinpath(selected_rs_database_file),
    index=False,
    float_format="%.3f",
    na_rep="NaN",
)

# Summarise the result
selected_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447 entries, 0 to 446
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Code                      447 non-null    object 
 1   Lat_dd                    447 non-null    float64
 2   Lon_dd                    447 non-null    float64
 3   Alt_m                     447 non-null    float64
 4   Time_dy                   447 non-null    float64
 5   D_dd                      447 non-null    float64
 6   IGRF_D_dd                 447 non-null    float64
 7   I_dd                      447 non-null    float64
 8   IGRF_I_dd                 447 non-null    float64
 9   F_nT                      447 non-null    float64
 10  IGRF_F_nT                 447 non-null    float64
 11  H_nT                      447 non-null    float64
 12  IGRF_H_nT                 447 non-null    float64
 13  X_nT                      447 non-null    float64
 14  IGRF_X_nT 

## Create the reference table with one row per selected repeat station

In [9]:
# Display the selected-stations database (the rendering elides the middle rows and columns)
selected_df

Unnamed: 0,Code,Lat_dd,Lon_dd,Alt_m,Time_dy,D_dd,IGRF_D_dd,I_dd,IGRF_I_dd,F_nT,...,RMSE_D_Original_values,RMSE_I_Original_values,RMSE_F_Original_values,RMSE_H_Original_values,RMSE_X_Original_values,RMSE_X_Calculated_values,RMSE_Y_Original_values,RMSE_Y_Calculated_values,RMSE_Z_Original_values,RMSE_Z_Calculated_values
0,AC_CZS,-7.637,-72.670,182.464,1958.529,2.683,2.783,11.281,11.167,29671.0,...,0.178,0.204,50.308,36.257,37.605,37.680,83.892,84.130,105.801,106.271
1,AC_CZS,-7.637,-72.670,182.464,1965.848,1.824,1.917,11.277,10.917,29227.0,...,0.178,0.204,50.308,36.257,37.605,37.680,83.892,84.130,105.801,106.271
2,AC_CZS,-7.620,-72.670,195.600,1978.640,-0.035,0.150,11.026,10.800,28359.0,...,0.178,0.204,50.308,36.257,37.605,37.680,83.892,84.130,105.801,106.271
3,AC_CZS,-7.599,-72.770,196.508,1986.279,-1.343,-1.083,10.685,10.550,27886.0,...,0.178,0.204,50.308,36.257,37.605,37.680,83.892,84.130,105.801,106.271
4,AC_CZS,-7.599,-72.770,196.508,1989.503,-1.847,-1.633,10.468,10.317,27683.0,...,0.178,0.204,50.308,36.257,37.605,37.680,83.892,84.130,105.801,106.271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,RS_SMA,-29.683,-53.823,107.905,1960.208,-6.616,-6.333,-26.516,-26.533,24396.0,...,0.241,0.197,51.595,68.178,69.163,69.214,89.968,90.235,76.267,75.942
443,RS_SMA,-29.687,-53.835,90.518,1965.338,-7.381,-7.300,-27.114,-27.367,24200.0,...,0.241,0.197,51.595,68.178,69.163,69.214,89.968,90.235,76.267,75.942
444,RS_SMA,-29.687,-53.835,90.518,1976.218,-9.126,-8.967,-28.829,-29.067,23735.0,...,0.241,0.197,51.595,68.178,69.163,69.214,89.968,90.235,76.267,75.942
445,RS_SMA,-29.717,-53.700,85.200,1982.249,-10.361,-10.067,-30.357,-30.350,23521.0,...,0.241,0.197,51.595,68.178,69.163,69.214,89.968,90.235,76.267,75.942


In [10]:
# Reduce the database to a single row per station: for every Code keep the
# last row as it appears in selected_df, then renumber the rows from 0.
table_df = (
    selected_df
    .drop_duplicates(subset="Code", keep="last")
    .reset_index(drop=True)
)
table_df

Unnamed: 0,Code,Lat_dd,Lon_dd,Alt_m,Time_dy,D_dd,IGRF_D_dd,I_dd,IGRF_I_dd,F_nT,...,RMSE_D_Original_values,RMSE_I_Original_values,RMSE_F_Original_values,RMSE_H_Original_values,RMSE_X_Original_values,RMSE_X_Calculated_values,RMSE_Y_Original_values,RMSE_Y_Calculated_values,RMSE_Z_Original_values,RMSE_Z_Calculated_values
0,AC_CZS,-7.599,-72.77,196.508,1995.869,-2.901,-2.75,9.951,9.833,27342.0,...,0.178,0.204,50.308,36.257,37.605,37.68,83.892,84.13,105.801,106.271
1,AC_RBC,-9.996,-67.802,136.304,2002.927,-6.048,-7.167,3.676,3.667,25681.0,...,0.56,0.291,86.356,86.354,94.392,94.667,259.678,259.546,141.101,141.442
2,AM_BAR,-0.98,-62.922,31.832,2005.861,-12.522,-12.567,16.575,16.583,28071.0,...,0.163,0.222,130.192,72.745,7435.109,72.709,91.531,83.793,184.312,168.085
3,AM_BJC,-4.367,-70.05,73.16,1984.706,-3.858,-3.483,16.61,16.567,28921.0,...,0.245,0.263,62.099,42.024,43.905,43.974,119.56,119.754,149.236,149.702
4,AM_CRA,-4.878,-66.895,75.256,1995.906,-7.881,-7.7,13.293,13.617,27632.0,...,0.222,0.236,31.757,44.168,41.469,41.376,106.824,107.543,110.291,109.858
5,AM_MAN,-2.93,-59.975,90.7,2005.76,-14.324,-14.3,11.314,11.133,27065.0,...,0.327,0.208,128.662,133.015,4074.033,97.073,171.983,168.497,143.647,141.949
6,AM_MNC_A,-5.795,-61.278,41.728,2005.908,-12.694,-13.2,6.885,7.033,25926.0,...,0.506,0.148,191.7,182.2,22128.1,126.184,,264.399,,89.371
7,AM_SGC,-0.115,-66.992,94.148,2005.878,-10.434,-9.883,19.019,20.267,28735.0,...,0.592,0.677,412.539,384.585,7568.77,393.817,281.119,281.445,277.701,371.984
8,AP_MCP,0.052,-51.068,12.899,2009.536,-19.242,-18.983,6.739,6.55,27225.0,...,0.255,0.272,86.898,98.288,9125.755,84.635,133.252,131.661,136.266,130.428
9,AP_OPQ,3.861,-51.796,32.563,2009.527,-18.518,-18.417,13.81,14.067,28598.0,...,0.104,0.24,50.522,28.261,9762.055,33.715,48.235,47.904,127.523,128.145


In [11]:
# Write the one-row-per-station reference table to disk
table_df.to_csv(
    output_folder.joinpath(selected_database_table_ref),
    index=False,
    float_format="%.3f",
    na_rep="NaN",
)