# Comparison between the original values of the repeat stations, the calculated ones and the IGRF

## Description

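This program does the following:
- It calculates, for each repeat station, the RMSE of both datasets relative to the IGRF: the original repeat station values (D, I, F, H, X, Y and Z) and the calculated values (X, Y and Z)
- It saves the RMSE table and a final database in which the RMSE values are added to the repeat station and IGRF data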

In [1]:
# Import modules
import mestrado_module as mm
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

In [2]:
# Definitions for input and output (all folder and file names come from mestrado_module)
# Input: repeat station + IGRF database, read into `df` below
input_folder: Path = Path(mm.path_pipeline_04_igrf_calc)
input_file: Path = Path(mm.output_4d_code_rs_igrf_database)

# Station list: file with the station codes and their number of occupations
input_list_stations_folder: Path = Path(mm.path_pipeline_03_rs_database_creation)
input_list_stations_file: Path = Path(mm.output_3_code_ocp_list)

# Output: both files are written to the same pipeline folder the input is read from
output_folder: Path = Path(mm.path_pipeline_04_igrf_calc)
# Per-station RMSE table
rmse_file: Path = Path(mm.output_4d_code_error_database)
# Input database with the RMSE columns added
final_database: Path = Path(mm.output_4e_code_complete_rs_igrf_database)

## Read data

In [3]:
# Read the repeat station / IGRF database (CSV) into a DataFrame and display it
igrf_database_path = input_folder / input_file
df = pd.read_csv(igrf_database_path)
df

Unnamed: 0,Code,Lat_dd,Lon_dd,Alt_m,Time_dy,D_dd,IGRF_D_dd,I_dd,IGRF_I_dd,F_nT,...,Z_nT,IGRF_Z_nT,Calculated_Z,N_occupations,Closest_OBS,TTB_distances_km,VSS_distances_km,State,Region,RS_name
0,AC_CZS,-7.637,-72.670,182.464,1958.529,2.683,2.783,11.281,11.167,29671.0,...,5804.0,5735.2,5804.269,6,TTB,2770.494,3511.151,AC,N,CRUZEIRO DO SUL
1,AC_CZS,-7.637,-72.670,182.464,1965.848,1.824,1.917,11.277,10.917,29227.0,...,5715.0,5519.5,5715.412,6,TTB,2770.494,3511.151,AC,N,CRUZEIRO DO SUL
2,AC_CZS,-7.620,-72.670,195.600,1978.640,-0.035,0.150,11.026,10.800,28359.0,...,5423.0,5309.1,5423.784,6,TTB,2770.044,3512.127,AC,N,CRUZEIRO DO SUL
3,AC_CZS,-7.599,-72.770,196.508,1986.279,-1.343,-1.083,10.685,10.550,27886.0,...,5170.0,5109.1,5170.325,6,TTB,2780.199,3522.772,AC,N,CRUZEIRO DO SUL
4,AC_CZS,-7.599,-72.770,196.508,1989.503,-1.847,-1.633,10.468,10.317,27683.0,...,5029.0,4965.3,5029.623,6,TTB,2780.199,3522.772,AC,N,CRUZEIRO DO SUL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077,TO_PNL,-10.727,-48.408,240.299,1985.119,-18.142,-18.083,-7.084,-7.017,25130.0,...,-3099.0,-3070.7,-3099.141,9,TTB,1058.862,1393.116,TO,N,PORTO NACIONAL
1078,TO_PNL,-10.727,-48.408,240.299,1986.670,-18.325,-18.250,-7.556,-7.533,25083.0,...,-3298.0,-3282.4,-3298.293,9,TTB,1058.862,1393.116,TO,N,PORTO NACIONAL
1079,TO_PNL,-10.727,-48.408,240.299,1995.817,-19.315,-19.250,-10.562,-10.617,24778.0,...,-4541.0,-4551.7,-4541.793,9,TTB,1058.862,1393.116,TO,N,PORTO NACIONAL
1080,TO_PNL,-10.721,-48.401,256.835,2003.702,-20.117,-20.033,-13.008,-13.350,24526.0,...,-5520.0,-5656.0,-5520.486,9,TTB,1058.203,1393.470,TO,N,PORTO NACIONAL


In [4]:
# Load the station list (station codes and number of occupations) and summarise it
station_list_path = input_list_stations_folder / input_list_stations_file
st_list = pd.read_csv(station_list_path)
st_list.info()

# Keep the station codes and the occupation counts as separate pandas Series
list1, ocp_list = st_list["RS_code"], st_list["N_occupations"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   RS_code        218 non-null    object
 1   N_occupations  218 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.5+ KB


## RMSE calculation: Original repeat station values and IGRF

In [5]:
# One empty list per component (D, I, F, H, X, Y, Z); each will collect one
# RMSE value per station for the original repeat station values
(
    rmse_orig_d,
    rmse_orig_i,
    rmse_orig_f,
    rmse_orig_h,
    rmse_orig_x,
    rmse_orig_y,
    rmse_orig_z,
) = ([] for _ in range(7))

In [6]:
# RMSE between the IGRF values and the original repeat station values,
# computed per station for every component.
# Maps each measured column to the list that collects its per-station RMSE;
# the matching IGRF column is the same name prefixed with "IGRF_".
orig_targets = {
    "D_dd": rmse_orig_d,
    "I_dd": rmse_orig_i,
    "F_nT": rmse_orig_f,
    "H_nT": rmse_orig_h,
    "X_nT": rmse_orig_x,
    "Y_nT": rmse_orig_y,
    "Z_nT": rmse_orig_z,
}

# Empty the lists first so that re-running this cell does not append a second
# copy of every value (which would break the DataFrame built further below)
for rmse_values in orig_targets.values():
    rmse_values.clear()

# NOTE(review): the stored output shows a few very large RMSE_X_Original_values
# and some NaN entries - presumably missing values in the original columns;
# verify how mm.rmse treats them.
for station_code in list1:
    # All occupations (rows) of the current station
    aux_df = df[df["Code"] == station_code]

    for column, rmse_values in orig_targets.items():
        # One RMSE value per station and component, in the order of list1
        rmse_values.append(mm.rmse(aux_df[f"IGRF_{column}"], aux_df[column]))

## RMSE calculation: calculated repeat station values and IGRF

In [7]:
# One empty list per component (X, Y, Z); each will collect one RMSE value
# per station for the calculated repeat station values
rmse_calc_x, rmse_calc_y, rmse_calc_z = ([] for _ in range(3))

In [8]:
# RMSE between the IGRF values and the calculated repeat station values,
# computed per station for the X, Y and Z components.
# Maps each component letter to the list that collects its per-station RMSE.
calc_targets = {
    "X": rmse_calc_x,
    "Y": rmse_calc_y,
    "Z": rmse_calc_z,
}

# Empty the lists first so that re-running this cell does not append a second
# copy of every value (which would break the DataFrame built further below)
for rmse_values in calc_targets.values():
    rmse_values.clear()

for station_code in list1:
    # All occupations (rows) of the current station
    aux_df = df[df["Code"] == station_code]

    for component, rmse_values in calc_targets.items():
        # One RMSE value per station and component, in the order of list1
        rmse_values.append(
            mm.rmse(aux_df[f"IGRF_{component}_nT"], aux_df[f"Calculated_{component}"])
        )

## Create dataframe for rmse values

In [9]:
# Assemble the per-station RMSE table: station identification first, then one
# column per RMSE series (original and calculated values side by side for X, Y, Z)
rmse_data = {"Code": list1, "N_occupations": ocp_list}
rmse_data.update(
    {
        "RMSE_D_Original_values": rmse_orig_d,
        "RMSE_I_Original_values": rmse_orig_i,
        "RMSE_F_Original_values": rmse_orig_f,
        "RMSE_H_Original_values": rmse_orig_h,
        "RMSE_X_Original_values": rmse_orig_x,
        "RMSE_X_Calculated_values": rmse_calc_x,
        "RMSE_Y_Original_values": rmse_orig_y,
        "RMSE_Y_Calculated_values": rmse_calc_y,
        "RMSE_Z_Original_values": rmse_orig_z,
        "RMSE_Z_Calculated_values": rmse_calc_z,
    }
)

rmse_df = pd.DataFrame(rmse_data)

In [10]:
# Write the RMSE table to CSV: no index column, three decimals, missing values as "NaN"
rmse_output_path = output_folder / rmse_file
rmse_df.to_csv(
    rmse_output_path,
    index=False,
    float_format="%.3f",
    na_rep="NaN",
)

In [11]:
# Display the per-station RMSE table
rmse_df

Unnamed: 0,Code,N_occupations,RMSE_D_Original_values,RMSE_I_Original_values,RMSE_F_Original_values,RMSE_H_Original_values,RMSE_X_Original_values,RMSE_X_Calculated_values,RMSE_Y_Original_values,RMSE_Y_Calculated_values,RMSE_Z_Original_values,RMSE_Z_Calculated_values
0,AC_CZS,6,0.177516,0.203561,50.307985,36.257298,37.604632,37.679700,83.891785,84.130434,105.800606,106.271308
1,AC_EPC,2,0.064919,0.024166,78.274932,78.164090,80.021903,80.937389,19.965095,19.306849,8.364807,7.955991
2,AC_EPC_B,1,0.136000,1.312000,69.800000,69.000000,24291.700000,84.170000,,47.852000,,577.241000
3,AC_RBC,10,0.559982,0.291417,86.356314,86.353674,94.391610,94.667157,259.677999,259.545749,141.100804,141.442497
4,AL_MCO,9,0.096944,0.276068,45.029151,60.840694,10754.946081,57.374283,53.723866,52.397600,93.228215,113.125293
...,...,...,...,...,...,...,...,...,...,...,...,...
213,SP_SJC,8,0.060622,0.102059,38.687256,51.384446,15459.057472,51.131385,20.969144,21.191983,24.980259,25.256668
214,SP_SPO,8,0.174034,0.219800,19.917831,30.221267,36.663572,37.102613,65.320958,65.198164,88.852005,88.668589
215,SP_TAU,5,0.209223,0.081940,116.909290,100.131673,110.001282,109.525219,70.854146,71.323299,69.970880,69.680402
216,TO_PNL,9,0.113606,0.186383,68.931867,71.877782,59.999407,60.317905,64.367115,64.795929,81.497157,81.245395


## Create the final database: Repeat station, IGRF and RMSE values

In [12]:
# Start the final database from a copy of the loaded data, so that adding the
# RMSE columns does not modify `df` through a shared reference
df_final = df.copy()

# RMSE columns to add, in the order they appear in the final database
rmse_columns = [
    "RMSE_D_Original_values",
    "RMSE_I_Original_values",
    "RMSE_F_Original_values",
    "RMSE_H_Original_values",
    "RMSE_X_Original_values",
    "RMSE_X_Calculated_values",
    "RMSE_Y_Original_values",
    "RMSE_Y_Calculated_values",
    "RMSE_Z_Original_values",
    "RMSE_Z_Calculated_values",
]

# Add the columns with a float placeholder (0.0, not the integer 0): the
# columns are created with a float dtype, so the float RMSE values written
# later do not force an integer-to-float dtype change
for column in rmse_columns:
    df_final[column] = 0.0

In [13]:
# Replace the placeholder values in df_final: every row of a station receives
# the RMSE values computed for that station.
# Maps each RMSE column to its per-station values (same order as list1).
rmse_by_column = {
    "RMSE_D_Original_values": rmse_orig_d,
    "RMSE_I_Original_values": rmse_orig_i,
    "RMSE_F_Original_values": rmse_orig_f,
    "RMSE_H_Original_values": rmse_orig_h,
    "RMSE_X_Original_values": rmse_orig_x,
    "RMSE_X_Calculated_values": rmse_calc_x,
    "RMSE_Y_Original_values": rmse_orig_y,
    "RMSE_Y_Calculated_values": rmse_calc_y,
    "RMSE_Z_Original_values": rmse_orig_z,
    "RMSE_Z_Calculated_values": rmse_calc_z,
}

for position, station_code in enumerate(list1):
    # Build the row selection once per station and reuse it for all columns
    station_rows = df_final["Code"] == station_code
    for column, rmse_values in rmse_by_column.items():
        df_final.loc[station_rows, column] = rmse_values[position]


In [14]:
# Display the final database (repeat station, IGRF and RMSE values)
df_final

Unnamed: 0,Code,Lat_dd,Lon_dd,Alt_m,Time_dy,D_dd,IGRF_D_dd,I_dd,IGRF_I_dd,F_nT,...,RMSE_D_Original_values,RMSE_I_Original_values,RMSE_F_Original_values,RMSE_H_Original_values,RMSE_X_Original_values,RMSE_X_Calculated_values,RMSE_Y_Original_values,RMSE_Y_Calculated_values,RMSE_Z_Original_values,RMSE_Z_Calculated_values
0,AC_CZS,-7.637,-72.670,182.464,1958.529,2.683,2.783,11.281,11.167,29671.0,...,0.177516,0.203561,50.307985,36.257298,37.604632,37.679700,83.891785,84.130434,105.800606,106.271308
1,AC_CZS,-7.637,-72.670,182.464,1965.848,1.824,1.917,11.277,10.917,29227.0,...,0.177516,0.203561,50.307985,36.257298,37.604632,37.679700,83.891785,84.130434,105.800606,106.271308
2,AC_CZS,-7.620,-72.670,195.600,1978.640,-0.035,0.150,11.026,10.800,28359.0,...,0.177516,0.203561,50.307985,36.257298,37.604632,37.679700,83.891785,84.130434,105.800606,106.271308
3,AC_CZS,-7.599,-72.770,196.508,1986.279,-1.343,-1.083,10.685,10.550,27886.0,...,0.177516,0.203561,50.307985,36.257298,37.604632,37.679700,83.891785,84.130434,105.800606,106.271308
4,AC_CZS,-7.599,-72.770,196.508,1989.503,-1.847,-1.633,10.468,10.317,27683.0,...,0.177516,0.203561,50.307985,36.257298,37.604632,37.679700,83.891785,84.130434,105.800606,106.271308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077,TO_PNL,-10.727,-48.408,240.299,1985.119,-18.142,-18.083,-7.084,-7.017,25130.0,...,0.113606,0.186383,68.931867,71.877782,59.999407,60.317905,64.367115,64.795929,81.497157,81.245395
1078,TO_PNL,-10.727,-48.408,240.299,1986.670,-18.325,-18.250,-7.556,-7.533,25083.0,...,0.113606,0.186383,68.931867,71.877782,59.999407,60.317905,64.367115,64.795929,81.497157,81.245395
1079,TO_PNL,-10.727,-48.408,240.299,1995.817,-19.315,-19.250,-10.562,-10.617,24778.0,...,0.113606,0.186383,68.931867,71.877782,59.999407,60.317905,64.367115,64.795929,81.497157,81.245395
1080,TO_PNL,-10.721,-48.401,256.835,2003.702,-20.117,-20.033,-13.008,-13.350,24526.0,...,0.113606,0.186383,68.931867,71.877782,59.999407,60.317905,64.367115,64.795929,81.497157,81.245395


In [15]:
# Write the final database to CSV: no index column, three decimals, missing values as "NaN"
final_database_path = output_folder / final_database
df_final.to_csv(
    final_database_path,
    index=False,
    float_format="%.3f",
    na_rep="NaN",
)