# Rank the stations according to their number of occupations: creating the occupation-count groups

## Description

This program reads the Repeat station and IGRF database to do the following:
- It reads the data to separate the stations into groups of n occupations
- The groups are n12 (12 or more), n10 (10 to 11), n08 (8 to 9), n06 (6 to 7), n03 (3 to 5) and n01 (1 to 2)
- It creates a database file for each occupation group
- It creates a database file with only the last occupation of each station in order to plot these stations using folium (one coordinate for each station)

Attention: three stations were removed from the analysis because they are magnetically contaminated according to field reports from 2017 to 2020

In [1]:
# Import modules
import mestrado_module as mm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pathlib import Path
import geopandas as gpd
from shapely.geometry import Point, Polygon

In [2]:
# Input: folder and file name of the repeat station + IGRF database
# (both values come from the project module `mm`)
rs_igrf_folder: Path = Path(mm.path_pipeline_04_igrf_calc)
rs_igrf_file: Path = Path(mm.output_4e_code_complete_rs_igrf_database)

# Output: folder that receives every CSV written by this notebook, the file
# name of the group summary table, and one database file name per occupation group
output_folder: Path = Path(mm.path_pipeline_05_rank_n_occupations)
groups_ocp_file: Path = Path(mm.output_5a_code_groups_ocp_df)
rs_df_file_n_12: Path = Path(mm.output_5a_code_database_n_12)
rs_df_file_n_10: Path = Path(mm.output_5a_code_database_n_10)
rs_df_file_n_08: Path = Path(mm.output_5a_code_database_n_08)
rs_df_file_n_06: Path = Path(mm.output_5a_code_database_n_06)
rs_df_file_n_03: Path = Path(mm.output_5a_code_database_n_03)
rs_df_file_n_01: Path = Path(mm.output_5a_code_database_n_01)

# Output: one file name per occupation group holding a single row per station
# (its last listed occupation), meant for the Folium plot
folium_file_n_12: Path = Path(mm.output_5a_code_folium_file_n_12)
folium_file_n_10: Path = Path(mm.output_5a_code_folium_file_n_10)
folium_file_n_08: Path = Path(mm.output_5a_code_folium_file_n_08)
folium_file_n_06: Path = Path(mm.output_5a_code_folium_file_n_06)
folium_file_n_03: Path = Path(mm.output_5a_code_folium_file_n_03)
folium_file_n_01: Path = Path(mm.output_5a_code_folium_file_n_01)

# Global seaborn style applied to any figure drawn afterwards
sns.set_style("darkgrid")

In [3]:
# Load the complete repeat station + IGRF database into a dataframe
df = pd.read_csv(rs_igrf_folder / rs_igrf_file)
# Print column names, non-null counts and dtypes as a quick sanity check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082 entries, 0 to 1081
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Code                      1082 non-null   object 
 1   Lat_dd                    1082 non-null   float64
 2   Lon_dd                    1082 non-null   float64
 3   Alt_m                     1082 non-null   float64
 4   Time_dy                   1082 non-null   float64
 5   D_dd                      1082 non-null   float64
 6   IGRF_D_dd                 1082 non-null   float64
 7   I_dd                      1082 non-null   float64
 8   IGRF_I_dd                 1082 non-null   float64
 9   F_nT                      1082 non-null   float64
 10  IGRF_F_nT                 1082 non-null   float64
 11  H_nT                      1082 non-null   float64
 12  IGRF_H_nT                 1082 non-null   float64
 13  X_nT                      1082 non-null   float64
 14  IGRF_X_n

## Removal of contaminated stations according to field reports from 2017 to 2020

The following stations are going to be removed from further analysis due to being marked as magnetically polluted
- Jatai: GO_JAT
- Moraes de Almeida: PA_MAL
- Santana do Livramento: RS_LIV

In [4]:
# Remove station GO_JAT (Jatai), marked as magnetically polluted
df1 = df

# Index labels of every row that belongs to the station being removed
go_jat_index = df1.loc[df1["Code"] == "GO_JAT"].index

# Show which rows are about to be dropped
print(go_jat_index)

# Drop the rows and renumber from 0 in a single step. drop=True discards the
# old index instead of inserting it as an "index" column that would have to be
# deleted afterwards (and would raise if such a column already existed).
df_go_jat_del = df1.drop(index=go_jat_index).reset_index(drop=True)

# Summary of the remaining data
df_go_jat_del.info()

Int64Index([302, 303, 304, 305, 306, 307, 308, 309], dtype='int64')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Code                      1074 non-null   object 
 1   Lat_dd                    1074 non-null   float64
 2   Lon_dd                    1074 non-null   float64
 3   Alt_m                     1074 non-null   float64
 4   Time_dy                   1074 non-null   float64
 5   D_dd                      1074 non-null   float64
 6   IGRF_D_dd                 1074 non-null   float64
 7   I_dd                      1074 non-null   float64
 8   IGRF_I_dd                 1074 non-null   float64
 9   F_nT                      1074 non-null   float64
 10  IGRF_F_nT                 1074 non-null   float64
 11  H_nT                      1074 non-null   float64
 12  IGRF_H_nT                 1074 non-null   float64


In [5]:
# Remove station PA_MAL (Moraes de Almeida), marked as magnetically polluted
df2 = df_go_jat_del

# Index labels of every row that belongs to the station being removed
pa_mal_index = df2.loc[df2["Code"] == "PA_MAL"].index

# Show which rows are about to be dropped
print(pa_mal_index)

# Drop the rows and renumber from 0 in a single step. drop=True discards the
# old index instead of inserting it as an "index" column that would have to be
# deleted afterwards (and would raise if such a column already existed).
df_pa_mal_del = df2.drop(index=pa_mal_index).reset_index(drop=True)

# Summary of the remaining data
df_pa_mal_del.info()

Int64Index([601], dtype='int64')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1073 entries, 0 to 1072
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Code                      1073 non-null   object 
 1   Lat_dd                    1073 non-null   float64
 2   Lon_dd                    1073 non-null   float64
 3   Alt_m                     1073 non-null   float64
 4   Time_dy                   1073 non-null   float64
 5   D_dd                      1073 non-null   float64
 6   IGRF_D_dd                 1073 non-null   float64
 7   I_dd                      1073 non-null   float64
 8   IGRF_I_dd                 1073 non-null   float64
 9   F_nT                      1073 non-null   float64
 10  IGRF_F_nT                 1073 non-null   float64
 11  H_nT                      1073 non-null   float64
 12  IGRF_H_nT                 1073 non-null   float64
 13  X_nT                      1073

In [6]:
# Remove station RS_LIV (Santana do Livramento), marked as magnetically polluted
df3 = df_pa_mal_del

# Index labels of every row that belongs to the station being removed
rs_liv_index = df3.loc[df3["Code"] == "RS_LIV"].index

# Show which rows are about to be dropped
print(rs_liv_index)

# Drop the rows and renumber from 0 in a single step. drop=True discards the
# old index instead of inserting it as an "index" column that would have to be
# deleted afterwards (and would raise if such a column already existed).
df_rs_liv_del = df3.drop(index=rs_liv_index).reset_index(drop=True)

# Summary of the remaining data
df_rs_liv_del.info()

Int64Index([861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873,
            874, 875],
           dtype='int64')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058 entries, 0 to 1057
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Code                      1058 non-null   object 
 1   Lat_dd                    1058 non-null   float64
 2   Lon_dd                    1058 non-null   float64
 3   Alt_m                     1058 non-null   float64
 4   Time_dy                   1058 non-null   float64
 5   D_dd                      1058 non-null   float64
 6   IGRF_D_dd                 1058 non-null   float64
 7   I_dd                      1058 non-null   float64
 8   IGRF_I_dd                 1058 non-null   float64
 9   F_nT                      1058 non-null   float64
 10  IGRF_F_nT                 1058 non-null   float64
 11  H_nT                      1058 non-null   float

In [7]:
# Working name for the database without the three removed stations.
# NOTE: this binds a second name to the same object; no copy is made.
aux_df = df_rs_liv_del

## Separate the repeat stations that have n or more occupations into groups to create their geodataframe to save them in files for later use

### Group 01: 12 or more occupations

In [8]:
# Group 01: rows of stations whose N_occupations is at least n12
n12 = 12
rs_ocp_high_n_12 = aux_df.loc[aux_df["N_occupations"].ge(n12)]

# Write the group database to the output folder
rs_ocp_high_n_12.to_csv(
    output_folder / rs_df_file_n_12,
    index=False,
    float_format="%.3f",
)

### Group 02: between 10 and 11 occupations

In [9]:
# Group 02: rows of stations whose N_occupations lies in [n10, n11]
# (Series.between includes both bounds by default)
n10 = 10
n11 = 11
rs_ocp_high_n_10 = aux_df.loc[aux_df["N_occupations"].between(left=n10, right=n11)]

# Write the group database to the output folder
rs_ocp_high_n_10.to_csv(
    output_folder / rs_df_file_n_10,
    index=False,
    float_format="%.3f",
)

### Group 03: between 8 and 9 occupations

In [10]:
# Group 03: rows of stations whose N_occupations lies in [n8, n9]
# (Series.between includes both bounds by default)
n8 = 8
n9 = 9
rs_ocp_high_n_08 = aux_df.loc[aux_df["N_occupations"].between(left=n8, right=n9)]

# Write the group database to the output folder
rs_ocp_high_n_08.to_csv(
    output_folder / rs_df_file_n_08,
    index=False,
    float_format="%.3f",
)

### Group 04: between 6 and 7 occupations

In [11]:
# Group 04: rows of stations whose N_occupations lies in [n6, n7]
# (Series.between includes both bounds by default)
n6 = 6
n7 = 7
rs_ocp_high_n_06 = aux_df.loc[aux_df["N_occupations"].between(left=n6, right=n7)]

# Write the group database to the output folder
rs_ocp_high_n_06.to_csv(
    output_folder / rs_df_file_n_06,
    index=False,
    float_format="%.3f",
)

### Group 05: between 3 and 5 occupations

In [12]:
# Rows of stations whose N_occupations lies in [n3, n5]
# (Series.between includes both bounds by default)
n3 = 3
n5 = 5
rs_ocp_high_n_03 = aux_df.loc[aux_df["N_occupations"].between(left=n3, right=n5)]

# Write the group database to the output folder
rs_ocp_high_n_03.to_csv(
    output_folder / rs_df_file_n_03,
    index=False,
    float_format="%.3f",
)

### Group 06: between 1 and 2 occupations

In [13]:
# Group 06: rows of stations whose N_occupations lies in [n1, n2]
# (Series.between includes both bounds by default)
n1 = 1
n2 = 2
rs_ocp_high_n_01 = aux_df.loc[aux_df["N_occupations"].between(left=n1, right=n2)]

# Write the group database to the output folder
rs_ocp_high_n_01.to_csv(
    output_folder / rs_df_file_n_01,
    index=False,
    float_format="%.3f",
)

## Dataframe with number of stations per group and only the last occupation present (to use with folium)

In [14]:
def _save_last_occupation(group_df, folium_file):
    """Keep one row per station code, save it for Folium and return it.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Occupation rows of one group; must contain a "Code" column.
    folium_file : pathlib.Path
        File name, relative to ``output_folder``, of the CSV to write.

    Returns
    -------
    pandas.DataFrame
        ``group_df`` reduced to the last listed row of each "Code".
    """
    # keep="last" keeps the last row *as listed* for each code.
    # NOTE(review): this is the latest occupation only if the rows of each
    # station are ordered chronologically in the database - confirm upstream.
    last_ocp_df = group_df.drop_duplicates(subset="Code", keep="last")
    last_ocp_df.to_csv(output_folder / folium_file, index=False, float_format="%.3f")
    return last_ocp_df


# For each group: save the one-row-per-station file and print the station count
df_aux12 = _save_last_occupation(rs_ocp_high_n_12, folium_file_n_12)
n_12 = len(df_aux12)
print(n_12)

df_aux10 = _save_last_occupation(rs_ocp_high_n_10, folium_file_n_10)
n_10 = len(df_aux10)
print(n_10)

df_aux08 = _save_last_occupation(rs_ocp_high_n_08, folium_file_n_08)
n_08 = len(df_aux08)
print(n_08)

df_aux06 = _save_last_occupation(rs_ocp_high_n_06, folium_file_n_06)
n_06 = len(df_aux06)
print(n_06)

df_aux03 = _save_last_occupation(rs_ocp_high_n_03, folium_file_n_03)
n_03 = len(df_aux03)
print(n_03)

df_aux01 = _save_last_occupation(rs_ocp_high_n_01, folium_file_n_01)
n_01 = len(df_aux01)
print(n_01)


23
13
20
18
49
92


In [15]:
# Summary table: one row per occupation group with its station count
data = {
    "Group": ["01", "02", "03", "04", "05", "06"],
    "Number of occupations": [
        "12 or more",
        "10 to 11",
        "8 to 9",
        "6 to 7",
        "3 to 5",
        "1 to 2",
    ],
    "Number of stations": [n_12, n_10, n_08, n_06, n_03, n_01],
}
group_ocp_df = pd.DataFrame(data)

# Save the summary table to the output folder
group_ocp_df.to_csv(
    output_folder / groups_ocp_file,
    index=False,
    float_format="%.3f",
)