In [1]:
# Mandatory operation for importing custom libraries
from pathlib import Path
import sys

# Make the parent of the current working directory importable so that
# packages living there (e.g. `file_manager`) can be imported below.
cwd = Path.cwd()
base = cwd.parent
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if str(base) not in sys.path:
    sys.path.append(str(base))

In [2]:
# Testing FileManager
from file_manager.file_manager import FileManager

# Create the FileManager used by the rest of the notebook and print its
# `data_folder` attribute.
file_manager = FileManager()
data_folder = file_manager.data_folder
print(data_folder)

c:\Users\User\Desktop\Hi!ckathon\hickathon5\data


In [3]:
import pandas as pd
import warnings

# Let pandas render up to 500 rows and 500 columns before truncating.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)
# Silence every Python warning from here on.
warnings.filterwarnings("ignore")

# Load data

In [4]:
# Load the training set through the FileManager instance created earlier.
train_data = file_manager.load_training_data()

In [5]:
# Print the first row to get an overview of the columns and of sample values.
print(train_data.iloc[0])

piezo_station_department_code                                                          01
piezo_station_update_date                                   Sun Jul 14 13:00:02 CEST 2024
piezo_station_investigation_depth                                                    20.0
piezo_station_department_name                                                         Ain
piezo_station_commune_code_insee                                                    01073
piezo_station_pe_label                  PIEZOMETRE - MARAIS DE LAVOURS (CEYZERIEU - BR...
piezo_station_bdlisa_codes                                                    ['712AH37']
piezo_station_altitude                                                              232.0
piezo_station_bss_code                                                   07004X0046/D6-20
piezo_station_commune_name                                                      Ceyzérieu
piezo_station_longitude                                                          5.748241
piezo_stat

# Checking if dates are coherent

## Formatting relevant columns into DateTime type

Columns: `piezo_measurement_date`, `meteo_date`, `hydro_observation_date_elab`

In [6]:
# Look at one `meteo_id` value of the first row and at its Python type.
first_meteo_id = train_data.iloc[0]['meteo_id']
print(first_meteo_id)
print(type(first_meteo_id))

1034004
<class 'numpy.int64'>


In [7]:
# Show the value and type of the first measurement date before conversion.
raw_piezo_date = train_data.iloc[0]['piezo_measurement_date']
print(raw_piezo_date)
print(type(raw_piezo_date))

# Parse the whole column into pandas datetimes.
train_data['piezo_measurement_date'] = pd.to_datetime(train_data['piezo_measurement_date'])
# Series.info() prints its summary itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
train_data['piezo_measurement_date'].info()

# Show the same first value after conversion, plus its date-only part.
instance_piezo = train_data['piezo_measurement_date'].iloc[0]
print(instance_piezo)
print(type(instance_piezo))
print(instance_piezo.date())

2020-01-01
<class 'str'>
<class 'pandas.core.series.Series'>
Index: 2830316 entries, 0 to 3294084
Series name: piezo_measurement_date
Non-Null Count    Dtype         
--------------    -----         
2830316 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 43.2 MB
None
2020-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2020-01-01


In [8]:
# Same inspection for the last row of the frame.
piezo_dates = train_data['piezo_measurement_date']
instance_piezo_tail = piezo_dates.iloc[-1]
print(instance_piezo_tail)
print(type(instance_piezo_tail))
print(instance_piezo_tail.date())

2023-05-31 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-05-31


> The years span from 2020 to 2023.

In [9]:
# Value and type of the first `meteo_date` before conversion.
raw_meteo_date = train_data.iloc[0]['meteo_date']
print(raw_meteo_date)
print(type(raw_meteo_date))

# Parse the column into pandas datetimes, then show the first value again.
train_data['meteo_date'] = pd.to_datetime(train_data['meteo_date'])
instance_meteo = train_data['meteo_date'].iloc[0]
print(instance_meteo)
print(type(instance_meteo))

2020-01-01
<class 'str'>
2020-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [10]:
# Value and type of the first `hydro_observation_date_elab` before conversion.
raw_hydro_date = train_data.iloc[0]['hydro_observation_date_elab']
print(raw_hydro_date)
print(type(raw_hydro_date))

# Parse the column into pandas datetimes, then show the first value again.
train_data['hydro_observation_date_elab'] = pd.to_datetime(train_data['hydro_observation_date_elab'])
instance_hydro = train_data['hydro_observation_date_elab'].iloc[0]
print(instance_hydro)
print(type(instance_hydro))

2020-01-01
<class 'str'>
2020-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [11]:
# Compare the calendar days of the first piezo and meteo timestamps
# (`.date()` drops the time of day).
same_day = instance_piezo.date() == instance_meteo.date()
print(same_day)

True


## Actual checking

In [12]:
def get_unique_dates(datetimes: list[pd.Timestamp]) -> list:
    """Return the distinct calendar dates found in ``datetimes``.

    Only the date part of each timestamp is considered (``.date()``), so two
    timestamps on the same day at different times count as one date. Dates
    are returned in order of first appearance; a result with more than one
    element therefore means the timestamps disagree on the day.

    Args:
        datetimes: Timestamps to compare. May be empty.

    Returns:
        The distinct ``.date()`` values, or an empty list when ``datetimes``
        is empty.
    """
    unique_dates = []
    for timestamp in datetimes:
        day = timestamp.date()
        # Check against every date collected so far, not only the first one,
        # so that a repeated non-first date is reported a single time.
        if day not in unique_dates:
            unique_dates.append(day)
    return unique_dates

In [13]:
# Iterating over every row of the frame in Python is very slow, so first find
# the rows whose three dates disagree with vectorised comparisons and only
# loop over those to print the report. The three columns were converted to
# datetimes in the cells above, which the `.dt` accessor requires.
date_columns = ['piezo_measurement_date', 'meteo_date', 'hydro_observation_date_elab']
# normalize() resets the time of day to midnight so only calendar days are compared.
piezo_day = train_data['piezo_measurement_date'].dt.normalize()
meteo_day = train_data['meteo_date'].dt.normalize()
hydro_day = train_data['hydro_observation_date_elab'].dt.normalize()
has_mismatch = (meteo_day != piezo_day) | (hydro_day != piezo_day)

for index, row in train_data.loc[has_mismatch, date_columns].iterrows():
    unique_dates = get_unique_dates([row[column] for column in date_columns])
    if len(unique_dates) > 1:
        print(f"Multiple dates at row {index}: {unique_dates}")
print("Done")

Done


## > No incoherent dates in train set

# Splitting data

In [22]:
import os

# Splitting by year, starting with Winter 2020 (01-01-2020) until Spring 2023 (31-05-2023)
def split(
        df: pd.DataFrame,
        start_date: str,
        end_date: str,
        output_file_name: str
) -> None:
    """Save the rows of ``df`` measured between two dates (both inclusive).

    Rows are selected on ``piezo_measurement_date`` and handed to
    ``file_manager.save_dataframe`` together with ``output_file_name``.
    Nothing is saved when ``output_file_name`` already exists inside
    ``file_manager.data_folder``; a message is printed instead.

    Args:
        df: Frame containing a ``piezo_measurement_date`` column. It is not
            modified by this function.
        start_date: First date to keep, e.g. ``'2019-12-01'``.
        end_date: Last date to keep, e.g. ``'2020-02-29'``.
        output_file_name: File name passed to ``save_dataframe``.
    """
    # Check first: filtering a large frame is wasted work if nothing is saved.
    if os.path.exists(file_manager.data_folder / output_file_name):
        print("File already exists.")
        return

    # Convert into a separate Series instead of assigning back into `df`, so
    # the caller's frame is left untouched.
    measurement_dates = pd.to_datetime(df['piezo_measurement_date'])
    in_range = (measurement_dates >= start_date) & (measurement_dates <= end_date)
    # The exported rows still carry the converted datetime column.
    split_df = df.loc[in_range].assign(
        piezo_measurement_date=measurement_dates.loc[in_range]
    )
    file_manager.save_dataframe(split_df, output_file_name)

In [15]:
# Export Winter 2020: 1 December 2019 through 29 February 2020.
output_file_name = 'X_train_winter_2020.csv'
start_date, end_date = '2019-12-01', '2020-02-29'

split(
    df=train_data,
    start_date=start_date,
    end_date=end_date,
    output_file_name=output_file_name,
)

In [20]:
# Export 2020: 1 December 2019 through 30 November 2020.
output_file_name = 'X_train_2020.csv'
start_date, end_date = '2019-12-01', '2020-11-30'

split(
    df=train_data,
    start_date=start_date,
    end_date=end_date,
    output_file_name=output_file_name,
)

In [21]:
# Export 2021: 1 December 2020 through 30 November 2021.
output_file_name = 'X_train_2021.csv'
start_date, end_date = '2020-12-01', '2021-11-30'

split(
    df=train_data,
    start_date=start_date,
    end_date=end_date,
    output_file_name=output_file_name,
)

In [23]:
# Export 2022: 1 December 2021 through 30 November 2022.
output_file_name = 'X_train_2022.csv'
start_date, end_date = '2021-12-01', '2022-11-30'

split(
    df=train_data,
    start_date=start_date,
    end_date=end_date,
    output_file_name=output_file_name,
)

In [24]:
# Export 2023: 1 December 2022 through 30 November 2023.
output_file_name = 'X_train_2023.csv'
start_date, end_date = '2022-12-01', '2023-11-30'

split(
    df=train_data,
    start_date=start_date,
    end_date=end_date,
    output_file_name=output_file_name,
)

# Analysis of the test set

In [16]:
# Load the test set through the same FileManager instance.
test_data = file_manager.load_test_data()

# Display the first row (last expression of the cell).
test_data.iloc[0]

piezo_station_department_code                                                          01
piezo_station_update_date                                   Sun Jul 14 13:00:02 CEST 2024
piezo_station_investigation_depth                                                    20.0
piezo_station_department_name                                                         Ain
piezo_station_commune_code_insee                                                    01073
piezo_station_pe_label                  PIEZOMETRE - MARAIS DE LAVOURS (CEYZERIEU - BR...
piezo_station_bdlisa_codes                                                    ['712AH37']
piezo_station_altitude                                                              232.0
piezo_station_bss_code                                                   07004X0046/D6-20
piezo_station_commune_name                                                      Ceyzérieu
piezo_station_longitude                                                          5.748241
piezo_stat

In [17]:
# Display the last row of the test set.
test_data.iloc[-1]

piezo_station_department_code                                                          95
piezo_station_update_date                                    Tue Jan 28 19:01:33 CET 2020
piezo_station_investigation_depth                                                   104.0
piezo_station_department_name                                                  Val-d'Oise
piezo_station_commune_code_insee                                                    95509
piezo_station_pe_label                  Piézomètre de Puiseux-en-France (Puiseux-en-Fr...
piezo_station_bdlisa_codes                                         ['113AQ27', '113AV01']
piezo_station_altitude                                                              131.0
piezo_station_bss_code                                                       01534X0049/F
piezo_station_commune_name                                              Puiseux-en-France
piezo_station_longitude                                                          2.488424
piezo_stat

In [18]:
def data_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Parse the three date columns of ``df`` into pandas datetimes.

    The columns ``piezo_measurement_date``, ``meteo_date`` and
    ``hydro_observation_date_elab`` are overwritten in place with the result
    of ``pd.to_datetime``; every other column is left as is.

    Args:
        df: Frame containing the three date columns.

    Returns:
        The same ``df`` object that was passed in, after conversion.
    """
    date_columns = (
        'piezo_measurement_date',
        'meteo_date',
        'hydro_observation_date_elab',
    )
    for column in date_columns:
        parsed = pd.to_datetime(df[column])
        df[column] = parsed
    return df

In [19]:
# Convert the three date columns first; the `.dt` accessor below needs datetimes.
test_data = data_to_datetime(test_data)

# Iterating over every row of the frame in Python is very slow, so first find
# the rows whose three dates disagree with vectorised comparisons and only
# loop over those to print the report.
date_columns = ['piezo_measurement_date', 'meteo_date', 'hydro_observation_date_elab']
# normalize() resets the time of day to midnight so only calendar days are compared.
piezo_day = test_data['piezo_measurement_date'].dt.normalize()
meteo_day = test_data['meteo_date'].dt.normalize()
hydro_day = test_data['hydro_observation_date_elab'].dt.normalize()
has_mismatch = (meteo_day != piezo_day) | (hydro_day != piezo_day)

for index, row in test_data.loc[has_mismatch, date_columns].iterrows():
    unique_dates = get_unique_dates([row[column] for column in date_columns])
    if len(unique_dates) > 1:
        print(f"Multiple dates at row {index}: {unique_dates}")
print("Done")

Done


## > No incoherent dates in test set