In [1]:
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
# The experimental flag must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler,FunctionTransformer

# `utils` lives one directory up; extend sys.path only for this import.
sys.path.append('..')
from utils import trans_func, cos_list,sin_list
sys.path.pop()

'..'

In [2]:
# Load the three raw per-site CSV files, each indexed by its 'Timestamp' column.
df_Kn, df_Dp, df_Oo = (
    pd.read_csv(csv_path, index_col='Timestamp')
    for csv_path in (
        '../raw_data/Kn_data.csv',
        '../raw_data/Dp_data.csv',
        '../raw_data/Oo_data.csv',
    )
)

In [3]:
# Discard every row that has a missing value in any column, for each site.
# The names are rebound on purpose: later cells read df_Kn / df_Dp / df_Oo.
df_Kn, df_Dp, df_Oo = (
    site_frame.dropna() for site_frame in (df_Kn, df_Dp, df_Oo)
)

In [11]:
# Building blocks of the column transformer.
# The three FunctionTransformers wrap helpers imported from `utils`
# (presumably a tide transform and sine/cosine encodings of the wind
# direction -- confirm against utils).
cycle_tr_cos = FunctionTransformer(cos_list)
cycle_tr_sin = FunctionTransformer(sin_list)
custom_tr = FunctionTransformer(trans_func)
num_transformer = RobustScaler()

# Output column order follows the order of the entries below:
# scaled wave_period and wind_speed, then tide, then the two wind_direction
# encodings. `remainder` is left at its default (not 'passthrough').
preprocessor_tr = make_column_transformer(
    (num_transformer, ['wave_period', 'wind_speed']),
    (custom_tr, ['tide']),
    (cycle_tr_sin, ['wind_direction']),
    (cycle_tr_cos, ['wind_direction']),
)

# Column-wise preprocessing followed by iterative imputation.
pipe_preproc = Pipeline(
    steps=[
        ('preproc', preprocessor_tr),
        ('imputer', IterativeImputer()),
    ]
)
pipe_preproc

In [12]:
# Fit one independent preprocessing pipeline per site.
#
# `Pipeline.fit` returns the pipeline itself, so calling `pipe_preproc.fit`
# three times would bind all three names to the *same* object, fitted only on
# the last dataset (Oo). `clone` builds a fresh, unfitted copy with identical
# parameters, so each site gets its own fitted scaler/imputer state.
# `pipe_preproc` is left untouched as the unfitted template.
df_Kn_model = clone(pipe_preproc).fit(df_Kn)
df_Dp_model = clone(pipe_preproc).fit(df_Dp)
df_Oo_model = clone(pipe_preproc).fit(df_Oo)

In [13]:
# Serialise each site's pipeline object to its own pickle file.
# NOTE(review): the pipelines wrap functions imported from `utils`, so that
# module presumably has to be importable wherever these pickles are loaded --
# confirm with the consumer.
for site_pipeline, pickle_path in (
    (df_Kn_model, "../raw_data/pipeline_Kn.pkl"),
    (df_Dp_model, "../raw_data/pipeline_Dp.pkl"),
    (df_Oo_model, "../raw_data/pipeline_Oo.pkl"),
):
    with open(pickle_path, "wb") as file:
        pickle.dump(site_pipeline, file)

In [15]:
# Column labels for the transformed feature matrix, in output order.
cols = [
    'wave_period',
    'wind_speed',
    'tide',
    'wind_direction_sin',
    'wind_direction_cos',
]

# Transform each site's frame with its pipeline and write the result to CSV.
# NOTE(review): the DataFrame is built from the transform output with only
# `columns` supplied, so the Timestamp index of the input frame is presumably
# not carried into these files -- confirm this is what downstream expects.
for site_model, site_frame, csv_path in (
    (df_Kn_model, df_Kn, '../raw_data/Kn_data_preproc.csv'),
    (df_Dp_model, df_Dp, '../raw_data/Dp_data_preproc.csv'),
    (df_Oo_model, df_Oo, '../raw_data/Oo_data_preproc.csv'),
):
    site_features = pd.DataFrame(site_model.transform(site_frame), columns=cols)
    site_features.to_csv(csv_path)

In [17]:
# Write the 'wave_height' column of each site to its own CSV file.
for site_frame, target_path in (
    (df_Kn, '../raw_data/Kn_data_y.csv'),
    (df_Dp, '../raw_data/Dp_data_y.csv'),
    (df_Oo, '../raw_data/Oo_data_y.csv'),
):
    site_frame['wave_height'].to_csv(target_path)