In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
import joblib

In [19]:
df_Kn_X = pd.read_csv('../raw_data/Kn_data_preproc.csv', index_col = 0)
df_Dp_X = pd.read_csv('../raw_data/Dp_data_preproc.csv', index_col = 0)
df_Oo_X = pd.read_csv('../raw_data/Oo_data_preproc.csv', index_col = 0)

In [20]:
df_Kn_y = pd.read_csv('../raw_data/Kn_data_y.csv', index_col = 'Timestamp')
df_Dp_y = pd.read_csv('../raw_data/Dp_data_y.csv', index_col = 'Timestamp')
df_Oo_y = pd.read_csv('../raw_data/Oo_data_y.csv', index_col = 'Timestamp')

In [21]:
X_train_Kn , X_test_Kn, y_train_Kn, y_test_Kn = train_test_split(df_Kn_X, df_Kn_y, test_size = 0.3)
X_train_Dp , X_test_Dp, y_train_Dp, y_test_Dp = train_test_split(df_Dp_X, df_Dp_y, test_size = 0.3)
X_train_Oo , X_test_Oo, y_train_Oo, y_test_Oo = train_test_split(df_Oo_X, df_Oo_y, test_size = 0.3)

# Knn

In [22]:
#Kn
Kn_knn = KNeighborsRegressor()

grid = {'n_neighbors': [5, 10, 15, 20, 25, 50],
        'p':[1,2], 
        'weights':['uniform', 'distance']}

Kn_search_knn = GridSearchCV(Kn_knn, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

KNN_fitted_Kn = Kn_search_knn.fit(X_train_Kn, y_train_Kn)
Kn_knn_score = Kn_search_knn.best_score_
print('Best score is:', Kn_knn_score)
print('Best parameters are:', Kn_search_knn.best_params_)
print('Best estimator is:', Kn_search_knn.best_estimator_)

Best score is: 0.7844409269656125
Best parameters are: {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
Best estimator is: KNeighborsRegressor(n_neighbors=25, p=1, weights='distance')


In [23]:
#Dp
Dp_knn = KNeighborsRegressor()

grid = {'n_neighbors': [5, 10, 15, 20, 25, 50],
        'p':[1,2], 
        'weights':['uniform', 'distance']}

Dp_search_knn = GridSearchCV(Dp_knn, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

KNN_fitted_Dp = Dp_search_knn.fit(X_train_Dp, y_train_Dp)
Dp_knn_score = Dp_search_knn.best_score_
print('Best score is:', Dp_knn_score)
print('Best parameters are:', Dp_search_knn.best_params_)
print('Best estimator is:', Dp_search_knn.best_estimator_)

Best score is: 0.8267984942265603
Best parameters are: {'n_neighbors': 25, 'p': 1, 'weights': 'uniform'}
Best estimator is: KNeighborsRegressor(n_neighbors=25, p=1)


In [24]:
#Oo
Oo_knn = KNeighborsRegressor()

grid = {'n_neighbors': [5, 10, 15, 20, 25, 50],
        'p':[1,2], 
        'weights':['uniform', 'distance']}

Oo_search_knn = GridSearchCV(Oo_knn, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

KNN_fitted_Oo = Oo_search_knn.fit(X_train_Oo, y_train_Oo)
Oo_knn_score = Oo_search_knn.best_score_
print('Best score is:', Oo_knn_score)
print('Best parameters are:', Oo_search_knn.best_params_)
print('Best estimator is:', Oo_search_knn.best_estimator_)

Best score is: 0.799812082580501
Best parameters are: {'n_neighbors': 50, 'p': 1, 'weights': 'uniform'}
Best estimator is: KNeighborsRegressor(n_neighbors=50, p=1)


# XGBoost

In [25]:
#Kn
Kn_xgb = XGBRegressor()

grid = {'max_depth': [5, 10, 15],
        'n_estimators':[15, 20, 25, 50], 
        'learning_rate':[0.01, 0.1]}

Kn_search_xgb = GridSearchCV(Kn_xgb, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

XGB_fitted_Kn = Kn_search_xgb.fit(X_train_Kn, y_train_Kn)
XGB_fitted_Kn.score(X_test_Kn,y_test_Kn)

0.8036032778560642

In [26]:
#Dp
Dp_xgb = XGBRegressor()

grid = {'max_depth': [5, 10, 15],
        'n_estimators':[15, 20, 25, 50], 
        'learning_rate':[0.01, 0.1]}

Dp_search_xgb = GridSearchCV(Dp_xgb, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

XGB_fitted_Dp = Dp_search_xgb.fit(X_train_Dp, y_train_Dp)
XGB_fitted_Dp.score(X_test_Dp,y_test_Dp)

0.8213288917839418

In [27]:
#Oo
Oo_xgb = XGBRegressor()

grid = {'max_depth': [5, 10, 15],
        'n_estimators':[15, 20, 25, 50], 
        'learning_rate':[0.01, 0.1]}

Oo_search_xgb = GridSearchCV(Oo_xgb, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

XGB_fitted_Oo = Oo_search_xgb.fit(X_train_Oo, y_train_Oo)
XGB_fitted_Oo.score(X_test_Oo,y_test_Oo)

0.7996433252591678

# Desicion Tree

In [30]:
#Kn
Kn_tree = DecisionTreeRegressor()

cv_results = cross_validate(Kn_tree, X_train_Kn, y_train_Kn, scoring="r2", cv=5)
Kn_tree_score = cv_results['test_score'].mean()
Kn_tree.fit(X_train_Kn, y_train_Kn)
Kn_tree_score

0.5850844437885514

In [29]:
#Dp
Dp_tree = DecisionTreeRegressor()

cv_results = cross_validate(Dp_tree, X_train_Dp, y_train_Dp, scoring="r2", cv=5)
Dp_tree_score = cv_results['test_score'].mean()
Dp_tree.fit(X_train_Dp, y_train_Dp)
Dp_tree_score

0.6560779814207084

In [31]:
#Oo
Oo_tree = DecisionTreeRegressor()

cv_results = cross_validate(Oo_tree, X_train_Oo, y_train_Oo, scoring="r2", cv=5)
Oo_tree_score = cv_results['test_score'].mean()
Oo_tree.fit(X_train_Oo, y_train_Oo)
Oo_tree_score

0.5979052051002653

# Stacking

In [32]:
#Kn
Kn_ensemble = VotingRegressor(estimators=[('knn', Kn_search_knn), ('xgb', Kn_search_xgb), ('dt', Kn_tree)], weights=[.2,.6,.2])
Kn_ensemble_stacked = Kn_ensemble.fit(X_train_Kn, y_train_Kn)
Kn_ensemble_score = Kn_ensemble.score(X_test_Kn, y_test_Kn)
print(Kn_ensemble_score)

#Dp
Dp_ensemble = VotingRegressor(estimators=[('knn', Dp_search_knn), ('xgb', Dp_search_xgb), ('dt', Dp_tree)], weights=[.2,.6,.2])
Dp_ensemble_stacked = Dp_ensemble.fit(X_train_Dp, y_train_Dp)
Dp_ensemble_score = Dp_ensemble.score(X_test_Dp, y_test_Dp)
print(Dp_ensemble_score)

#Oo
Oo_ensemble = VotingRegressor(estimators=[('knn', Oo_search_knn), ('xgb', Oo_search_xgb), ('dt', Oo_tree)], weights=[.2,.6,.2])
Oo_ensemble_stacked = Oo_ensemble.fit(X_train_Oo, y_train_Oo)
Oo_ensemble_score = Oo_ensemble.score(X_test_Oo, y_test_Oo)
print(Oo_ensemble_score)

  y = column_or_1d(y, warn=True)


0.7994661756013022


  y = column_or_1d(y, warn=True)


0.8142846118160748


  y = column_or_1d(y, warn=True)


0.7960718970017368


In [33]:
joblib.dump(Kn_ensemble, '../raw_data/Kn_model.joblib')
joblib.dump(Dp_ensemble, '../raw_data/Dp_model.joblib')
joblib.dump(Oo_ensemble, '../raw_data/Oo_model.joblib')

['../raw_data/Oo_model.joblib']