# Dataset Import

In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import warnings

In [3]:
fr_train = pd.read_csv("fr_train_outlier.csv", index_col=0)
fr_train

Unnamed: 0,ID,DAY_ID,FR_CONSUMPTION,FR_DE_EXCHANGE,FR_NET_EXPORT,FR_NET_IMPORT,FR_GAS,FR_COAL,FR_HYDRO,FR_NUCLEAR,FR_SOLAR,FR_WINDPOW,FR_RESIDUAL_LOAD,FR_RAIN,FR_WIND,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET
1,1179,1,1.222131,-0.331356,0.778627,-0.778627,1.991028,-0.786509,0.709616,1.381575,0.485975,-0.172140,1.214288,-0.497520,-1.465608,0.231602,1.480313,0.931562,0.822047,-0.063369
2,1327,2,-0.667390,1.102015,0.256736,-0.256736,0.458302,-0.766904,-0.930172,-0.379230,1.032412,-0.844350,-0.540642,-0.372156,-0.926064,0.641235,1.802550,1.140920,0.900434,2.575976
3,2016,3,-0.834564,1.051716,-0.612133,0.612133,0.069297,-0.718729,-0.383690,-1.579208,2.986527,-0.718643,-0.856321,-1.118297,0.488650,-0.951057,0.440121,-0.064550,-0.032756,0.068905
4,2047,5,-0.470371,-0.144615,-1.811403,1.811403,0.528273,-0.766063,-0.398178,-1.866010,3.425813,-0.640389,-0.552878,-0.790071,0.021868,1.459745,-0.117977,0.550433,0.781870,1.031308
5,1995,7,-0.625625,-0.002239,-0.745182,0.745182,0.727314,-0.778036,-0.739291,-1.934168,2.276123,-0.079343,-0.806379,-0.663419,1.367421,0.954384,-0.379980,0.518459,-0.034642,-0.118915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,1120,1205,-0.100235,-0.381612,0.106216,-0.106216,0.137448,0.578370,-0.191697,0.132869,0.127692,-0.389945,-0.019808,-0.435577,-0.667096,-0.481947,0.047390,0.562084,-0.954402,-0.028575
845,1721,1207,0.516789,-0.651648,-0.820640,0.820640,0.295393,-0.783923,-0.138441,-0.523101,1.539418,0.839589,0.236243,1.112782,-0.018442,-0.960526,0.099209,1.750872,0.646905,-0.729755
846,2039,1208,-0.709011,-0.427976,-1.678101,1.678101,0.179728,-0.758579,-0.380974,-1.903612,1.061313,-0.468104,-0.666261,2.661142,0.630211,-1.439105,0.291714,0.568479,0.395742,0.136028
848,1987,1212,-0.520506,0.270515,-0.789824,0.789824,0.625656,-0.776785,-0.906285,-1.434474,1.589641,-0.266687,-0.589767,-0.237434,1.123953,-0.308232,0.017778,0.072168,-0.160792,-0.425474


In [5]:
fr_train['CLEAN_ENERGY'] = fr_train['FR_HYDRO'] + fr_train['FR_NUCLEAR'] + fr_train['FR_WINDPOW'] + fr_train['FR_SOLAR']
fr_train['FOSSIL_ENERGY'] = fr_train['FR_GAS'] + fr_train['FR_COAL']

In [6]:
fr_train['WEEKDAY'] = fr_train['DAY_ID'] % 7 + 1 

In [7]:
fr_X = fr_train.drop(['TARGET', 'FR_NET_EXPORT', 'ID', 'DAY_ID'], axis=1)
fr_y = fr_train['TARGET']
fr_X.columns

Index(['FR_CONSUMPTION', 'FR_DE_EXCHANGE', 'FR_NET_IMPORT', 'FR_GAS',
       'FR_COAL', 'FR_HYDRO', 'FR_NUCLEAR', 'FR_SOLAR', 'FR_WINDPOW',
       'FR_RESIDUAL_LOAD', 'FR_RAIN', 'FR_WIND', 'FR_TEMP', 'GAS_RET',
       'COAL_RET', 'CARBON_RET', 'CLEAN_ENERGY', 'FOSSIL_ENERGY', 'WEEKDAY'],
      dtype='object')

In [8]:
from sklearn.model_selection import train_test_split

fr_X_train, fr_X_valid, fr_y_train, fr_y_valid = train_test_split(fr_X, fr_y, test_size=0.2, random_state=42)

print("train dataset:", fr_X_train.shape, fr_y_train.shape)
print("validation dataset:", fr_X_valid.shape, fr_y_valid.shape)

train dataset: (564, 19) (564,)
validation dataset: (141, 19) (141,)


# Tuning

In [9]:
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import spearmanr

knn_model = KNeighborsRegressor(n_neighbors=5)  
print(knn_model)

knn_model.fit(fr_X_train, fr_y_train)

fr_y_pred_knn = knn_model.predict(fr_X_valid)

KNeighborsRegressor()


In [10]:
from sklearn.model_selection import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer

# scorer
def spearman_corr(y_true, y_pred):
    return spearmanr(y_true, y_pred).correlation
spearman_scorer = make_scorer(spearman_corr)

param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # k
    'weights': ['uniform', 'distance'],  
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # search algorithm
    'leaf_size': [10, 20, 30, 40, 50],  # leaf size of a ball tree
    'p': [1, 2],  # (1 Manhattan distance 2 Euclidean distance)
    'metric': ['minkowski', 'euclidean', 'manhattan']  # distance measure
}

# GridSearch
crf = GridSearchCV(knn_model, param_grid=param_grid, cv=5, scoring=spearman_scorer, n_jobs=-1)
crf.fit(fr_X_train, fr_y_train)

print("Best Parameters:", crf.best_params_)
print("Best Score:", crf.best_score_)


Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best Score: 0.17870535476989163


In [11]:
best_model = crf.best_estimator_
validation_score = spearman_corr(fr_y_valid, best_model.predict(fr_X_valid))

print("Validation Score with Best Parameters:", validation_score)

Validation Score with Best Parameters: 0.1533273256206468
