# Dataset Import

In [17]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import warnings

In [18]:
# Load de_train_DAY.csv; the first CSV column is used as the row index.
de_train = pd.read_csv(filepath_or_buffer="de_train_DAY.csv", index_col=0)
# Bare last expression so the notebook renders the frame.
de_train

Unnamed: 0,ID,DAY_ID,DE_CONSUMPTION,DE_FR_EXCHANGE,DE_NET_EXPORT,DE_NET_IMPORT,DE_GAS,DE_COAL,DE_HYDRO,DE_NUCLEAR,...,DE_WINDPOW,DE_LIGNITE,DE_RESIDUAL_LOAD,DE_RAIN,DE_WIND,DE_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET
851,111,2,-0.068972,-1.102015,-1.080403,1.080403,1.228079,-0.247704,1.785758,0.064726,...,-0.421844,-0.471366,0.398812,-0.344600,-0.623041,-0.148950,1.802550,1.140920,0.900434,0.861270
852,800,3,-0.134670,-1.051716,-1.881881,1.881881,1.588542,-0.635452,1.994144,-2.002323,...,-1.142488,-0.169336,0.275487,-1.394561,0.244859,-1.710888,0.440121,-0.064550,-0.032756,7.138604
853,831,5,-0.297850,0.144615,-1.208286,1.208286,1.059828,-0.072071,1.275857,-1.875681,...,-0.892244,-0.279690,-0.031595,-1.374757,0.333817,-0.062187,-0.117977,0.550433,0.781870,0.026374
854,779,7,0.057599,0.002239,-0.676226,0.676226,0.386191,0.255380,0.281094,-1.887303,...,-0.876909,0.108801,0.142427,-0.531862,0.098940,-0.812164,-0.379980,0.518459,-0.034642,-0.021227
855,841,8,-0.282744,1.309253,0.248085,-0.248085,0.955400,-0.234473,0.385716,-1.889094,...,0.274719,-0.245104,-0.508221,1.522720,1.690497,-0.441943,1.557325,0.146634,2.329924,0.042996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489,282,1202,-1.182561,-1.201004,-1.616737,1.616737,-0.052984,-1.334403,0.109583,-1.312022,...,0.116425,-2.701475,-1.033438,-0.085663,-0.595651,0.039450,0.184767,0.257420,0.507079,-0.091774
1490,505,1207,1.002463,0.651648,0.728817,-0.728817,0.321619,-1.061328,0.086429,-0.572800,...,1.799559,-1.963002,-1.238062,-0.222964,0.371728,0.355241,0.099209,1.750872,0.646905,-1.324555
1491,823,1208,0.046568,0.427976,-0.578391,0.578391,1.391369,0.069929,1.620765,-1.891245,...,-0.572220,0.141485,0.151799,-1.224735,0.472212,1.065727,0.291714,0.568479,0.395742,0.060380
1492,771,1212,-0.046100,-0.270515,-0.137917,0.137917,0.231723,-0.491812,-0.441138,-1.885215,...,-0.245558,-0.159332,-0.623925,-0.039379,1.358211,-0.707839,0.017778,0.072168,-0.160792,-0.144320


In [19]:
# Aggregate features: each new column is the row-wise sum of a group of existing columns.
# NOTE(review): DE_LIGNITE and DE_RESIDUAL_LOAD are summed into NEW_ENERGY — confirm this grouping is intended.
feature_groups = {
    'NEW_ENERGY': ['DE_HYDRO', 'DE_NUCLEAR', 'DE_WINDPOW', 'DE_SOLAR', 'DE_LIGNITE', 'DE_RESIDUAL_LOAD'],
    'OLD_ENERGY': ['DE_GAS', 'DE_COAL'],
    'WEATHER': ['DE_RAIN', 'DE_WIND', 'DE_TEMP'],
}
for feature_name, (first_col, *other_cols) in feature_groups.items():
    # Built-in sum() seeded with the first column adds the remaining columns left to right,
    # i.e. the same sequence of Series additions as a chained `a + b + c`.
    de_train[feature_name] = sum((de_train[col] for col in other_cols), de_train[first_col])

In [20]:
# Map DAY_ID onto a repeating 7-step cycle, shifted by one so the cycle does not start at 0.
# NOTE(review): this equals a calendar weekday only if DAY_ID counts consecutive days — confirm.
de_train['WEEKDAY'] = de_train['DAY_ID'].mod(7).add(1)

In [21]:
# Target vector.
de_y = de_train['TARGET']

# Feature matrix: every column except the target, ID, DAY_ID and DE_NET_EXPORT.
# In the rows displayed above, DE_NET_IMPORT is the exact negative of DE_NET_EXPORT,
# so presumably only one of the pair is kept — verify.
non_feature_columns = ['TARGET', 'ID', 'DAY_ID', 'DE_NET_EXPORT']
de_X = de_train.drop(columns=non_feature_columns)

# Show which columns remain as model inputs.
de_X.columns

Index(['DE_CONSUMPTION', 'DE_FR_EXCHANGE', 'DE_NET_IMPORT', 'DE_GAS',
       'DE_COAL', 'DE_HYDRO', 'DE_NUCLEAR', 'DE_SOLAR', 'DE_WINDPOW',
       'DE_LIGNITE', 'DE_RESIDUAL_LOAD', 'DE_RAIN', 'DE_WIND', 'DE_TEMP',
       'GAS_RET', 'COAL_RET', 'CARBON_RET', 'NEW_ENERGY', 'OLD_ENERGY',
       'WEATHER', 'WEEKDAY'],
      dtype='object')

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
de_X_train, de_X_valid, de_y_train, de_y_valid = train_test_split(
    de_X,
    de_y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature matrix next to the shape of its target vector.
for split_label, split_X, split_y in (
    ("train dataset:", de_X_train, de_y_train),
    ("validation dataset:", de_X_valid, de_y_valid),
):
    print(split_label, split_X.shape, split_y.shape)

train dataset: (514, 21) (514,)
validation dataset: (129, 21) (129,)


# Tuning

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Baseline: an SVR with library-default hyperparameters.
svr_model = SVR()

# fit() returns the estimator itself, so fitting on the training split and predicting
# the validation split can be chained. The predictions are not scored in this cell.
de_y_pred_svr = svr_model.fit(de_X_train, de_y_train).predict(de_X_valid)

In [26]:
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer


def spearman_corr(y_true, y_pred):
    """Return the Spearman rank correlation between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    float
        The ``correlation`` field of the ``scipy.stats.spearmanr`` result.
    """
    # `spearmanr` is imported at the top of this cell so the function (and the scorer
    # built from it) works as soon as this cell has run, without relying on a later cell.
    return spearmanr(y_true, y_pred).correlation


# Scorer wrapper around spearman_corr. Note that the search below is scored with
# negative MSE, not with this scorer.
spearman_scorer = make_scorer(spearman_corr)

# 3 kernels x 4 C values x 3 gamma values = 36 candidates, each cross-validated 5-fold.
# NOTE(review): scikit-learn documents `gamma` as the coefficient for 'rbf', 'poly' and
# 'sigmoid', so the 'linear' candidates that differ only in gamma appear to be repeats.
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1],
}

# Seed the search with a fresh default SVR instead of the notebook-level `svr_model`:
# that name is rebound elsewhere in the notebook, so re-running this cell out of order
# would otherwise search around whatever hyperparameters it happened to hold.
grid_search = GridSearchCV(SVR(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(de_X_train, de_y_train)

print("Best parameters:", grid_search.best_params_)

# Score the refitted best candidate on the held-out validation split.
best_svr = grid_search.best_estimator_
de_y_pred_svr = best_svr.predict(de_X_valid)
mse = mean_squared_error(de_y_valid, de_y_pred_svr)
print("Mean Squared Error:", mse)


Best parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Mean Squared Error: 1.1233387660170195


In [32]:
# spearman_corr resolves `spearmanr` from the notebook namespace when it is called.
from scipy.stats import spearmanr

# Rank correlation between the validation targets and the grid-search winner's predictions.
best_model = grid_search.best_estimator_
validation_score = spearman_corr(
    y_true=de_y_valid,
    y_pred=best_model.predict(de_X_valid),
)

print("Validation Score with Best Parameters:", validation_score)

Validation Score with Best Parameters: 0.4227470930232559


### After adjusting several parameters by hand, `SVR(kernel='linear', C=100.0, epsilon=0.499)` has the best performance

In [35]:
from sklearn.svm import SVR
from scipy.stats import spearmanr

# Linear-kernel SVR with explicitly chosen C and epsilon; printing shows the configuration.
svr_model = SVR(C=100.0, epsilon=0.499, kernel='linear')
print(svr_model)

# Fit on the training split and predict the validation split in one chain
# (fit() returns the estimator itself).
# NOTE(review): the `fr_` prefix is used although the inputs are the `de_` splits — confirm the name.
fr_y_pred_svr = svr_model.fit(de_X_train, de_y_train).predict(de_X_valid)

# Both validation metrics are computed first, then reported in the order: correlation, MSE.
spearman_corr_svr, _ = spearmanr(de_y_valid, fr_y_pred_svr)
mse_svr = mean_squared_error(de_y_valid, fr_y_pred_svr)

print("sc:", spearman_corr_svr)
print("mse:", mse_svr)

SVR(C=100.0, epsilon=0.499, kernel='linear')
sc: 0.45473501788908777
mse: 1.087742796741826
