# QRT ENS Data Challenge 2023
This notebook details the benchmark construction for the challenge - it may also help participants to start the competition.

## Libraries

In [182]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings 
warnings.filterwarnings("ignore")
from sklearn import ensemble 
from sklearn.svm import SVC

## Loading data

- `X_train` and `X_test` both have $35$ columns that represent the same explanatory variables but over different time periods. 

- `X_train` and `Y_train` share the same column `ID` - each row corresponds to a unique ID associated wwith a day and a country. 

- The target of this challenge `TARGET` in `Y_train` corresponds to the price change for daily futures contracts of 24H electricity baseload. 

- **You will notice some columns have missing values**.


In [183]:
# After downloading the X_train/X_test/Y_train .csv files in your working directory:

X_train = pd.read_csv('X_train.csv')
Y_train = pd.read_csv('Y_train.csv')
X_test = pd.read_csv('X_test.csv')

In [184]:
# Load the data
X_train_clean = pd.read_csv('X_train.csv')
Y_train_clean = pd.read_csv('Y_train.csv')
X_test_clean = pd.read_csv('X_test.csv')


In [185]:
# Feature Engineering
# Example: Creating a new feature as the product of gas and coal prices
X_train['GAS_COAL_PRODUCT'] = X_train['GAS_RET'] * X_train['COAL_RET']
X_test['GAS_COAL_PRODUCT'] = X_test['GAS_RET'] * X_test['COAL_RET']


In [186]:
# Feature Engineering
# Example: Creating a new feature as the product of gas and coal prices
X_train_clean['GAS_COAL_PRODUCT'] = X_train_clean['GAS_RET'] * X_train_clean['COAL_RET']
X_test_clean['GAS_COAL_PRODUCT'] = X_test_clean['GAS_RET'] * X_test_clean['COAL_RET']


In [187]:
X_train_clean.drop_duplicates(subset=['DAY_ID'], inplace=True)
X_test_clean.drop_duplicates(subset=['DAY_ID'], inplace=True)

In [188]:
Y_train_clean = Y_train_clean[Y_train_clean["ID"].isin(X_train_clean["ID"])]

In [189]:
# Train-Test Split
X_train_clean = X_train_clean.drop(['COUNTRY', 'DAY_ID', 'ID'], axis=1)
X_test_clean = X_test_clean.drop(['COUNTRY', 'DAY_ID', 'ID'], axis=1)


In [190]:
Y_train_clean = Y_train_clean['TARGET']

In [191]:
for col in X_test_clean.columns:
    X_test_clean[str(col)].fillna(X_test_clean[str(col)].median(), inplace=True)

In [192]:
for col in X_train_clean.columns:
    X_train_clean[str(col)].fillna(X_train_clean[str(col)].median(), inplace=True)

In [193]:
X_train_split, X_val_split, Y_train_split, Y_val_split = train_test_split(X_train_clean, Y_train_clean, test_size=0.2, random_state=42)

## xgb

In [194]:
# XGBoost
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train_split, Y_train_split)             

In [195]:
# XGBoost   
output_train_xgb = xgb_model.predict(X_train_split)
output_val_xgb = xgb_model.predict(X_val_split)

correlation_train_xgb = spearmanr(output_train_xgb, Y_train_split).correlation
correlation_val_xgb = spearmanr(output_val_xgb, Y_val_split).correlation

print(f"Spearman's correlation for XGBoost on training set: {correlation_train_xgb}")
print(f"Spearman's correlation for XGBoost on validation set: {correlation_val_xgb}")


Spearman's correlation for XGBoost on training set: 0.9997772358893507
Spearman's correlation for XGBoost on validation set: 0.14114047087623302


In [196]:
# XGBoost
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'min_child_weight': [1, 3, 5]
}

### grid search

In [197]:
# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [198]:
grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=kf, scoring='neg_mean_squared_error')
grid_search_xgb.fit(X_train_clean, Y_train_clean)
best_grid_xgb_model = grid_search_xgb.best_estimator_

In [199]:
# XGBoost
output_train_best_xgb = best_grid_xgb_model.predict(X_train_split)
output_val_best_xgb = best_grid_xgb_model.predict(X_val_split)

correlation_train_best_xgb = spearmanr(output_train_best_xgb, Y_train_split).correlation
correlation_val_best_xgb = spearmanr(output_val_best_xgb, Y_val_split).correlation

print(f"Spearman's correlation for tuned (grid search) XGBoost on training set: {correlation_train_best_xgb}")
print(f"Spearman's correlation for tuned (grid search) XGBoost on validation set: {correlation_val_best_xgb}")

Spearman's correlation for tuned (grid search) XGBoost on training set: 0.4315286985032589
Spearman's correlation for tuned (grid search) XGBoost on validation set: 0.548065188516722


### randomised search kf

In [200]:
folds = 3
param_comb = 5

# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [201]:
random_search_xgb = RandomizedSearchCV(xgb_model, param_distributions=param_grid_xgb, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=kf.split(X_train_clean, Y_train_clean), verbose=3, random_state=1001 )
random_search_xgb.fit(X_train_clean, Y_train_clean)
best_random_xgb_model = random_search_xgb.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


  if is_sparse(dtype):
  if is_sparse(dtype):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = s

[CV 4/5] END learning_rate=0.01, max_depth=10, min_child_weight=3, n_estimators=200;, score=nan total time=   0.7s
[CV 3/5] END learning_rate=0.01, max_depth=10, min_child_weight=3, n_estimators=200;, score=nan total time=   0.7s
[CV 1/5] END learning_rate=0.01, max_depth=10, min_child_weight=3, n_estimators=200;, score=nan total time=   0.7s
[CV 2/5] END learning_rate=0.01, max_depth=10, min_child_weight=3, n_estimators=200;, score=nan total time=   0.7s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Traceback (most recent call last

[CV 1/5] END learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=200;, score=nan total time=   0.7s
[CV 3/5] END learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=200;, score=nan total time=   0.7s
[CV 2/5] END learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=200;, score=nan total time=   0.7s
[CV 5/5] END learning_rate=0.01, max_depth=10, min_child_weight=3, n_estimators=200;, score=nan total time=   0.7s
[CV 1/5] END learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50;, score=nan total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50;, score=nan total time=   0.1s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Traceback (most recent call last

[CV 3/5] END learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50;, score=nan total time=   0.1s
[CV 4/5] END learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50;, score=nan total time=   0.1s
[CV 5/5] END learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50;, score=nan total time=   0.1s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Traceback (most recent call last

[CV 1/5] END learning_rate=0.2, max_depth=5, min_child_weight=5, n_estimators=200;, score=nan total time=   0.4s
[CV 4/5] END learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=200;, score=nan total time=   0.7s
[CV 2/5] END learning_rate=0.2, max_depth=5, min_child_weight=5, n_estimators=200;, score=nan total time=   0.4s
[CV 5/5] END learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=200;, score=nan total time=   0.7s
[CV 1/5] END learning_rate=0.2, max_depth=3, min_child_weight=1, n_estimators=50;, score=nan total time=   0.1s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Traceback (most recent call last

[CV 2/5] END learning_rate=0.2, max_depth=3, min_child_weight=1, n_estimators=50;, score=nan total time=   0.1s
[CV 3/5] END learning_rate=0.2, max_depth=3, min_child_weight=1, n_estimators=50;, score=nan total time=   0.1s
[CV 4/5] END learning_rate=0.2, max_depth=3, min_child_weight=1, n_estimators=50;, score=nan total time=   0.1s
[CV 3/5] END learning_rate=0.2, max_depth=5, min_child_weight=5, n_estimators=200;, score=nan total time=   0.4s
[CV 5/5] END learning_rate=0.2, max_depth=3, min_child_weight=1, n_estimators=50;, score=nan total time=   0.1s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 4/5] END learning_rate=0.2, max_depth=5, min_child_weight=5, n_estimators=200;, score=nan total time=   0.4s
[CV 5/5] END learning_rate=0.2, max_depth=5, min_child_weight=5, n_estimators=200;, score=nan total time=   0.4s


In [202]:
# XGBoost
output_train_best_random_xgb = best_random_xgb_model.predict(X_train_split)
output_val_best_random_xgb = best_random_xgb_model.predict(X_val_split)

correlation_train_best_random_xgb = spearmanr(output_train_best_random_xgb, Y_train_split).correlation
correlation_val_best_random_xgb = spearmanr(output_val_best_random_xgb, Y_val_split).correlation

print(f"Spearman's correlation for tuned using random search XGBoost on training set: {correlation_train_best_random_xgb}")
print(f"Spearman's correlation for tuned using random search XGBoost on validation set: {correlation_val_best_random_xgb}")

Spearman's correlation for tuned using random search XGBoost on training set: 0.9113116806516159
Spearman's correlation for tuned using random search XGBoost on validation set: 0.901139990880073


In [203]:
X_test = pd.read_csv('X_test.csv')
X_test_clean = pd.read_csv('X_test.csv')
X_test_clean['GAS_COAL_PRODUCT'] = X_test_clean['GAS_RET'] * X_test_clean['COAL_RET']
X_test_clean = X_test_clean.drop(['COUNTRY', 'DAY_ID', 'ID'], axis=1)
for col in X_test_clean.columns:
    X_test_clean[str(col)].fillna(X_test_clean[str(col)].median(), inplace=True)


In [204]:
Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = best_random_xgb_model.predict(X_test_clean)
Y_test_submission.to_csv('sharmin_xgb_medianfill_random.csv', index=False)

## random forest

In [158]:
# Random Forest
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_split, Y_train_split)

In [159]:
output_train_rf = rf_model.predict(X_train_split)
output_val_rf = rf_model.predict(X_val_split)

correlation_train_rf = spearmanr(output_train_rf, Y_train_split).correlation
correlation_val_rf = spearmanr(output_val_rf, Y_val_split).correlation

print(f"Spearman's correlation for Random Forest on training set: {correlation_train_rf}")
print(f"Spearman's correlation for Random Forest on validation set: {correlation_val_rf}")

Spearman's correlation for Random Forest on training set: 0.8865450973290795
Spearman's correlation for Random Forest on validation set: 0.22849497204022368


In [160]:
# Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [161]:
# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

### grid search

In [162]:
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=kf, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train_clean, Y_train_clean)
best_rf_model = grid_search_rf.best_estimator_

In [163]:
# Random forest
output_train_best_rf = best_rf_model.predict(X_train_split)
output_val_best_rf = best_rf_model.predict(X_val_split)

correlation_train_best_rf = spearmanr(output_train_best_rf, Y_train_split).correlation
correlation_val_best_rf = spearmanr(output_val_best_rf, Y_val_split).correlation

print(f"Spearman's correlation for tuned GBR on training set: {correlation_train_best_rf}")
print(f"Spearman's correlation for tuned GBR on validation set: {correlation_val_best_rf}")

Spearman's correlation for tuned GBR on training set: 0.6839724141931814
Spearman's correlation for tuned GBR on validation set: 0.7041015671874625


### randomised search

In [164]:
folds = 3
param_comb = 5

In [165]:
random_search_rf = RandomizedSearchCV(rf_model, param_distributions=param_grid_rf, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=kf.split(X_train_clean, Y_train_clean), verbose=3, random_state=1001 )

random_search_rf.fit(X_train_clean, Y_train_clean)

best_random_rf_model = random_search_rf.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 2/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   2.4s
[CV 3/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   2.4s
[CV 4/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   2.4s
[CV 1/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   2.5s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 2/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   2.5s
[CV 1/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   2.6s
[CV 3/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   2.5s
[CV 5/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   2.6s
[CV 2/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   0.7s
[CV 1/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   0.8s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   0.8s
[CV 4/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   0.9s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 5/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   0.8s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 4/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   2.7s
[CV 5/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   2.8s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 1/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   3.2s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   3.1s
[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   3.0s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   3.1s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 1/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   0.9s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 2/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   0.9s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 3/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   1.0s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 4/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   0.9s
[CV 5/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   0.9s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 5/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   3.2s


In [166]:
best_random_rf_model.fit(X_train_split, Y_train_split)
output_train_best_random_rf = best_random_rf_model.predict(X_train_split)
output_val_best_random_rf = best_random_rf_model.predict(X_val_split)

correlation_train_best_random_rf = spearmanr(output_train_best_random_rf, Y_train_split).correlation
correlation_val_best_random_rf = spearmanr(output_val_best_random_rf, Y_val_split).correlation

print(f"Spearman's correlation for tuned using random search RF on training set: {correlation_train_best_random_rf}")
print(f"Spearman's correlation for tuned using random search RF on validation set: {correlation_val_best_random_rf}")

Spearman's correlation for tuned using random search RF on training set: 0.8531483415484991
Spearman's correlation for tuned using random search RF on validation set: 0.24206446348429214


## gradient boosting regressor

In [167]:
params = {
    "n_estimators": 1000,
    "max_depth": 20,
    "min_samples_split": 5,
    "learning_rate": 0.001,
    "loss": "squared_error",
    }


In [168]:
# Gradient Boosting Regressor
gbr_model = ensemble.GradientBoostingRegressor(**params)
gbr_model.fit(X_train_split, Y_train_split)

In [169]:
output_train_gbr = gbr_model.predict(X_train_split)
output_val_gbr = gbr_model.predict(X_val_split)

correlation_train_gbr = spearmanr(output_train_gbr, Y_train_split).correlation
correlation_val_gbr = spearmanr(output_val_gbr, Y_val_split).correlation

print(f"Spearman's correlation for GBR on training set: {correlation_train_gbr}")
print(f"Spearman's correlation for GBR on validation set: {correlation_val_gbr}")

Spearman's correlation for GBR on training set: 0.9790713556915608
Spearman's correlation for GBR on validation set: 0.07212902296781626


In [170]:
# Random Forest
param_grid_gbr = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [171]:
# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

### grid search

In [172]:
grid_search_gbr = GridSearchCV(gbr_model, param_grid_gbr, cv=kf, scoring='neg_mean_squared_error')
grid_search_gbr.fit(X_train_clean, Y_train_clean)
best_gbr_model = grid_search_gbr.best_estimator_

In [173]:
# Gradient Boost
output_train_best_gbr = best_gbr_model.predict(X_train_split)
output_val_best_gbr = best_gbr_model.predict(X_val_split)

correlation_train_best_gbr = spearmanr(output_train_best_gbr, Y_train_split).correlation
correlation_val_best_gbr = spearmanr(output_val_best_gbr, Y_val_split).correlation

print(f"Spearman's correlation for tuned GBR on training set: {correlation_train_best_gbr}")
print(f"Spearman's correlation for tuned GBR on validation set: {correlation_val_best_gbr}")

Spearman's correlation for tuned GBR on training set: 0.6478203895660904
Spearman's correlation for tuned GBR on validation set: 0.678784002505943


### random search

In [174]:
folds = 3
param_comb = 5

In [175]:
random_search_gbr = RandomizedSearchCV(gbr_model, param_distributions=param_grid_gbr, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=kf.split(X_train_clean, Y_train_clean), verbose=3, random_state=1001 )

random_search_gbr.fit(X_train_clean, Y_train_clean)

best_random_gbr_model = random_search_gbr.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 3/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   4.0s
[CV 4/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   4.0s
[CV 2/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   4.2s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 1/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   4.9s
[CV 5/5] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=nan total time=   3.5s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 2/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   4.0s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 3/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   4.0s
[CV 1/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   4.9s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 2/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   1.3s
[CV 1/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   1.5s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 4/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   1.2s
[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   1.3s
[CV 4/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   4.1s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 5/5] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=nan total time=   3.5s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 5/5] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=nan total time=   1.3s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   5.0s
[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   5.1s
[CV 1/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   5.3s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   5.0s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 1/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   1.5s
[CV 2/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   1.5s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 3/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   1.5s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported

Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^

[CV 5/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   1.4s
[CV 4/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   1.5s


Traceback (most recent call last):
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sharmin/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: continuous format is not supported



[CV 5/5] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=nan total time=   5.2s


In [176]:
# XGBoost
output_train_best_random_gbr = best_random_gbr_model.predict(X_train_split)
output_val_best_random_gbr = best_random_gbr_model.predict(X_val_split)

correlation_train_best_random_gbr = spearmanr(output_train_best_random_gbr, Y_train_split).correlation
correlation_val_best_random_gbr = spearmanr(output_val_best_random_gbr, Y_val_split).correlation

print(f"Spearman's correlation for tuned using random search GBR on training set: {correlation_train_best_random_gbr}")
print(f"Spearman's correlation for tuned using random search GBR on validation set: {correlation_val_best_random_gbr}")

Spearman's correlation for tuned using random search GBR on training set: 0.9021233292408362
Spearman's correlation for tuned using random search GBR on validation set: 0.8905930126816939
