# Using auto-sklearn
Links:

- https://automl.github.io/auto-sklearn/master/index.html


In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (IPython's default setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import Audio, display
def make_noise():
    """Display an auto-playing audio widget for a fixed sound URL.

    Takes no arguments and returns nothing; the only effect is the
    widget shown in the cell output.
    """
    sound_url = 'https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'
    clip = Audio(url=sound_url, autoplay=True)
    display(clip)

In [None]:
# NOTE(review): hard-coded, machine-specific absolute path. No other cell visible
# in this notebook reads `data_folder`; the round manager is built from
# NUMERAI_DATA_FOLDER instead.
data_folder = '/home/pica/nas_pica/Data/numerai/'

from numebot_private.round_manager_extended import RoundManagerExtended
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from numebot.secret import PUBLIC_ID, SECRET_KEY
import sklearn

In [None]:
from numebot.data.data_constants import NC
from numebot.secret import PUBLIC_ID, SECRET_KEY
from numebot.env import NUMERAI_DATA_FOLDER, MODEL_CONFIGS_PATH

from numebot_private.round_manager_extended import RoundManagerExtended

# Gather the keyword options first so the constructor call stays short.
# Disabled options (left commented out): nrows=100000, testing=True
rm_kwargs = dict(
    model_configs_path=MODEL_CONFIGS_PATH,
    public_id=PUBLIC_ID,
    secret_key=SECRET_KEY,
    save_memory=False,
)
rm = RoundManagerExtended(NUMERAI_DATA_FOLDER, **rm_kwargs)

# Show the list of models together with their model files.
rm.models_info()

# Training on the train_val data

In [None]:
# Feature columns are picked by name prefix from the training frame.
feature_cols = [name for name in rm.data.train.columns if name.startswith("feature")]

# Fitting inputs come from the train_val split and are extracted with `.values`:
# features, target, and the era label of each row (used as the CV group).
X_train = rm.data.train_val[feature_cols].values
y_train = rm.data.train_val[NC.target].values
groups_train = rm.data.train_val[NC.era].values

# Validation and test inputs are kept as pandas objects (no `.values`).
X_val, y_val = rm.data.val[feature_cols], rm.data.val[NC.target]
X_test, y_test = rm.data.test[feature_cols], rm.data.test[NC.target]

In [None]:
# Sanity check: show the shapes of the group labels and of the feature matrix.
groups_train.shape
X_train.shape

In [None]:
import autosklearn.regression
from sklearn.model_selection import GroupKFold
from autosklearn.metrics import mean_squared_error
import shutil
from pathlib import Path

# Search budget handed to auto-sklearn as `time_left_for_this_task`.
# Despite its name, `n_hours` holds SECONDS: 1.5 days = 129600 s.
n_hours = int(3600*24*1.5)

# Name the artifacts after the budget in whole hours (129600 s -> "36h"),
# matching the naming used by the test-data run later in this notebook.
model_name = f'automl_{n_hours//3600}h'
file = f'{model_name}.pkl'

temp_folder = Path(f'/tmp/autosklearn_regression_{model_name}_tmp')

# Remove any leftover scratch directory so the run starts from an empty one.
if temp_folder.exists():
    shutil.rmtree(temp_folder)

# Grouped 5-fold cross-validation: `groups_train` supplies the group label of
# every training row, so rows sharing a label stay in the same fold.
cv_arguments = {'n_splits': 5, 'groups': groups_train}

# `mean_squared_error` here is the auto-sklearn metric object imported in this
# cell, not the scikit-learn function of the same name.
automl = autosklearn.regression.AutoSklearnRegressor(
    metric=mean_squared_error,
    resampling_strategy=GroupKFold,
    resampling_strategy_arguments=cv_arguments,
    time_left_for_this_task=n_hours,
    tmp_folder=temp_folder,
    n_jobs=5,
    memory_limit=16000,
)

# Run the search, then refit the selected models on the same full arrays.
automl.fit(X_train, y_train)
automl.refit(X_train, y_train)

import pickle

# Persist the fitted estimator. The context manager closes (and flushes) the
# file handle even if pickling raises, instead of leaking it.
with open(file, 'wb') as model_file:
    pickle.dump(automl, model_file)

In [None]:
# Print auto-sklearn's description of the models it ended up with.
models_report = automl.show_models()
print(models_report)

In [None]:
# Score the fitted model on the validation split.
predictions = automl.predict(X_val)
print("R2 score:", sklearn.metrics.r2_score(y_val, predictions))
# sklearn's mean_squared_error returns the MSE (no square root is taken here),
# so the value is labelled as MSE rather than RMS.
print("MSE score:", sklearn.metrics.mean_squared_error(y_val, predictions))

In [None]:
# Select the feature columns by name prefix on the tournament frame.
feature_cols = [name for name in rm.data.tournament.columns if name.startswith("feature")]

# Predict on the tournament features and index the result like the input rows.
output_values = automl.predict(rm.data.tournament[feature_cols])
output = pd.DataFrame(
    data={'prediction': output_values},
    index=rm.data.tournament.index,
)

# Write the predictions (with header row and index) to a CSV named after the model.
submission_path = f'{model_name}_submission.csv'
output.to_csv(submission_path, header=True)

In [None]:
# Compare distributions: validation targets (100 bins) versus the model's
# validation predictions (pandas' default bin count).
y_val.hist(bins=100)
pd.Series(predictions).hist()


In [None]:
import pickle

# Round-trip check: reload the pickled estimator and predict with the copy.
# The context manager closes the file handle instead of leaking it.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(file, 'rb') as model_file:
    automl_2 = pickle.load(model_file)
automl_2.predict(X_val)

In [None]:
# Show the validation predictions returned by automl.predict(X_val).
predictions

# Training on the test data

In [None]:
# Feature columns are picked by name prefix, this time from the test frame.
feature_cols = [name for name in rm.data.test.columns if name.startswith("feature")]

# NOTE: these assignments re-bind X_train / y_train / groups_train, which
# earlier in the notebook held the train_val arrays. From here on they hold
# the test split.
groups_train = rm.data.test[NC.era].values
y_train = rm.data.test[NC.target].values
X_train = rm.data.test[feature_cols].values

In [None]:
# Sanity check: show the shapes of the group labels and of the feature matrix.
groups_train.shape
X_train.shape

In [None]:
# Distinct target values present in the test split.
rm.data.test[NC.target].unique()

In [None]:
# Preview the first rows of the test split.
rm.data.test.head()

In [None]:
# Distinct values over the whole feature matrix (np.unique flattens its input
# when no axis is given).
np.unique(X_train)

In [None]:
# Distinct values in the target array that is passed to fit().
np.unique(y_train)

In [None]:
import autosklearn.regression
from sklearn.model_selection import GroupKFold
from autosklearn.metrics import mean_squared_error
import shutil
from pathlib import Path

# Search budget handed to auto-sklearn as `time_left_for_this_task`.
# Despite its name, `n_hours` holds SECONDS: 1.5 days = 129600 s.
n_hours = int(3600*24*1.5)

# Artifact names carry the budget in whole hours.
model_name = f'automl_test_{n_hours//3600}h'
file = f'{model_name}.pkl'

# Scratch directory for this run; any leftover copy is removed first.
temp_folder = Path(f'/tmp/autosklearn_regression_{model_name}_tmp')
if temp_folder.exists():
    shutil.rmtree(temp_folder)

# Grouped 5-fold cross-validation: `groups_train` supplies the group label of
# every row, so rows sharing a label stay in the same fold.
cv_arguments = {'n_splits': 5, 'groups': groups_train}

# `mean_squared_error` here is the auto-sklearn metric object imported in this
# cell, not the scikit-learn function of the same name.
automl = autosklearn.regression.AutoSklearnRegressor(
    metric=mean_squared_error,
    resampling_strategy=GroupKFold,
    resampling_strategy_arguments=cv_arguments,
    time_left_for_this_task=n_hours,
    tmp_folder=temp_folder,
    n_jobs=5,
    memory_limit=16000,
)

# Run the search, then refit the selected models on the same full arrays.
automl.fit(X_train, y_train)
automl.refit(X_train, y_train)

import pickle

# Persist the fitted estimator. The context manager closes (and flushes) the
# file handle even if pickling raises, instead of leaking it.
with open(file, 'wb') as model_file:
    pickle.dump(automl, model_file)

In [None]:
# Announce completion (typo in the message fixed), then print auto-sklearn's
# description of the models it ended up with.
print(f'Finished {model_name}')
print(automl.show_models())