# Using mljar AutoML

Links:

- https://github.com/mljar/mljar-supervised
- https://supervised.mljar.com/
- https://github.com/mljar/mljar-supervised/blob/ede835a4f6d2fa478477b24d2728b3dd97f5351a/supervised/automl.py#L15


In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import Audio, display
def make_noise(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'):
    """Auto-play an audio clip in the notebook output.

    Handy as an audible signal when called at the end of a long-running cell.

    Args:
        url: Address of the audio clip to play. Defaults to the clip that was
            previously hard-coded, so existing ``make_noise()`` calls behave
            exactly as before.
    """
    display(Audio(url=url, autoplay=True))

In [None]:
data_folder = '/home/pica/nas_pica/Data/numerai/'

from numebot_private.round_manager_extended import RoundManagerExtended
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from numebot.secret import PUBLIC_ID, SECRET_KEY

In [None]:
data_folder = '/home/pica/nas_pica/Data/numerai/'

from numebot.data.data_constants import NC
from numebot.secret import PUBLIC_ID, SECRET_KEY

from numebot_private.round_manager_extended import RoundManagerExtended

rm = RoundManagerExtended(data_folder, 
                          public_id=PUBLIC_ID, 
                          secret_key=SECRET_KEY, 
                          save_memory=False,
                          nrows=10000, testing=True
                         )

# Get list of models with their model file
rm.models_info()

In [None]:
from supervised.automl import AutoML 
feature_cols = [f for f in rm.data.train.columns if f.startswith("feature")]
    
X_train = rm.data.train[feature_cols]
y_train = rm.data.train['target']#.astype(float).values

In [None]:
X_val = rm.data.val[feature_cols]
y_val = rm.data.val[NC.target]
X_test = rm.data.test[feature_cols]
y_test = rm.data.test[NC.target]

In [None]:
from pathlib import Path
import shutil

In [None]:
automl_results_path = '/home/pica/nas_pica/Data/numerai/models/sandbox/automl_test'

if Path(automl_results_path).exists():
    shutil.rmtree(automl_results_path)

automl = AutoML(
    results_path=automl_results_path,
    mode="Perform", 
    total_time_limit=60,#3600*5, 
    ml_task='regression',    
    eval_metric='spearman',
)
automl.fit(X_train, y_train)

In [None]:
X_test.head()
y_test[:5]

pd.Series(y_test).hist()
len(y_test)

In [None]:
# compute the MSE on validation data
val_predictions = automl.predict(X_val)
# Label fixed: this line reports the validation split, not the test split.
print("Val MSE:", mean_squared_error(y_val, val_predictions))
# compute the MSE on test data
test_predictions = automl.predict(X_test)
print("Test MSE:", mean_squared_error(y_test, test_predictions))

In [None]:
# Save the submission file
# The predictions computed in the previous cell are named `test_predictions`
# (`predictions` is not assigned by any earlier cell), and they were produced
# from X_test, so X_test.index is the index guaranteed to line up with them.
# NOTE(review): confirm X_test.index carries the ids expected in the submission.
predictions = pd.Series(test_predictions, index=X_test.index, name='prediction')
predictions.head()
predictions.hist()

# Single-column frame named 'prediction', as the CSV below expects.
predictions = predictions.to_frame()

predictions.to_csv('auto_ml_submission.csv', header=True)

In [None]:
pd.Series(y_test).hist()
predictions.hist()
len(y_test)

In [None]:
# compute the MSE on train data
predictions_train = automl.predict(X_train)
print("Train MSE:", mean_squared_error(y_train, predictions_train))

predictions_train = pd.Series(predictions_train, index=rm.data.train.index)
predictions_train.head()
predictions_train.hist()

pd.Series(y_train).hist()
predictions_train.hist()
len(y_train)

# RMSE 

In [None]:
automl = AutoML(
    mode="Perform", 
    total_time_limit=3600*5, 
    ml_task='regression',    
    eval_metric='rmse',
)
automl.fit(X_train, y_train)

In [None]:
X_test.head()
y_test[:5]

In [None]:
# Get preds for a model
output = model.predict(rm.data.tournament)
output.shape
output.head()

# Get performance of the model

In [None]:
# Get predictions for each model
rm.generate_predictions_for_all_models()

In [None]:
# Submit predictions (test with rpica_test_3)
# Can I check if I submitted? (for example requesting the scoring)


In [None]:
# List the public attributes/methods exposed by the API client.
# A plain loop is used because the prints are side effects; building a
# throwaway list of None values served no purpose.
for attr in dir(napi):
    if not attr.startswith('_'):
        print(attr)

In [None]:
# get competitions
all_competitions = napi.get_competitions()
all_competitions[:2]

In [None]:
# get leaderboard for the current round
leaderboard = napi.get_leaderboard(limit=10000)
len(leaderboard)
leaderboard_dict = {competitor['username']:competitor for competitor in leaderboard}

In [None]:
leaderboard_dict['rpica']
leaderboard_dict['rpica_test_1']

In [None]:
# check if a new round has started
if napi.check_new_round():
    print("new round has started wihtin the last 24hours!")
else:
    print("no new round within the last 24 hours")

In [None]:
# provide api tokens
from numebot.secret import PUBLIC_KEY, PRIVATE_KEY


In [None]:
# `numerapi` is used below but was never imported in this notebook,
# which made this cell fail with NameError.
import numerapi

# API client authenticated with the keys imported from numebot.secret
napi = numerapi.NumerAPI(PUBLIC_KEY, PRIVATE_KEY)

In [None]:
models_dict = napi.get_models()

# upload predictions
#submission_id = napi.upload_predictions("preds.csv", tournament=1)
# check submission status
napi.submission_status(model_id=models_dict['rpica'])

In [None]:
import pandas as pd
from xgboost import XGBRegressor
from pathlib import Path

# Dataset folder in use. An earlier alternative,
# '/home/pica/hdd/nas/Data/numerai/21-03-14 weekly/', was assigned and then
# immediately overridden by this value, so only this one ever took effect.
DATA_FOLDER = Path('/home/pica/hdd/nas/Data/numerai/numerai_dataset_258/')
OUTPUT_FOLDER = Path('/home/pica/hdd/nas/Data/numerai/output/')

# train data contains features and targets
training_data = pd.read_csv(DATA_FOLDER/"numerai_training_data.csv").set_index("id")

# tournament data contains features only
tournament_data = pd.read_csv(DATA_FOLDER/"numerai_tournament_data.csv").set_index("id")
feature_names = [f for f in training_data.columns if "feature" in f]

# Split the live rows from the rest. `.copy()` gives each subset its own data,
# so the 'era' assignments below do not write into a slice of the original
# frame (which triggers SettingWithCopyWarning).
is_live = tournament_data['data_type'] == 'live'
live_data = tournament_data[is_live].copy()
tournament_data = tournament_data[~is_live].copy()


def _strip_era_prefix(era):
    """Drop a leading 'era' from each string of the given Series.

    An anchored replace is used instead of ``str.lstrip('era')``, because
    lstrip removes any leading run of the characters 'e', 'r', 'a' rather
    than the literal prefix.
    """
    return era.str.replace(r'^era', '', regex=True)


training_data['era'] = _strip_era_prefix(training_data['era']).astype(int)
tournament_data['era'] = _strip_era_prefix(tournament_data['era']).astype(int)
# The live era is left as a string (no integer cast), as before.
live_data['era'] = _strip_era_prefix(live_data['era'])

training_data.shape
tournament_data.shape
live_data.shape

## EDA

In [None]:
training_data.info()
live_data.info()
tournament_data.info()

In [None]:
import numpy as np

In [None]:
# Summarise the eras present in each dataset, grouped by data_type.
for dataset, set_name in zip([training_data, tournament_data, live_data], ['train', 'tournament', 'live']):
    print(f'Info about {set_name}: shape {dataset.shape}')
    #dataset[[col for col in dataset.columns if 'feature' not in col]].head(2)
    era_summary = dataset.groupby('data_type')['era'].agg(
        ['count', 'min', 'max', pd.Series.nunique, lambda x: sorted(list(np.unique(x)))]
    )
    # A bare expression inside a loop body is not rendered by the notebook,
    # so the summary has to be displayed explicitly.
    display(era_summary)

train_era = training_data

## Experiment with feature neutralization 

In [None]:
def neutralize(df, target="prediction_kazutsugi", by=None, proportion=1.0):
    """Return the `target` column with its linear exposure to `by` removed.

    The scores are projected onto the space spanned by the `by` columns plus a
    constant column (least squares via the pseudo-inverse); `proportion` of
    that projection is subtracted and the result is divided by its standard
    deviation.

    Args:
        df: DataFrame holding the score column and the exposure columns.
            It is not modified.
        target: Name of the column with the scores to neutralize.
        by: Columns to neutralize against. Defaults to every column whose
            name starts with 'feature'.
        proportion: Fraction of the projection to subtract (1.0 removes it
            entirely).

    Returns:
        A new pandas Series, indexed like `df`, with the neutralized scores
        scaled to unit standard deviation.
    """
    if by is None:
        by = [x for x in df.columns if x.startswith('feature')]

    scores = df[target]
    exposures = df[by].values

    # constant column to make sure the series is completely neutral to exposures
    constant = np.full((len(exposures), 1), np.mean(scores))
    exposures = np.hstack((exposures, constant))

    projection = exposures @ (np.linalg.pinv(exposures) @ scores.values)

    # Build a new Series instead of using `scores -= ...`: the in-place form
    # can write through to `df[target]` and silently alter the caller's frame.
    neutral = scores - proportion * projection
    return neutral / neutral.std()

In [None]:
DATA_FOLDER = Path('/home/pica/hdd/nas/Data/numerai/numerai_dataset_258/')
OUTPUT_PATH = DATA_FOLDER/'submission.csv'

In [None]:
outputs = pd.read_csv(OUTPUT_PATH).set_index("id")
outputs.shape
outputs.head(2)
len(outputs) - outputs.nunique()

In [None]:
tournament_data = pd.read_csv(DATA_FOLDER/"numerai_tournament_data.csv").set_index("id")
tournament_data.shape

In [None]:
tournament_data['target'] = outputs
tournament_data.shape

In [None]:
neutralized = neutralize(tournament_data, target='target')

In [None]:
neutralized = pd.DataFrame(neutralized)
neutralized.head(2)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

neutralized.rename({'target': 'prediction'}, axis=1, inplace=True)
neutralized.head()
neutralized_scaled = neutralized.copy()
neutralized_scaled[['prediction']] = scaler.fit_transform(neutralized[['prediction']])

In [None]:
neutralized_scaled.describe().loc[['min', 'max'], :]

In [None]:
neutralized_scaled.to_csv(DATA_FOLDER/"submission_neutralized.csv", header=True)

## train

In [None]:
# train a model to make predictions on tournament data
model = XGBRegressor(max_depth=5, learning_rate=0.01, \
                     n_estimators=2000, colsample_bytree=0.1, n_jobs=-1)
model.fit(training_data[feature_names], training_data["target"])

# submit predictions to numer.ai
predictions = model.predict(tournament_data[feature_names])

In [None]:
predictions = pd.DataFrame({'predictions': predictions}, index=tournament_data.index)

In [None]:
# `predictions` was turned into a DataFrame (column 'predictions') in the
# previous cell, so it is used directly; wrapping a DataFrame in pd.Series
# does not yield a series of the prediction values.
predictions.shape
OUTPUT_FOLDER.mkdir(exist_ok=True, parents=True)
predictions.to_csv(OUTPUT_FOLDER/"predictions.csv")

In [None]:
# Quick overview of each dataset: shape, era counts/max, and unique ids.
for data in [training_data, tournament_data]:
    print(data.shape)
    print(data[['era']].nunique())
    print(data[['era']].max())
    print(data.index.nunique())
    # A bare `data.head()` inside a loop body is not rendered by the notebook,
    # so the preview is displayed explicitly.
    display(data.head())
    print('')
    

In [None]:
predictions

In [None]:
training_data.head()
tournament_data.head()

In [None]:
tournament_data.shape
tournament_data.head()

# For every era, report when its predictions contain repeated values.
for era in tournament_data['era'].unique():
    era_mask = tournament_data['era'] == era
    era_ids = tournament_data[era_mask].index
    era_preds = predictions[predictions.index.isin(era_ids)]

    n_rows = len(era_preds)
    n_distinct = era_preds['predictions'].nunique()
    if n_distinct != n_rows:
        print(f'Repeated values in {era}: len {n_rows}, unique values: {n_distinct}')

In [None]:
# compute the MSE on test data
predictions = automl.predict(X_test)
valid = ~np.isnan(y_test)
print("Test MSE:", mean_squared_error(y_test[valid], predictions[valid]))

predictions = pd.Series(predictions, index=rm.data.tournament.index)
predictions.head()
predictions.hist()

predictions = pd.DataFrame(predictions).rename({0: 'prediction'}, axis=1)
#predictions.to_csv('auto_ml_submission.csv', header=True)

In [None]:
pd.Series(y_test).hist()
predictions.hist()
len(y_test)

In [None]:
# compute the MSE on train data
predictions_train = automl.predict(X_train)
print("Train MSE:", mean_squared_error(y_train, predictions_train))

predictions_train = pd.Series(predictions_train, index=rm.data.train.index)
predictions_train.head()
predictions_train.hist()

pd.Series(y_train).hist()
predictions_train.hist()
len(y_train)

# R²

In [None]:
automl = AutoML(
    mode="Perform", 
    total_time_limit=3600*10, 
    ml_task='regression',    
    eval_metric='r2',
)
automl.fit(X_train, y_train)

In [None]:
X_test.head()
y_test[:5]

In [None]:
# compute the MSE on test data
predictions = automl.predict(X_test)
valid = ~np.isnan(y_test)
print("Test MSE:", mean_squared_error(y_test[valid], predictions[valid]))

predictions = pd.Series(predictions, index=rm.data.tournament.index)
predictions.head()
predictions.hist()

predictions = pd.DataFrame(predictions).rename({0: 'prediction'}, axis=1)
#predictions.to_csv('auto_ml_submission.csv', header=True)

In [None]:
pd.Series(y_test).hist()
predictions.hist()
len(y_test)

In [None]:
# compute the MSE on train data
predictions_train = automl.predict(X_train)
print("Train MSE:", mean_squared_error(y_train, predictions_train))

predictions_train = pd.Series(predictions_train, index=rm.data.train.index)
predictions_train.head()
predictions_train.hist()

pd.Series(y_train).hist()
predictions_train.hist()
len(y_train)

# rmse - with noisy data

In [None]:
train_with_noise = rm.data.noisy_training(5)
print(train_with_noise.shape)
for col in feature_cols:
    train_with_noise[col] = train_with_noise[col].astype(float)
    
X_train = train_with_noise[feature_cols]
y_train = train_with_noise['target'].astype(float).values

In [None]:
automl = AutoML(
    mode="Perform", 
    total_time_limit=3600*6, 
    ml_task='regression',    
    eval_metric='rmse',
    n_jobs=22,
)
automl.fit(X_train, y_train)

In [None]:
X_test.head()
y_test[:5]

In [None]:
# compute the MSE on test data
predictions = automl.predict(X_test)
valid = ~np.isnan(y_test)
print("Test MSE:", mean_squared_error(y_test[valid], predictions[valid]))

predictions = pd.Series(predictions, index=rm.data.tournament.index)
predictions.head()
predictions.hist()

predictions = pd.DataFrame(predictions).rename({0: 'prediction'}, axis=1)

In [None]:
predictions.to_csv('auto_ml_submission_with_noise.csv', header=True)

In [None]:
from IPython.display import Audio, display
def make_noise(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'):
    """Auto-play an audio clip in the notebook output.

    Args:
        url: Address of the audio clip to play. Defaults to the clip that was
            previously hard-coded, so ``make_noise()`` behaves as before.
    """
    display(Audio(url=url, autoplay=True))


# Audible signal that the cells above have finished running.
make_noise()

In [None]:
pd.Series(y_test).hist()
predictions.hist()
len(y_test)

In [None]:
# compute the MSE on train data
predictions_train = automl.predict(X_train)
print("Train MSE:", mean_squared_error(y_train, predictions_train))

In [None]:
predictions_train = pd.Series(predictions_train, index=train_with_noise.index)
predictions_train.head()
predictions_train.hist()

pd.Series(y_train).hist()
predictions_train.hist()
len(y_train)

In [None]:
# Careful: I ran and then removed the cell that created the noisy df

In [None]:
with_noise = rm.data.noisy_training(2)