In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression statement in a cell, not only
# the last one. Many cells below rely on this to display several results.
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import Audio, display
def make_noise(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'):
    """Play a sound in the notebook output area.

    Args:
        url: Address of the audio file to play. Defaults to the clip this
            notebook has always used, so a bare ``make_noise()`` call behaves
            as before.
    """
    # autoplay=True starts playback as soon as the widget is rendered.
    display(Audio(url=url, autoplay=True))

In [None]:
import numpy as np 
import pandas as pd
pd.options.display.max_columns = 300
import matplotlib.pyplot as plt


In [None]:
from numebot.data.data_constants import NC
from numebot.secret import PUBLIC_ID, SECRET_KEY
from numebot.env import NUMERAI_DATA_FOLDER, MODEL_CONFIGS_PATH
from numebot_private.round_manager_extended import RoundManagerExtended

# Build the round manager from the configured data folder, the model
# configuration file and the API credentials imported above.
rm = RoundManagerExtended(NUMERAI_DATA_FOLDER,
                          model_configs_path=MODEL_CONFIGS_PATH,
                          public_id=PUBLIC_ID, 
                          secret_key=SECRET_KEY, 
                          #nrows=50000, testing=True
                         )

# Get list of models with their model file
rm.models_info()

In [None]:
# Read the raw 'parameters' entry configured for model 'rpica_4'.
xx = rm.model_cfgs.loc['rpica_4', 'parameters']
#xx = xx.replace('´', '\'').replace('`', '\'').replace('‘', '\'').replace('’', '\'')
xx
# Turn the "key=value, ..." text into a dict by evaluating it as dict(...).
# SECURITY NOTE(review): eval() executes any expression found in the config
# string; this is only acceptable while the model-config file is trusted.
d = eval(f'dict({xx})')
d

# General commands 

In [None]:
# Generate predictions for every model managed by the round manager.
rm.generate_predictions_for_all_models()

In [None]:
# Submit predictions (test with rpica_test_3).
# TODO: find a way to check whether the submission went through
# (for example by requesting the scoring).
rm.submit_predictions()

In [None]:
# Report whether the API client says a new round has started.
if rm.napi.check_new_round():
    print("new round has started within the last 24 hours!")
else:
    print("no new round within the last 24 hours")

In [None]:
# List the competitions returned by the API, most recent 'openTime' first.
pd.DataFrame(rm.napi.get_competitions()).sort_values('openTime', ascending=False)

In [None]:
# Check whether the current round's submission is done, for two of the models.
rm['rpica'].round_submission_done()
rm['rpica_5'].round_submission_done()

In [None]:
# Show the submission filenames of several models for one round.
round_num = 263
model_names = ['rpica_4', 'rpica_5', 'rpica_test_4', 'rpica_test_5']

# Maps model name -> model id as expected by the API call below.
models_dict = rm['rpica'].models_dict
for model_name in model_names:
    # Expressions inside a loop are not auto-displayed, so display explicitly.
    display(pd.DataFrame(rm.napi.get_submission_filenames(round_num=round_num,
                                                          model_id=models_dict[model_name])))


# Monitoring

In [None]:
# Download the round details used by the monitoring plots below.
# The commented alternative loads them from a CSV instead.
full_df = rm.mm.download_round_details()
#full_df = rm.mm.load_round_details_csv()
full_df.shape

In [None]:
import numebot.monitoring.metrics_plotter as mp

# Round details restricted to two specific rounds (return value discarded).
_ = mp.plot_round_details(full_df, rounds=[262, 263])

# Round details restricted to two specific models (return value discarded).
_ = mp.plot_round_details(full_df, model_names=['rpica', 'rpica_test_1'])

In [None]:
# Collect submission status and leaderboard tables covering all models,
# then show their shapes and first rows.
models_status, models_leaderboard = rm.get_submission_status_and_leaderboard_for_all_models()
models_status.shape, models_leaderboard.shape

models_status.head()
models_leaderboard.head()

In [None]:
# Daily user performances of 'rpica_test_1' as a table.
# Not sure if this is useful (maybe rolling_score_rep)
www = pd.DataFrame(rm.napi.daily_user_performances("rpica_test_1"))
www.head()

## Correct function to get the data

In [None]:
# Latest daily submission performances (sorted by date, newest first) for two models.
pd.DataFrame(rm.napi.daily_submissions_performances('rpica_test_1')).sort_values(NC.date, ascending=False).head()
pd.DataFrame(rm.napi.daily_submissions_performances('rpica_4')).sort_values(NC.date, ascending=False).head()

In [None]:
import matplotlib.pyplot as plt

# Plot the daily correlation / MMC / FNC curves, one set of lines per round,
# for the first model in rm.models_dict.
plt.figure(figsize=(15,5))
ax = plt.subplot(1,1,1)
for model_name, _model in rm.models_dict.items():

    print(model_name)
    df = pd.DataFrame(rm.napi.daily_submissions_performances(model_name)).sort_values(NC.date, ascending=False)

    for round_number, round_df in df.groupby('roundNumber'):
        # Put the round in each label; otherwise the legend repeats
        # 'corr' / 'mmc' / 'fnc' once per round with no way to tell them apart.
        _=ax.plot(round_df['date'], round_df['correlation'], label=f'corr (round {round_number})')
        _=ax.plot(round_df['date'], round_df['mmc'], label=f'mmc (round {round_number})')
        _=ax.plot(round_df['date'], round_df['fnc'], label=f'fnc (round {round_number})')
    # Only the first model is plotted; the loop stops after one iteration.
    break

_=ax.set_title(f'Daily submission performance: {model_name}')
_=ax.set_xlabel('date')
_=ax.set_ylabel('score')
_=ax.legend()
    

# Model configurations

In [None]:
# Show the model configuration table held by the round manager.
rm.model_cfgs

# Check individual model

In [None]:
# Select one model by name; the following cells operate on it.
model = rm['rpica_test_5']

In [None]:
# Fetch the weekly submission status and the daily leaderboard of the selected
# model; to_be_saved=False is passed to both calls.
status_df = model.get_weekly_submission_status(to_be_saved=False)
lb_df = model.get_daily_leaderboard(to_be_saved=False)

status_df
lb_df

In [None]:
# Submit the predictions of the selected model only.
model.submit_predictions()

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBRegressor

# Smoke-test the randomized hyper-parameter search on the small iris dataset.
iris = load_iris()
model = XGBRegressor(max_depth=5, 
                     learning_rate=0.01, 
                     n_estimators=2000, 
                     colsample_bytree=0.1,
                     n_jobs=-1,)

# The iris rows are ordered by class, so unshuffled folds would hold out
# samples coming (almost) entirely from a single class. Shuffle with a fixed
# seed to keep the folds representative and reproducible.
gkf = KFold(n_splits=5, shuffle=True, random_state=0)
# Candidate values sampled by the search for each hyper-parameter.
distributions = dict(max_depth=[3, 4, 5, 6, 7], 
                     learning_rate=[0.1, 0.01, 0.001], 
                     n_estimators=[1000, 2000, 3000],
                     colsample_bytree=[0.05, 0.1, 0.3, 0.5])

clf = RandomizedSearchCV(model, distributions, random_state=0, cv=gkf, n_iter=20)

search = clf.fit(iris.data, iris.target)
search.best_params_

In [None]:
from sklearn.model_selection import KFold, GroupKFold
# Randomized hyper-parameter search for an XGBoost regressor on rm.data.test,
# cross-validated with the era column as the grouping key.
# Relies on XGBRegressor and RandomizedSearchCV imported in the previous cell.
data = rm.data
model = XGBRegressor(max_depth=5, 
                     learning_rate=0.01, 
                     n_estimators=2000, 
                     colsample_bytree=0.1,
                     n_jobs=-1,)
# The groups themselves are supplied through `groups=` in fit() below.
gkf = GroupKFold(n_splits=5)
# Candidate values sampled by the search for each hyper-parameter.
distributions = dict(max_depth=[3, 4, 5, 6, 7], 
                     learning_rate=[0.1, 0.01, 0.001], 
                     n_estimators=[1000, 2000, 3000],
                     colsample_bytree=[0.05, 0.1, 0.3, 0.5])

clf = RandomizedSearchCV(model, 
                         distributions, 
                         random_state=0, 
                         cv=gkf, 
                         scoring='neg_mean_squared_error',
                         verbose=10,
                         n_iter=20)

# Feature columns are the ones whose name starts with "feature".
feature_names = [f for f in data.test.columns if f.startswith("feature")]
search = clf.fit(data.test[feature_names], data.test[NC.target], groups=data.test[NC.era])

search.best_params_

In [None]:
# Train with best parameters
from xgboost import XGBRegressor
# Feature columns are the ones whose name starts with "feature".
feature_names = [f for f in rm.data.test.columns if f.startswith("feature")]

# NOTE(review): the hyper-parameters are hard-coded here; confirm they match
# search.best_params_ from the tuning cell before relying on this model.
model = XGBRegressor(max_depth=3, 
                     learning_rate=0.01, 
                     n_estimators=2000, 
                     colsample_bytree=0.3,
                     n_jobs=-1,)

model.fit(rm.data.test[feature_names], rm.data.test[NC.target])

# Attach the fitted estimator to 'rpica_1' and save it.
rm['rpica_1'].model = model
rm['rpica_1'].save_model()

In [None]:
# Mark 'rpica_1' as ready, predict on the tournament data (saving the result
# for submission) and submit it.
rm['rpica_1'].model_ready = True
rm['rpica_1'].predict(rm.data.tournament, to_be_saved_for_submission=True)
rm['rpica_1'].submit_predictions()


In [None]:
# Play the notification sound to signal that execution reached this cell.
make_noise()

In [None]:
# Predict on the tournament data with the fitted search object and write the
# predictions to a CSV file, keeping the tournament index.
feature_cols = [f for f in rm.data.tournament.columns if f.startswith("feature")]
output_values = search.predict(rm.data.tournament[feature_cols])
output = pd.DataFrame({'prediction': output_values}, index=rm.data.tournament.index)

# Plain string: the original f-string had no placeholders.
output.to_csv('ExampleWithTestDataAndTuning_submission.csv', header=True)

# EDA

In [None]:
# For each data split: print its shape and show, per data_type, the number of
# rows plus the min / max / distinct count / sorted list of its eras.
for dataset, set_name in zip([rm.data.train, rm.data.val, rm.data.test, rm.data.live], ['train', 'val', 'test', 'live']):
    print(f'Info about {set_name}: shape {dataset.shape}')
    #dataset[[col for col in dataset.columns if 'feature' not in col]].head(2)
    era_summary = dataset.groupby('data_type')['era'].agg(
        ['count',
         'min',
         'max', 
         pd.Series.nunique, 
         lambda x: sorted(list(np.unique(x)))
        ])
    # Only top-level cell expressions are auto-displayed; inside the loop the
    # table was computed and thrown away, so display it explicitly.
    display(era_summary)

In [None]:
# Distinct target values present in each data split.
rm.data.train[NC.target].unique()
rm.data.val[NC.target].unique()
rm.data.test[NC.target].unique()
rm.data.live[NC.target].unique()

In [None]:
import chart_studio.plotly as py
import plotly.figure_factory as ff

#table = ff.create_table(rm.data.val)
#py.iplot(table, filename='jupyter-table1')

In [None]:
# Histogram of the target column for the train / val / test splits, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
target_splits = [('train', rm.data.train), ('val', rm.data.val), ('test', rm.data.test)]
for ax, (set_name, dataset) in zip(axes, target_splits):
    dataset[NC.target].hist(ax=ax)
    # Title each panel so the three histograms can be told apart.
    ax.set_title(f'{set_name} target')

# Experiment: find similarity between eras

In [None]:
# Peek at the first five training rows.
rm.data.train.iloc[:5]

In [None]:
# Size of the combined dataset used for the projections below.
rm.data.all.shape

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the feature columns of the combined dataset and
# keep the projected coordinates as a DataFrame indexed like the data.
pca = PCA(n_components=2)
# fit() returns the estimator itself; fit_transform() returns the projection.
transformed = pca.fit_transform(rm.data.all[rm.data.features])
transformed = pd.DataFrame(transformed, index=rm.data.all.index)

print(pca.explained_variance_ratio_)

print(pca.singular_values_.shape)

# Persist the fitted PCA so it can be reloaded without refitting.
from joblib import dump, load
dump(pca, 'pca_fitted.joblib') 

In [None]:
# Reload the PCA saved to disk. The next cell projects with the in-memory
# `pca`, not with `pca_x`.
pca_x = load('pca_fitted.joblib') 

In [None]:
# Keep the projection as a DataFrame indexed like the data: the scatter cell
# below selects rows with .loc, which a bare ndarray does not support.
transformed = pd.DataFrame(pca.transform(rm.data.all[rm.data.features]),
                           index=rm.data.all.index)

In [None]:
# Show which data_type values exist, then define the colour used for each.
# NOTE(review): values missing from this dict become NaN under .map(colormap);
# confirm the unique() output contains no other data_type.
rm.data.all["data_type"].unique()
colormap = {'train': "blue", 
            'validation': "orange", 
            'test': "red"}

In [None]:
# Scatter a random 10% of all rows in the two-component projection,
# coloured by data_type.
plt.figure(figsize=(15, 15))
indexes = rm.data.all.sample(frac=0.1).index
sampled_projection = transformed.loc[indexes]
sampled_colors = rm.data.all.loc[indexes, "data_type"].map(colormap)
plt.scatter(sampled_projection[0], sampled_projection[1], c=sampled_colors, alpha=0.1);


# PCA on train and validation data only

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the train+validation rows only.
# NOTE(review): `name` is not used in the visible cells.
name = 'pca_train_val'
data = rm.data.train_val

pca = PCA(n_components=2)
transformed = pca.fit_transform(data[rm.data.features])
# Index the projection like the data so rows can be selected with .loc.
transformed = pd.DataFrame(transformed, index=data.index)

print(pca.explained_variance_ratio_)
print(pca.singular_values_.shape)

from joblib import dump, load
# NOTE(review): this writes to the same file as the PCA fitted on all data
# earlier in the notebook, overwriting it; confirm that is intended.
dump(pca, 'pca_fitted.joblib') 

In [None]:
# Reload the PCA saved to disk. The following cells keep using the in-memory
# `pca`, not `pca_x`.
pca_x = load('pca_fitted.joblib') 

In [None]:
# Show which data_type values exist in the selected data, then define the
# colour used for each of them.
data["data_type"].unique()
colormap = {'train': "blue", 
            'validation': "orange", 
            'test': "red"}

In [None]:
# Scatter every train/validation row (frac=1 only shuffles the order) in the
# two-component projection, coloured by data_type.
plt.figure(figsize=(15, 15))
indexes = data.sample(frac=1).index
shuffled_projection = transformed.loc[indexes]
shuffled_colors = data.loc[indexes, "data_type"].map(colormap)
plt.scatter(shuffled_projection[0], shuffled_projection[1], c=shuffled_colors, alpha=0.1);


In [None]:
# Project the combined dataset with the PCA fitted on train+validation.
# NOTE(review): `all` shadows the Python builtin for the rest of the session;
# the following cells read this name, so a rename has to cover them too.
all = rm.data.all
# pca.transform returns a plain array; the next cells index it as [:, k].
transformed = pca.transform(all[rm.data.features])

In [None]:
# Compare the number of rows in the data with the length of the first
# projected component.
all.shape
transformed[:,0].shape

In [None]:
# Combine era, data_type and the two projected components in one table.
projection_with_era = pd.DataFrame({
    'era': all['era'],
    'data_type': all['data_type'],
    'trans_0': transformed[:,0],
    'trans_1': transformed[:,1],
})

# Aggregate only the numeric projection columns: 'data_type' holds strings and
# mean()/std() over it raises a TypeError on pandas >= 2.0.
per_era = projection_with_era.groupby('era')[['trans_0', 'trans_1']]
means = per_era.mean()
stds = per_era.std()

# Most frequent data_type within each era, used to colour the points.
data_type = projection_with_era.groupby('era')['data_type'].agg(pd.Series.mode)

# One point per era: first the per-era mean, then the per-era standard
# deviation of the projection.
for stat_name, stat_df in (('mean', means), ('std', stds)):
    plt.figure(figsize=(15,15))
    plt.scatter(
        stat_df['trans_0'],
        stat_df['trans_1'],
        c=data_type.map(colormap),
    )
    plt.title(f'Per-era {stat_name} of the projection');

# t-SNE

In [None]:
import numpy as np

from sklearn.manifold import TSNE

# Embed the feature columns of the combined dataset with scikit-learn's t-SNE
# (fixed random_state).
tsne = TSNE(random_state=17)
transformed = tsne.fit_transform(rm.data.all[rm.data.features])

# Save the fitted t-SNE object to disk.
from joblib import dump, load
dump(tsne, 'tsne_fitted.joblib') 

# Index the embedding like the data so rows can be selected with .loc.
transformed = pd.DataFrame(transformed, index=rm.data.all.index)

In [None]:
from cuml.manifold import TSNE
# Same embedding computed with the cuml TSNE imported above; this overwrites
# `tsne` and `transformed` from the scikit-learn cell.
tsne = TSNE(n_components = 2)
transformed = tsne.fit_transform(rm.data.all[rm.data.features])

# Save the fitted t-SNE object under a separate file name.
from joblib import dump, load
dump(tsne, 'tsne_fitted_cuda.joblib') 

# Index the embedding like the data so rows can be selected with .loc.
transformed = pd.DataFrame(transformed, index=rm.data.all.index)


In [None]:
# Print the colours of one of matplotlib's qualitative colormaps.
cmap_names = ['Pastel1', 'Pastel2', 'Paired', 'Accent',
                        'Dark2', 'Set1', 'Set2', 'Set3',
                        'tab10', 'tab20', 'tab20b', 'tab20c']
cmap=plt.get_cmap(cmap_names[-3])
# A Colormap is callable, not iterable: calling it with an integer index
# returns that entry as an RGBA tuple, and cmap.N is the number of entries.
for color_index in range(cmap.N):
    c = cmap(color_index)
    print(c)

In [None]:
# Show which data_type values exist, then define the colour used for each
# (validation is green in this section).
rm.data.all["data_type"].unique()
colormap = {'train': "blue", 
            'validation': "green", 
            'test': "red"}

In [None]:
# Combine era, data_type and the two embedding components in one table.
projection_with_era = pd.DataFrame({
    'era': rm.data.all['era'],
    'data_type':rm.data.all['data_type'],
    'trans_0': transformed[0],
    'trans_1': transformed[1],
})

# Aggregate only the numeric embedding columns: 'data_type' holds strings and
# mean()/std() over it raises a TypeError on pandas >= 2.0.
per_era = projection_with_era.groupby('era')[['trans_0', 'trans_1']]
means = per_era.mean()
stds = per_era.std()

# Most frequent data_type within each era, used to colour the per-era points.
data_type = projection_with_era.groupby('era')['data_type'].agg(pd.Series.mode)

# Every row of the embedding, uncoloured.
plt.figure(figsize=(15,15))
plt.scatter(
    transformed[0],
    transformed[1],
    #c=data_type.map(colormap),
    #c=rm.data.all.loc[indexes]["era"],#.map(colormap),
    #alpha=0.1,
);

# One point per era: mean position in the embedding.
plt.figure(figsize=(15,15))
plt.scatter(
    means['trans_0'],
    means['trans_1'],
    c=data_type.map(colormap),
    #c=rm.data.all.loc[indexes]["era"],#.map(colormap),
    #alpha=0.1,
);

# One point per era: standard deviation within the embedding.
plt.figure(figsize=(15,15))
plt.scatter(
    stds['trans_0'],
    stds['trans_1'],
    c=data_type.map(colormap),
    #c=rm.data.all.loc[indexes]["era"],#.map(colormap),
    #alpha=0.1,
);

In [None]:
# Scatter a 10% sample of the rows belonging to two randomly chosen
# train/validation eras, coloured by era.
plt.figure(figsize=(15,15))

plot_eras = rm.data.train_val['era'].unique()

# replace=False guarantees two distinct eras; sampling with replacement could
# pick the same era twice.
plot_eras = np.random.choice(plot_eras, 2, replace=False)
print(plot_eras)
is_in_eras = rm.data.all['era'].isin(plot_eras)
# The earlier sample over the whole dataset was overwritten here before being
# used, so only this one is kept.
indexes = rm.data.all.loc[is_in_eras].sample(frac=0.1).index
print(len(indexes), 'of', len(rm.data.all))
print(len(plot_eras))
plt.scatter(
    transformed.loc[indexes][0],
    transformed.loc[indexes][1],
    #c=rm.data.all.loc[indexes]["data_type"].map(colormap),
    # NOTE(review): `c` must hold numbers or valid colours; confirm the dtype
    # of the 'era' column.
    c=rm.data.all.loc[indexes]["era"],#.map(colormap),
    #alpha=0.1,
);


In [None]:
# Play the notification sound to signal that execution reached the end.
make_noise()