In [None]:
import cudf as cu
import joblib


import numpy as np 
import pandas as pd 
import matplotlib
import cudf 
import time
from tqdm.notebook import tqdm
matplotlib.rc('image', cmap='Greys')

from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.inspection import plot_partial_dependence


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


import xgboost as xgb 
import optuna 

from IPython.display import Image, display_svg, SVG

pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

In [None]:
path = Path('../input/jane-street-market-prediction')
out_path = Path('./')

In [None]:
%%time
# Read only the first 200,000 rows of train.csv with cuDF, then convert the
# result to a pandas DataFrame; every later cell works on the pandas copy.
train_cudf  = cudf.read_csv(path/'train.csv', nrows=200000)
df = train_cudf.to_pandas()
# Drop the reference to the cuDF frame once the pandas copy exists.
del train_cudf
# Auxiliary competition files, loaded with pandas.
# NOTE: the name `features` is rebound to a list of column names in a later
# cell, so this DataFrame is not reachable under that name afterwards.
features = pd.read_csv(path/'features.csv')
example_test = pd.read_csv(path/'example_test.csv')
sample_prediction_df = pd.read_csv(path/'example_sample_submission.csv')
print ("Data is loaded!")

In [None]:
print('train shape is {}'.format(df.shape))
print('features shape is {}'.format(features.shape))
print('example_test shape is {}'.format(example_test.shape))
print('sample_prediction_df shape is {}'.format(sample_prediction_df.shape))

In [None]:
#  Loading Dataset 
df.head()

In [None]:
list(df.columns)

In [None]:
df = df[df['weight'] != 0].reset_index(drop = True) 

In [None]:
df.shape

In [None]:
# Name of the target (dependent variable) column.
dep_var = 'action'
# Binary target: 1 where resp * weight is strictly positive, 0 otherwise,
# stored as a pandas categorical column.
df[dep_var] = (((df['resp']* df['weight'])>0)*1).astype('category')

In [None]:
df = df.loc[:, ~df.columns.str.contains('resp')]

In [None]:
df.head()

In [None]:
df.shape

In [None]:
def augment_df(df, feature_cols=None):
    """Return a copy of `df` extended with engineered features.

    For every base feature column two element-wise columns are added:
    `sq_<name>` (the square) and `repo_<name>` (the reciprocal, with 0 mapped
    to 0). Row-wise summary statistics over the base features follow
    (`min`, `mean`, `max`, `median`, `std`, `var`, `abs_mean`, `abs_median`,
    `abs_std`, `skew`, `kurt`), plus `sq_kurt`, the row-wise kurtosis of the
    squared features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the base feature columns. Not modified.
    feature_cols : iterable of str, optional
        Names of the base feature columns. When omitted, the notebook-level
        `features` variable is used; it is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the added ones.
    """
    cols = list(features if feature_cols is None else feature_cols)
    df = df.copy()
    for feature in cols:
        col = df[feature]
        df[f'sq_{feature}'] = col ** 2
        # Vectorised reciprocal: zeros are masked out before dividing (so no
        # division by zero happens) and then mapped to 0; NaN stays NaN.
        nonzero = col != 0
        df[f'repo_{feature}'] = (1.0 / col.where(nonzero)).where(nonzero, 0.0)

    base = df[cols]
    abs_base = base.abs()
    df['min'] = base.min(axis=1)
    df['mean'] = base.mean(axis=1)
    df['max'] = base.max(axis=1)
    df['median'] = base.median(axis=1)
    df['std'] = base.std(axis=1)
    df['var'] = base.var(axis=1)
    df['abs_mean'] = abs_base.mean(axis=1)
    df['abs_median'] = abs_base.median(axis=1)
    df['abs_std'] = abs_base.std(axis=1)
    df['skew'] = base.skew(axis=1)
    df['kurt'] = base.kurt(axis=1)
    # Kurtosis of the *squared* features. Previously this was computed over
    # the raw feature columns, which made it an exact duplicate of `kurt`.
    df['sq_kurt'] = df[[f'sq_{feature}' for feature in cols]].kurt(axis=1)
    return df

In [None]:
features = [feature for feature in df.columns if 'feature' in feature]
len(features)

In [None]:
df = augment_df(df)

In [None]:
df.shape

In [None]:
# Every column except the row identifier (the target `action` is kept).
features_all = df.columns.tolist()
features_all.remove('ts_id')
# Model inputs only: additionally leave out the target column.
features_keep = features_all.copy()
features_keep.remove('action')

print(len(features_all), len(features_keep))

In [None]:
# Impute missing values with each numeric column's median. `numeric_only=True`
# keeps non-numeric columns (the categorical target) out of the median
# computation, which newer pandas versions would otherwise reject.
df = df.fillna(df.median(numeric_only=True))

In [None]:
list(df.isnull().sum())

In [None]:
# Split the dataset's columns into continuous and categorical variables.
def cont_cat_split(df, max_card=20, dep_var=None):
    "Return `(cont_names, cat_names)`: the column labels of `df` split into continuous and categorical variables."
    excluded = L(dep_var)
    cont_names, cat_names = [], []
    for label in df:
        if label in excluded:
            continue
        dtype = df[label].dtype
        if pd.api.types.is_float_dtype(dtype):
            # Float columns are always continuous.
            is_cont = True
        elif pd.api.types.is_integer_dtype(dtype):
            # Integer columns are continuous only above `max_card` distinct values.
            is_cont = df[label].unique().shape[0] > max_card
        else:
            is_cont = False
        (cont_names if is_cont else cat_names).append(label)
    return cont_names, cat_names


cont_nn,cat_nn = cont_cat_split(df.loc[:,features_all], max_card=9000, dep_var=dep_var)

In [None]:
cat_nn, cont_nn

In [None]:
def create_train_ds(df, cat, cont, y_name, splits):
    """Build a fastai `TabularPandas` object and float32 train/validation arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical, continuous and target columns.
    cat, cont : list of str
        Names of the categorical and continuous input columns.
    y_name : str
        Name of the target column.
    splits : tuple or None
        Train/validation index split passed on to `TabularPandas`. When `None`,
        a random split with 10% validation rows is created.

    Returns
    -------
    tuple
        `(to, xs, valid_xs, y, valid_y)`: the `TabularPandas` object followed by
        the training inputs, validation inputs, training target and validation
        target, each cast to `np.float32`.
    """
    # Honour the caller's split; previously the argument was overwritten by a
    # fresh random split on every call.
    if splits is None:
        splits = RandomSplitter(valid_pct=0.1)(range_of(df))
    procs = [Categorify, FillMissing, Normalize]
    # Use the `y_name` argument rather than the notebook-level `dep_var`.
    to = TabularPandas(df, procs, cat, cont, y_names=y_name, splits=splits)

    xs, y = to.train.xs.astype(np.float32), to.train.y.astype(np.float32)
    valid_xs, valid_y = to.valid.xs.astype(np.float32), to.valid.y.astype(np.float32)

    print(f"length of train {len(to.train)} and legnth of valid {len(to.valid)}")

    return to, xs, valid_xs, y, valid_y

# old parameters n_estimators=40, max_samples=50_000
def rf(xs, y, n_estimators=40, max_samples=5_000, max_features='auto', min_samples_leaf=5, **kwargs):
    """Fit and return a scikit-learn random forest classifier.

    The forest uses all cores (`n_jobs=-1`), the entropy split criterion and
    out-of-bag scoring. Extra keyword arguments are accepted but not forwarded
    to the classifier.
    """
    # Imported here because the notebook's import cell does not bring
    # `RandomForestClassifier` into scope.
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        criterion='entropy',
        oob_score=True,
    )
    return model.fit(xs, y)

def m_accuracy(m, xs, y):
    "Accuracy of model `m`'s predictions on `xs` against the labels in `y`."
    actual = y.values
    predicted = m.predict(xs)
    return accuracy_score(actual, predicted)

def roc_auc(m, xs, valid_xs, y, valid_y):
    """Plot the ROC curve of classifier `m` on the validation set.

    The area shown in the legend is computed from the predicted probability of
    the positive class, i.e. from the same scores the curve is drawn from.
    (It was previously computed from hard class predictions, which does not
    match the plotted curve.)

    `xs` and `y` are accepted for signature compatibility and are not used.
    Nothing is returned; the figure is shown with `plt.show()`.
    """
    # Column 1 of `predict_proba` holds the positive-class probability.
    pos_scores = m.predict_proba(valid_xs)[:, 1]
    auc = roc_auc_score(valid_y, pos_scores)
    fpr, tpr, _ = roc_curve(valid_y, pos_scores)
    plt.figure()
    plt.plot(fpr, tpr, label='(area = %0.2f)' % auc)
    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

def rf_cu(xs, y, n_bins=16, n_estimators=40, max_depth=16, max_features=1.0,
         n_streams=1, split_criterion=0, **kwargs):
    "Construct a `cusRandomForestClassifier` with the given settings and return the result of fitting it on `xs`, `y`."
    # NOTE(review): `cusRandomForestClassifier` is not imported anywhere in the
    # visible notebook; presumably a GPU random forest class - confirm.
    # Extra keyword arguments are accepted but not forwarded.
    settings = dict(
        n_bins=n_bins,
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        n_streams=n_streams,
        split_criterion=split_criterion,
    )
    model = cusRandomForestClassifier(**settings)
    return model.fit(xs, y)

def _feature_importance(m, df):
    return pd.DataFrame({'cols':df.columns,'imp':m.feature_importances_}).sort_values('imp',
                                                                                     ascending=False)

def plot_feature_importance(fi):
    "Draw `fi` as a horizontal bar chart of `imp` per `cols` and return the value of `fi.plot`."
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(15, 15), legend=False)

In [None]:
splits = RandomSplitter(valid_pct=0.1)(range_of(df))

procs  = [Categorify, FillMissing, Normalize] 

In [None]:
to_nn,xs_nn,valid_xs_nn,y_nn,valid_y_nn = create_train_ds(df.loc[:,features_all], cat_nn, cont_nn, dep_var, splits)

In [None]:
to_nn.show(3)

In [None]:
# saving tabular objects 
#save_pickle(out_path/'to.pkl', to)

In [None]:
# loading tabular object
#to = load_pickle(out_path/'to.pkl')

## fastai: train a tabular neural network and use its predictions as an input feature

In [None]:
dls_nn = to_nn.dataloaders(bs=1024)

In [None]:
dls_nn.show_batch()

In [None]:
roc_auc = RocAucBinary()

In [None]:
learn = tabular_learner(dls_nn,layers=[500,25], metrics=[accuracy,roc_auc])

In [None]:
learn.lr_find()

In [None]:
# Callbacks passed to every `fit_one_cycle` call below:
#   - GradientAccumulation with its default settings,
#   - EarlyStoppingCallback, SaveModelCallback and ReduceLROnPlateau, all
#     monitoring 'accuracy' with `np.greater` as the comparison (higher is
#     better) and a minimum improvement of 0.01; patience is 4 epochs for early
#     stopping and 2 for the learning-rate reduction.
# NOTE(review): the same callback instances are reused across several fits and
# two learners - confirm this is intended.
cbs = [GradientAccumulation(),
       EarlyStoppingCallback(monitor='accuracy', comp=np.greater, min_delta=0.01, patience=4), 
       SaveModelCallback(monitor='accuracy', comp=np.greater, min_delta=0.01),
       ReduceLROnPlateau(monitor='accuracy', comp=np.greater, min_delta=0.01, patience=2)]

In [None]:
learn.fit_one_cycle(20, 1e-3, cbs=cbs)

In [None]:
learn = tabular_learner(dls_nn,layers=[1000,200,15], metrics=[accuracy,roc_auc])

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(20, 1e-2, cbs=cbs)

In [None]:
learn.fit_one_cycle(20,slice(1e-03),wd = 0.0001, cbs=cbs)

In [None]:
preds, targs = learn.get_preds()

In [None]:
cm = confusion_matrix(valid_y_nn,to_np(preds.argmax(dim=-1)))
cm

In [None]:
test_dl =  dls_nn.test_dl(df.drop('action', axis=1))
preds = learn.get_preds(dl=test_dl)

Merging the neural network's predictions into the dataset as a new feature

In [None]:
# Store the network's predicted class as a new column. A plain NumPy array is
# assigned positionally, whereas wrapping it in a DataFrame makes pandas align
# on the index and yields NaN wherever `df`'s index is not 0..n-1.
df['feature_fastai'] = to_np(preds[0].argmax(dim=-1))

In [None]:
list(df.isnull().sum())

In [None]:
# Every column except the row identifier (the target `action` is kept).
features_all_ = df.columns.tolist()
features_all_.remove('ts_id')
# Model inputs only: additionally leave out the target column.
features_keep_ = features_all_.copy()
features_keep_.remove('action')

print(len(features_all_), len(features_keep_))

## XGBoost

In [None]:
# using random forest as baseline
# old parameters n_estimators=40, max_samples=50_000
def rf(xs, y, n_estimators=40, max_samples=5_000, max_features='auto', min_samples_leaf=5, **kwargs):
    """Fit and return a scikit-learn random forest classifier.

    The forest uses all cores (`n_jobs=-1`), the entropy split criterion and
    out-of-bag scoring. Extra keyword arguments are accepted but not forwarded
    to the classifier.
    """
    # Imported here because the notebook's import cell does not bring
    # `RandomForestClassifier` into scope.
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        criterion='entropy',
        oob_score=True,
    )
    return model.fit(xs, y)

def m_accuracy(m, xs, y):
    "Accuracy of model `m`'s predictions on `xs` against the labels in `y`."
    actual = y.values
    predicted = m.predict(xs)
    return accuracy_score(actual, predicted)

def roc_auc(m, xs, valid_xs, y, valid_y):
    """Plot the ROC curve of classifier `m` on the validation set.

    The area shown in the legend is computed from the predicted probability of
    the positive class, i.e. from the same scores the curve is drawn from.
    (It was previously computed from hard class predictions, which does not
    match the plotted curve.)

    `xs` and `y` are accepted for signature compatibility and are not used.
    Nothing is returned; the figure is shown with `plt.show()`.

    Note: the name `roc_auc` is bound to a `RocAucBinary()` metric in the
    fastai section of this notebook, so this definition rebinds it to the
    plotting helper for the cells that follow.
    """
    # Column 1 of `predict_proba` holds the positive-class probability.
    pos_scores = m.predict_proba(valid_xs)[:, 1]
    auc = roc_auc_score(valid_y, pos_scores)
    fpr, tpr, _ = roc_curve(valid_y, pos_scores)
    plt.figure()
    plt.plot(fpr, tpr, label='(area = %0.2f)' % auc)
    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

def rf_cu(xs, y, n_bins=16, n_estimators=40, max_depth=16, max_features=1.0,
         n_streams=1, split_criterion=0, **kwargs):
    "Construct a `cusRandomForestClassifier` with the given settings and return the result of fitting it on `xs`, `y`."
    # NOTE(review): `cusRandomForestClassifier` is not imported anywhere in the
    # visible notebook; presumably a GPU random forest class - confirm.
    # Extra keyword arguments are accepted but not forwarded.
    settings = dict(
        n_bins=n_bins,
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        n_streams=n_streams,
        split_criterion=split_criterion,
    )
    model = cusRandomForestClassifier(**settings)
    return model.fit(xs, y)

def _feature_importance(m, df):
    return pd.DataFrame({'cols':df.columns,'imp':m.feature_importances_}).sort_values('imp',
                                                                                     ascending=False)

def plot_feature_importance(fi):
    "Draw `fi` as a horizontal bar chart of `imp` per `cols` and return the value of `fi.plot`."
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(15, 15), legend=False)

In [None]:
X = df.loc[:,features_keep_]
y =df.loc[:, 'action']

X.shape, y.shape

In [None]:
features_ = X.columns

In [None]:
# NOTE(review): `Series.astype` returns a new Series and none of these results
# is assigned, so `X` is left unchanged and the four columns keep their
# original dtypes. If categorical dtypes are actually wanted, the results must
# be assigned back - confirm that the downstream XGBoost calls accept them
# before doing so.
X["feature_fastai"].astype('category')
X["date"].astype('category')
X["feature_0"].astype('category')
X["sq_feature_0"].astype('category');

In [None]:
# Hold out 10% of the rows for validation, with a fixed seed.
# NOTE: `y` is rebound here from the full target to its training part only, so
# re-running this cell on its own pairs the full `X` with the shortened `y`.
# NOTE(review): `feature_fastai` comes from a network trained on a different
# random split, so rows in this validation set may have been seen by that
# network during training - confirm whether the resulting scores are inflated.
xs, valid_xs, y, valid_y = train_test_split(X, y, test_size=0.1, random_state = 42)

In [None]:
# Native XGBoost data containers for the training and validation parts.
dtrain = xgb.DMatrix(xs, label=y)
dvalid = xgb.DMatrix(valid_xs, label=valid_y)

def objective(trial):
    """Optuna objective: train a booster with sampled hyper-parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Accuracy of the rounded predictions on the validation set
        (higher is better).
    """
    # `n_estimators` is a parameter of the scikit-learn wrapper. The native
    # `xgb.train` takes the number of trees as `num_boost_round`, so the sampled
    # value is passed there instead of being put into `params`, where it had no
    # effect. Trials therefore train the sampled number of rounds and take
    # correspondingly longer.
    n_estimators = trial.suggest_int('n_estimators', 400, 600)

    # Hyper-parameter search space plus the fixed settings.
    params = {'max_depth': trial.suggest_int('max_depth', 10, 20),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, .1),
              'subsample': trial.suggest_uniform('subsample', 0.50, 1),
              'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
              'gamma': trial.suggest_int('gamma', 0, 10),
              'tree_method': 'gpu_hist',
              'objective': 'binary:logistic'}

    booster = xgb.train(params, dtrain, num_boost_round=n_estimators)
    # `binary:logistic` yields probabilities; round them to 0/1 class labels.
    preds = np.rint(booster.predict(dvalid))

    return accuracy_score(valid_y, preds)



In [None]:
# The objective returns validation accuracy, so the study has to maximise it;
# `create_study()` minimises by default, which would select the worst trial.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

In [None]:
# Hyper-parameters sampled by the study's best trial.
best_params = study.best_trial.params
# Add the fixed settings that the objective used but that were not sampled
# through the trial, and are therefore absent from `params`.
best_params['tree_method'] = 'gpu_hist'
best_params['objective'] = 'binary:logistic'

In [None]:
clf = xgb.XGBClassifier(**best_params)
%time clf.fit(xs, y)
m_accuracy(clf, xs, y), m_accuracy(clf, valid_xs, valid_y)

In [None]:
cm = confusion_matrix(valid_y,  clf.predict(valid_xs))
cm

In [None]:
roc_auc(clf, xs, valid_xs, y, valid_y)

In [None]:
fi = _feature_importance(clf, xs)
fi[:15]

In [None]:
plot_feature_importance(_feature_importance(clf, xs)[:30]);

In [None]:
fig, ax = plt.subplots(figsize=(12,4))
plot_partial_dependence(clf, valid_xs, ['feature_fastai','date', 'feature_107','feature_116'],
                        grid_resolution=20, ax=ax);

In [None]:
#xs_imp = xs.drop(['feature_fastai'], axis=1)
#valid_xs_imp = valid_xs.drop(['feature_fastai'], axis=1)

In [None]:
#clf_imp = xgb.XGBClassifier(**best_params)
#%time clf_imp.fit(xs_imp, y)

In [None]:
#m_accuracy(clf_imp, xs_imp, y), m_accuracy(clf_imp, valid_xs_imp, valid_y)#

In [None]:
#cm = confusion_matrix(valid_y,  clf_imp.predict(valid_xs_imp))
#cm

In [None]:
#roc_auc(clf_imp, xs_imp, valid_xs_imp, y, valid_y)

In [None]:
#plot_feature_importance(_feature_importance(clf_imp, xs_imp)[:30]);

In [None]:
#fig, ax = plt.subplots(figsize=(12,4))
#plot_partial_dependence(clf_imp, valid_xs_imp, ['date','feature_43', 'feature_45','feature_39'],
#                        grid_resolution=20, ax=ax);

In [None]:
# Second classifier with hand-picked hyper-parameters, for comparison with the
# Optuna-tuned `clf`.
# NOTE(review): `missing=-999` is set here, but the visible cells fill missing
# values with medians and never introduce a -999 sentinel - confirm it is needed.
clf_2 = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=11,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.7,
    missing=-999,
    random_state=2020,
    tree_method='gpu_hist'  # GPU histogram tree method
)

# Time the fit on the training part.
%time clf_2.fit(xs, y)

# Training accuracy and validation accuracy, as a tuple.
m_accuracy(clf_2, xs, y), m_accuracy(clf_2, valid_xs, valid_y)

In [None]:
%time cm = confusion_matrix(valid_y,  clf_2.predict(valid_xs))
cm

In [None]:
roc_auc(clf_2, xs, valid_xs, y, valid_y)

In [None]:
plot_feature_importance(_feature_importance(clf_2, xs)[:30]);

In [None]:
fig, ax = plt.subplots(figsize=(12,4))
plot_partial_dependence(clf_2, valid_xs, ['feature_fastai','feature_119', 'feature_107'],
                        grid_resolution=20, ax=ax);

In [None]:
cols_build = clf.get_booster().feature_names

# Submission

In [None]:
#example_test = augment_df(example_test)
#example_test.fillna(df.mean(), inplace=True)

#test_dl =  dls_nn.test_dl(example_test)
#preds = learn.get_preds(dl=test_dl)

#example_test['feature_fastai'] = pd.DataFrame(to_np(preds[0].argmax(dim=-1)))

#example_test["feature_fastai"].astype('category')
#example_test["date"].astype('category')
#example_test["feature_0"].astype('category')
#example_test["sq_feature_0"].astype('category');

#cols_build = clf.get_booster().feature_names
#preds = clf.predict(example_test[cols_build])

#example_test['action'] = preds

#to_keep = ['ts_id','action']

#example_test = example_test[to_keep]

#example_test.to_csv('submission.csv', columns=to_keep, index=False)

In [None]:
# NOTE: this rebinds `tqdm`, which the first cell imported from `tqdm.notebook`.
from tqdm import tqdm
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
# Imputation values for the model inputs, computed once before the loop.
# Previously `df.mean()` was recomputed over the whole training frame on every
# iteration. The median is used so that inference matches the median
# imputation applied to the training data.
fill_values = df[features_keep].median(numeric_only=True)

for (test_df, pred_df) in tqdm(iter_test):
    if test_df['weight'].item() > 0:
        # Same feature engineering as for training, restricted to the model
        # inputs. `fillna` returns a new frame, so the column added below is
        # not written into a slice of another frame.
        test_df = augment_df(test_df).loc[:, features_keep].fillna(fill_values)
        test_dl = dls_nn.test_dl(test_df)
        nn_preds = learn.get_preds(dl=test_dl)
        # Assign a plain array: a DataFrame would be aligned on the index and
        # give NaN whenever `test_df`'s index is not 0.
        test_df['feature_fastai'] = to_np(nn_preds[0].argmax(dim=-1))
        # Pass the columns in the order the booster was trained on.
        pred_df.action = clf.predict(test_df[cols_build])
    else:
        # Rows with a non-positive weight are given action 0 without running the models.
        pred_df.action = 0
    env.predict(pred_df)
        
        