# 1. Import packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# Standard plotly imports
#import plotly as py
#import plotly.graph_objs as go
#import plotly.tools as tls
#from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
#import plotly.figure_factory as ff
import os
import warnings
warnings.filterwarnings("ignore")
#from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
import gc
import cudf

# 2. Import Data

In [None]:
%%time
# Load the competition training set with cuDF.
data = cudf.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv'
)
print("Data is loaded!")

In [None]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

In [None]:
# Replace every missing value with the -99 sentinel.
data = data.fillna(-99)

# NOTE(review): the feature list is captured *before* feature_x1 / feature_x2
# are created, so those two ratio columns never reach the model. They are also
# computed after the -99 fill, so sentinel values take part in the division.
# Confirm whether either is intended.
features = [col for col in data.columns if 'feature' in col]
data["feature_x1"] = data["feature_129"] / data["feature_127"]
data["feature_x2"] = data["feature_125"] / data["feature_123"]

#data = data.astype('float32')
# Keep only the rows whose weight is non-zero.
data = data[data.weight != 0]
# Binary target: 1 when resp is positive, 0 otherwise.
data['action'] = (data['resp'] > 0) * 1

# Split on date: dates up to and including 450 train, later dates validate.
train = data[data.date <= 450]
valid = data[data.date > 450]
# NOTE(review): `weights` is not used anywhere else in the visible notebook.
weights = train["weight"]

X_train = train[features]
y_train = train['action']

X_valid = valid[features]
y_valid = valid['action']

# Drop the large intermediate frames; `valid` is kept for later scoring.
del train, data
gc.collect()

In [None]:
# Report the (rows, columns) shape of each feature matrix.
print(f"train shape {X_train.shape}")
print(f"valid shape {X_valid.shape}")

# 3. XGBoost model

In [None]:
# Prepare data and train xgboost on GPU
# Booster parameters passed to xgb.train.
params = dict(
    objective='binary:logistic',
    max_depth=10,
    learning_rate=0.05,
    eval_metric='auc',
    colsample_bytree=0.7,
    random_state=2020,
    tree_method='gpu_hist',
)

# `missing` is an argument of the DMatrix constructor, not a booster
# parameter: when it is placed in `params`, xgb.train does not use it and the
# -99 sentinel is treated as an ordinary feature value. Declaring it here makes
# XGBoost handle -99 as a missing value.
dtrain = xgb.DMatrix(X_train, y_train, missing=-99)
dvalid = xgb.DMatrix(X_valid, y_valid, missing=-99)

In [None]:
# Fit the booster for at most 500 rounds, logging the metrics every 50 rounds.
# Both sets are evaluated; early stopping uses a patience of 100 rounds.
clf = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=[(dtrain, 'train'), (dvalid, 'eval')],
    early_stopping_rounds=100,
    verbose_eval=50,
)

# 4. Utility scoring function

In [None]:
def utility_scoring(df):
    """
    To get the utility score used in the challenge.

    For every distinct ``date`` the daily profit is
    ``p_i = sum(weight * resp * actionv)``. With ``n`` distinct dates::

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / n)
        u = min(max(t, 0), 6) * sum(p_i)

    Args:
        df: pandas DataFrame with the columns ``date``, ``weight``, ``resp``
            and ``actionv``. It is not modified.

    Returns:
        The utility ``u`` as a float. Returns 0.0 when ``df`` has no rows or
        when every daily profit is zero (e.g. no trade was taken), instead of
        dividing by zero.
    """
    from math import sqrt

    # Profit contributed by each row, then summed per date in a single pass
    # (grouping by a plain array avoids any index alignment).
    row_profit = df["weight"] * df["resp"] * df["actionv"]
    daily_profit = row_profit.groupby(df["date"].to_numpy()).sum()

    n_days = len(daily_profit)
    total_profit = float(daily_profit.sum())
    sum_squares = float((daily_profit ** 2).sum())

    # Degenerate input: t would be 0/0, and the utility of "no profit on any
    # day" is zero by definition of u.
    if n_days == 0 or sum_squares == 0:
        return 0.0

    t = total_profit / sqrt(sum_squares) * sqrt(250 / n_days)
    return min(max(t, 0), 6) * total_profit

In [None]:
# Scan candidate decision thresholds and keep the one with the highest utility
# on the validation set.
best_i = 0
best_u = 0

# The model output does not depend on the threshold, so run inference (and the
# 4-decimal rounding) once instead of once per candidate.
valid_pred = clf.predict(dvalid).round(4)

for i in [0.41, 0.45, 0.49, 0.5, 0.51, 0.52, 0.55, 0.6, 0.63, 0.65]:
    # 1 = take the trade when the rounded score is strictly above the threshold.
    valid['actionv'] = (valid_pred > i) * 1
    u = utility_scoring(valid.to_pandas())
    print(u)
    if u > best_u:
        best_u = u
        best_i = i

In [None]:
# Display the highest utility found by the threshold scan.
best_u

In [None]:
# Display the threshold that produced the highest utility.
best_i

In [None]:
# Apply the 0.51 decision threshold to the validation predictions.
# Round to 4 decimals, as the threshold scan does. A bare `.round()` would
# first collapse every score to 0 or 1, so the comparison would ignore the
# 0.51 cut-off and behave like a 0.5 threshold.
valid['actionv'] = (clf.predict(dvalid).round(4) > 0.51)*1

In [None]:
# Utility of the validation set with the `actionv` column currently stored on
# `valid`; the frame is converted to pandas before scoring.
utility_scoring(valid.to_pandas())

In [None]:
# Class balance of the true labels on the validation set.
valid['action'].value_counts()

In [None]:
# Class balance of the predicted actions on the validation set.
valid['actionv'].value_counts()