# Import packages

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import os
import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
import gc
import cudf

# Load training data

In [None]:
%%time
data = cudf.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
print ("Data is loaded!")

In [None]:
# Missing values are replaced by the sentinel -1 before any further processing.
data = data.fillna(-1)
# Model inputs are all columns whose name contains the substring 'feature'.
features = [col for col in data.columns if 'feature' in col]

# Cast every column to float32, then keep only rows with a non-zero weight.
data = data.astype('float32')
data = data[data['weight'] != 0]
# Binary target: 1 when resp is strictly positive, otherwise 0.
data['action'] = (data['resp'] > 0) * 1

# Time-based split: dates up to and including 450 train, later dates validate.
train = data[data['date'] <= 450]
valid = data[data['date'] > 450]
# Per-row training weights, taken from the 'weight' column of the train split.
weights = train['weight']

X_train, y_train = train.loc[:, features], train.loc[:, 'action']
X_valid, y_valid = valid.loc[:, features], valid.loc[:, 'action']

# Drop the names that are no longer needed (valid is kept) and run a collection.
del train, data
gc.collect()

In [None]:
# Preview the first rows of the training feature matrix; as the last expression
# of the cell, the result is what the notebook displays.
X_train.head()

# LightGBM model

In [None]:
# LightGBM is fed pandas objects here, so each cuDF frame/series is converted
# with .to_pandas() first. Only the training set carries per-row weights.
train_data = lgb.Dataset(
    data=X_train.to_pandas(),
    label=y_train.to_pandas(),
    weight=weights.to_pandas(),
)
valid_data = lgb.Dataset(
    data=X_valid.to_pandas(),
    label=y_valid.to_pandas(),
)

In [None]:
# LightGBM training configuration for the binary 'action' target.
# The former 'application' entry was removed: per the LightGBM parameter
# documentation it is an alias of 'objective', so the dict set the same option
# twice with the same value.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'max_depth': 9,
    'feature_fraction': 0.8,
    'learning_rate': 0.05,
    'feature_fraction_seed': 1991,  # fixed seed for the feature sub-sampling
    'device_type': "gpu",  # to use gpu
}


In [None]:
# Train for at most 1500 boosting rounds, evaluating on both the training and
# the validation Dataset, with early_stopping_rounds=100 and a log line every
# 30 rounds.
# NOTE(review): early_stopping_rounds / verbose_eval were removed from
# lgb.train in LightGBM 4.0 in favour of callbacks - confirm the installed
# version before upgrading.
model = lgb.train(
    params=parameters,
    train_set=train_data,
    num_boost_round=1500,
    valid_sets=[train_data, valid_data],
    early_stopping_rounds=100,
    verbose_eval=30,
)

# Local validation using utility score

In [None]:
# from https://www.kaggle.com/c/jane-street-market-prediction/discussion/200207
from math import sqrt

def utility_score(df):
    """Calculate utility score of a dataframe according to formulas defined at
    https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation

    For every date ``i`` the daily return is
    ``p_i = sum_j(weight_ij * resp_ij * actionv_ij)``. With
    ``t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / n_dates)`` the score is
    ``min(max(t, 0), 6) * sum(p_i)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'date', 'weight', 'resp' and 'actionv'.
        The frame is left untouched (no helper column is added to it).

    Returns
    -------
    float
        The utility score. ``0.0`` is returned when every daily return is
        zero (for example no trade is taken) or when ``df`` has no rows,
        instead of propagating NaN or dividing by zero.
    """
    # Compute per-row returns as a standalone Series so the caller's frame is
    # not mutated, then aggregate them per date.
    row_p = df['weight'] * df['resp'] * df['actionv']
    p_i = row_p.groupby(df['date']).sum()

    total_p = p_i.sum()
    sum_sq = (p_i ** 2).sum()
    if sum_sq == 0:
        # All daily returns are zero (or there are no dates): t is 0/0, and
        # the score is 0 by the formula's final multiplication with sum(p_i).
        return 0.0

    t = (total_p / sqrt(sum_sq)) * sqrt(250 / p_i.index.size)
    return min(max(t, 0), 6) * total_p

In [None]:
# Sweep candidate decision thresholds on the validation set and remember the
# one with the highest utility score. best_i / best_u stay at 0 when no
# threshold achieves a utility above 0.
best_i = 0
best_u = 0

# The predicted probabilities do not depend on the threshold, so score the
# validation features once instead of once per candidate.
valid_pred = model.predict(X_valid.to_pandas()).round(4)

for i in [0.41, 0.45, 0.49, 0.5, 0.51, 0.52, 0.55, 0.6, 0.63, 0.65]:
    # Act (1) on rows whose rounded prediction exceeds the threshold.
    valid['actionv'] = (valid_pred > i) * 1
    u = utility_score(valid.to_pandas())
    print(u)
    if u > best_u:
        best_u = u
        best_i = i

# Feature importance plot

In [None]:
# Pair every importance value returned by the model with its feature name,
# ordered ascending by (value, name).
feature_imp = pd.DataFrame(
    sorted(zip(model.feature_importance(), X_train.columns)),
    columns=['Value', 'Feature'],
)

plt.figure(figsize=(20, 10))
# Only the 30 most important features are drawn; drop .head(30) to see all.
top_features = feature_imp.sort_values(by="Value", ascending=False).head(30)
sns.barplot(x="Value", y="Feature", data=top_features)
# NOTE(review): the title mentions folds, but the importances come from the
# single `model` object used above - confirm the wording.
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
# Save before show(): once show() has run, the current figure may already be
# released, and a later savefig() would write an empty image.
plt.savefig('lgbm_importances-01.png')
plt.show()