# Imports

In [None]:
import optuna
import janestreet
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Dataset

In [None]:
%%time

# Load the full competition training table from the Kaggle input mount.
TRAIN_CSV = '/kaggle/input/jane-street-market-prediction/train.csv'
train = pd.read_csv(TRAIN_CSV)

# Report (rows, columns), then let the cell display the first rows.
print(train.shape)
train.head()

# Preprocess

In [None]:
# Drop every row whose weight is exactly zero.
has_weight = train['weight'] != 0
train = train[has_weight]

# Binary target: 1 where resp is strictly positive, 0 otherwise.
# An alternative labelling based on (weight * resp) > 0 was tried earlier
# and is intentionally not used here.
train['action'] = (train['resp'] > 0).astype(int)

In [None]:
# Model inputs are all columns whose name contains the substring 'feature';
# the label column is the binary 'action' built during preprocessing.
is_feature_column = ['feature' in column_name for column_name in train.columns]
features = train.columns[is_feature_column]
target = 'action'

# Show how many features were selected, plus the first two names and
# whatever sits from position 128 onward.
print(len(features))
print(features[:2], '...', features[128:])

In [None]:
# Replace every missing value in the frame with the constant 0.5; the
# submission loop applies the same fill to each test batch.
train = train.fillna(value=0.5)

# Total number of missing cells left after the fill.
remaining_nulls = train.isnull().sum().sum()
print(remaining_nulls)

# Modeling

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    train[target],
    test_size=0.2,
    random_state=666,
)

# Print the shapes in the order: train features, train labels,
# test features, test labels.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Disabled Optuna hyperparameter search. The whole cell is a bare string
# literal, so none of the code inside it runs.
#
# What the disabled code would do if re-enabled:
#   * create_model(trial) samples LightGBM hyperparameters from the trial and
#     returns an unfitted lgb.LGBMClassifier built from them;
#   * objective(trial) fits that model on X_train / y_train and returns the
#     ROC AUC of its predicted probabilities on X_test / y_test;
#   * a study maximises that AUC over 40 trials and prints the best params.
#
# NOTE(review): the hard-coded `params` dict in the following cell presumably
# comes from a past run of this search -- TODO confirm.
# NOTE(review): `tree_method = 'gpu_hist'` looks like an XGBoost option rather
# than a LightGBM one -- verify before re-enabling.
'''
def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 31)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 1.0)
    tree_method = 'gpu_hist'
    random_state = 666
    
    model = lgb.LGBMClassifier(
        num_leaves = num_leaves,
        n_estimators = n_estimators, 
        max_depth = max_depth, 
        min_child_samples = min_child_samples, 
        min_data_in_leaf = min_data_in_leaf,
        learning_rate = learning_rate,
        bagging_fraction = bagging_fraction,
        feature_fraction = feature_fraction,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        tree_method ='gpu_hist',
        random_state = 666)
    
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train)
    score = roc_auc_score(y_test.values, model.predict_proba(X_test)[:,1])
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
params = study.best_params

print(params)

'''

In [None]:
# Hyperparameters for the final LightGBM classifier; the training cell
# unpacks this dict into lgb.LGBMClassifier(**params).
#
# The former 'tree_method': 'gpu_hist' entry was removed: it is an XGBoost
# option, not a LightGBM parameter, so it never affected training.
#
# NOTE(review): per the LightGBM parameter docs, three pairs below are aliases
# of the same underlying setting and therefore conflict:
#   min_child_samples / min_data_in_leaf,
#   subsample / bagging_fraction,
#   colsample_bytree / feature_fraction.
# Only one value of each pair can take effect. Both are kept so the fitted
# model does not change -- confirm which value is intended and drop the other.
params = {
    'num_leaves': 30,
    'n_estimators': 275,
    'max_depth': 8,
    'min_child_samples': 171,
    'learning_rate': 0.5500902321095997,
    'min_data_in_leaf': 37,
    'bagging_fraction': 0.7341205951502766,
    'feature_fraction': 0.983688510908062,
    'subsample': 0.5509996432272407,
    'colsample_bytree': 0.11579699523545023,
    'random_state': 666,
}

In [None]:
%%time

# Hold-out evaluation. The score must come from a model that has never seen
# the X_test rows, so a separate classifier is fit on the training split only.
# (Fitting on all of `train` and then scoring on X_test, which is a subset of
# `train`, reports an inflated in-sample AUC.)
eval_cls = lgb.LGBMClassifier(**params)
eval_cls.fit(X_train, y_train)

y_proba = eval_cls.predict_proba(X_test)[:, 1]

print('Score: ', roc_auc_score(y_test, y_proba))

# Final model: refit on every available row. `cls` is the object used by the
# feature-importance plot and the submission loop further down.
# This second fit roughly doubles the cell's run time.
cls = lgb.LGBMClassifier(**params)
cls.fit(train[features], train[target])

In [None]:
# Gain-based feature importance of the fitted classifier, drawn on a large
# canvas and capped at 130 features.
fig, ax = plt.subplots(figsize=(30, 30))
lgb.plot_importance(
    cls,
    ax=ax,
    importance_type='gain',
    max_num_features=130,
)
plt.show()

# Submit

In [None]:
# Create the competition environment and obtain its test iterator; the
# submission loop below consumes `iter_test` and reports predictions through
# `env.predict`.
# NOTE(review): presumably the environment can only be created once per
# kernel session -- verify before re-running this cell.
env = janestreet.make_env()
iter_test = env.iter_test()

In [None]:
# Disabled earlier version of the submission loop. The cell is a bare string
# literal, so nothing in it runs. Unlike the active loop in the next cell,
# this version calls cls.predict for every test batch without checking the
# batch's weight first.
'''
%%time

for (test, sample_prediction) in iter_test:
    test = test.fillna(0.5)
    sample_prediction['action'] = cls.predict(test[features])
    env.predict(sample_prediction)
    
'''

In [None]:
%%time

# Walk the competition iterator and submit one prediction per batch.
for test, sample_prediction in iter_test:
    # Same missing-value treatment as the training data.
    test = test.fillna(0.5)

    # Only run the model when the weight is positive; any other batch gets
    # action 0. `.item()` requires the batch to hold exactly one weight value.
    has_positive_weight = test['weight'].item() > 0
    sample_prediction['action'] = (
        cls.predict(test[features]) if has_positive_weight else 0
    )

    env.predict(sample_prediction)