* This competition is a binary classification problem
* This notebook is intended to give anyone a chance to copy and build off this approach
* It takes a very simple approach:
    * Keep only the most important feature information (via PCA)
    * Tune model parameters
    * Train LGBM
    * Submit

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
#plt.style.use('fivethirtyeight')
import xgboost as xgb
import sklearn

In [None]:
# Read the competition training set from the Kaggle input directory.
train_csv_path = '../input/jane-street-market-prediction/train.csv'
train = pd.read_csv(train_csv_path)

* PCA is used to reduce the features to a smaller set of principal components, discarding the directions that explain little variance
* Credit to: https://www.kaggle.com/wongguoxuan/eda-pca-xgboost-classifier-for-beginners

In [None]:
# Per-column medians of the training frame. They are kept in `train_median`
# because the inference loop reuses them to impute NaNs in unseen rows.
train_median = train.median()

# Replace every missing value with the median of its own column.
train = train.fillna(value=train_median)

In [None]:
# Keep only rows that have a positive weight and a date greater than 85.
has_positive_weight = train['weight'] > 0
is_after_cutoff = train['date'] > 85
train = train.loc[has_positive_weight & is_after_cutoff, :]

In [None]:
# Binary target: 0 when `resp` is negative, 1 otherwise (zero counts as 1).
resp_is_negative = train['resp'] < 0
train['action'] = (~resp_is_negative).astype(int)

# Model inputs are every column whose name contains "feature".
cols = [name for name in train.columns if 'feature' in name]

x = train[cols]
y = train['action']

In [None]:
# Drop the reference to the full training frame; later cells only use the
# extracted `x`, `y`, `cols` and the previously computed `train_median`.
del train

In [None]:
# Standardise the feature matrix. The fitted `scaler` is kept because the
# inference loop applies the same transform to the test rows.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

# Fit a full PCA (all components) purely to inspect how the variance is
# distributed across components.
pca = PCA()
comp = pca.fit(x)
cumulative_variance_ratio = np.cumsum(comp.explained_variance_ratio_)

# Cumulative explained-variance ratio against the number of components kept.
plt.plot(cumulative_variance_ratio)
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance')
plt.grid()
sns.despine();

In [None]:
# Keep the first 50 principal components and project `x` onto them. The same
# fitted `pca` object is reused on the test rows in the inference loop.
#
# `random_state` is fixed because, with n_components well below the feature
# count, scikit-learn's default `svd_solver='auto'` may choose the randomized
# SVD solver; without a seed the components (and everything trained on them)
# would differ slightly from run to run.
pca = PCA(n_components=50, random_state=42)
pca.fit(x)
x = pca.transform(x)

In [None]:
# Hold out 20% of the rows (shuffled, fixed seed) for tuning and early stopping.
# NOTE(review): `scaler` and `pca` were fitted on all of `x` before this split,
# so the held-out rows already influenced the preprocessing -- confirm this is
# acceptable for the validation scores reported below.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def objective(trial):
    """Optuna objective: fit LightGBM with sampled hyperparameters and score it.

    Parameters
    ----------
    trial
        Optuna trial used to sample the regularisation, tree-shape and
        sub-sampling hyperparameters.

    Returns
    -------
    float
        Accuracy of the rounded predictions on the module-level hold-out
        split (`x_test`, `y_test`).

    Notes
    -----
    LightGBM is configured with ``metric='auc'`` and
    ``early_stopping_rounds=10``, while the value returned to Optuna is
    accuracy, not AUC.
    """
    sampled = {
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    # Fixed settings first, then the sampled values (keys do not overlap).
    params = {'objective': 'binary', 'metric': 'auc', **sampled}

    # Fresh Dataset objects are built for every trial.
    train_set = lgb.Dataset(x_train, label=y_train)
    valid_set = lgb.Dataset(x_test, label=y_test)

    # Up to 10000 boosting rounds, with early stopping after 10 stale rounds.
    booster = lgb.train(
        params,
        train_set,
        10000,
        valid_sets=[train_set, valid_set],
        verbose_eval=-1,
        early_stopping_rounds=10,
    )

    # Round predicted probabilities to hard 0/1 labels before scoring.
    hard_labels = np.rint(booster.predict(x_test))
    return sklearn.metrics.accuracy_score(y_test, hard_labels)

In [None]:
if __name__ == "__main__":
    # TPE is Optuna's default sampler; it is passed explicitly only so it can
    # be seeded, which makes the sequence of proposed hyperparameters
    # repeatable across runs.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    # Stop after 25 trials or 1200 seconds, whichever comes first.
    study.optimize(objective, n_trials=25, timeout=1200)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    # `trial` is left at module scope: the next cell reads `trial.params`.
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("{}: {}".format(key, value))

In [None]:
# Start from the best trial's sampled hyperparameters and add the fixed
# settings that were not part of the search space.
best_params = trial.params
best_params.update({'objective': 'binary', 'metric': 'auc'})

trn_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_test, label=y_test)

# Same training recipe as inside `objective`: up to 10000 rounds with
# early stopping after 10 rounds.
lg_model = lgb.train(
    best_params,
    trn_data,
    10000,
    valid_sets=[trn_data, val_data],
    verbose_eval=-1,
    early_stopping_rounds=10,
)

In [None]:
def fillna_npwhere(array, values):
    """Replace NaN entries of `array` with the corresponding entries of `values`.

    Parameters
    ----------
    array : numpy.ndarray
        Numeric array that may contain NaNs.
    values : numpy.ndarray
        Replacement values, broadcast against `array` by `np.where`.

    Returns
    -------
    numpy.ndarray
        `array` itself (same object) when its sum is not NaN; otherwise a
        new array with every NaN position taken from `values`.
    """
    # A NaN anywhere makes the total NaN, so one reduction serves as the check.
    if not np.isnan(array.sum()):
        return array
    nan_mask = np.isnan(array)
    return np.where(nan_mask, values, array)

In [None]:
import janestreet
from tqdm.notebook import tqdm

env = janestreet.make_env()
iter_test = env.iter_test()

# The imputation medians never change between batches, so extract them once
# instead of re-indexing `train_median` on every iteration.
feature_medians = train_median[cols].values

for (test_df, sample_prediction_df) in iter_test:
    # Same preprocessing chain as training: impute -> scale -> PCA.
    features = fillna_npwhere(test_df[cols].values, feature_medians)
    components = pca.transform(scaler.transform(features))

    # Predict with the iteration chosen by early stopping.
    probabilities = lg_model.predict(
        components,
        num_iteration=lg_model.best_iteration,
    )

    # Act (1) when the predicted probability is at least 0.5, otherwise pass (0).
    sample_prediction_df.action = (probabilities >= 0.5).astype(int)

    env.predict(sample_prediction_df)