In [None]:
# libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from typing import List, Dict, Optional
import numpy as np
from sklearn.model_selection import RepeatedKFold
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import time
import random
import lightgbm as lgb
import gc
import os
from sklearn.preprocessing import LabelEncoder
from numba import jit
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
import janestreet

In [None]:
# Read the full competition training set from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
# features = pd.read_csv('/kaggle/input/jane-street-market-prediction/features.csv')

# Example submission file; `sub` is not referenced again in the visible cells.
sub = pd.read_csv('/kaggle/input/jane-street-market-prediction/example_sample_submission.csv')

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    Integer columns (int16/int32/int64) are converted to the first of
    int8, int16, int32, int64 whose range contains the column's minimum and
    maximum; the range limits themselves are accepted. Float columns stay
    float32 only when the column dtype already has float32 precision and its
    finite range fits; every other float column is stored as float64, so no
    precision is dropped. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned on this same object.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered narrowest first so the first match is the smallest usable type.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the dtype limit still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Precision is a property of the column dtype, so it is read once
            # from the dtype instead of being recomputed for every element.
            keeps_float32 = (
                np.finfo(col_type).precision == float32_info.precision
                and c_min > float32_info.min
                and c_max < float32_info.max
            )
            df[col] = df[col].astype(np.float32 if keeps_float32 else np.float64)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not raise.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def eval_acc(y_true, y_pred):
    """
    Accuracy metric usable as a custom ``eval_metric`` for lgb.

    Predictions strictly greater than 0.5 count as class 1, everything else
    as class 0. The comparison and the averaging run as NumPy array
    operations rather than a Python-level loop over the elements, and both
    arguments may be any array-like (list, ndarray, Series).

    Returns a ``(name, value, is_higher_better)`` tuple:
    ``('accuracy', <fraction of matching labels>, True)``.
    """
    labels = np.asarray(y_true)
    predicted = (np.asarray(y_pred) > 0.5) * 1
    score = np.mean(labels == predicted)
    return 'accuracy', score, True

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Count how often each distinct value of feature_0 occurs.
train['feature_0'].value_counts()

In [None]:
# Histogram of the `date` column (how many rows fall in each date bin).
train['date'].plot.hist()

In [None]:
%%time
# Downcast numeric columns; reduce_mem_usage reassigns columns on the frame
# it receives and returns that same frame.
train = reduce_mem_usage(train)

In [None]:
# Keep only the rows whose weight is non-zero.
train = train.loc[train['weight'].ne(0)]

In [None]:
# Binary target: 1 where weight * resp is strictly positive, otherwise 0.
weighted_resp = train['weight'] * train['resp']
train['action'] = (weighted_resp > 0).astype(int)

In [None]:
# Class balance of the binary target.
train['action'].value_counts()

In [None]:
# Model inputs: every column whose name contains 'feature_', in frame order.
columns = []
for name in train.columns:
    if 'feature_' in name:
        columns.append(name)

In [None]:
# LightGBM hyper-parameters; unpacked as keyword arguments into lgb.LGBMClassifier.
params = dict(
    objective='binary',
    learning_rate=0.3,
    boosting_type='gbdt',
    metric='accuracy',
)

In [None]:
# Containers filled during cross-validation: per-fold score, fitted model,
# and the stacked per-fold feature importances.
scores, models = [], []
feature_importance = pd.DataFrame()

In [None]:
# Feature matrix (the selected feature columns) and the binary target.
X, y = train[columns], train['action']

In [None]:
# Unbind the full frame; the training cells below work from X and y only.
del train

In [None]:
# Stratified 5-fold split without shuffling, so each fold keeps the row order of X.
folds = StratifiedKFold(n_splits=5, shuffle=False)
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print(f'Fold {fold_n} started at {time.ctime()}')
    # X was built as train[columns], so it already holds exactly these columns;
    # slicing it directly avoids materialising an extra full copy of the
    # feature matrix for each of the two selections.
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    model = lgb.LGBMClassifier(**params, n_estimators=1000, n_jobs = 1)
    # eval_acc is reported on both the training and the validation split.
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
    # the legacy LightGBM interface; newer releases expect callbacks instead -
    # confirm against the installed version before upgrading.
    model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=eval_acc,
            verbose=10, early_stopping_rounds=10)
    # Hold-out accuracy of the hard class predictions.
    score = (model.predict(X_valid) == y_valid).mean()
    
    models.append(model)
    scores.append(score)

    fold_importance = pd.DataFrame()
    fold_importance["feature"] = columns
    fold_importance["importance"] = model.feature_importances_
    fold_importance["fold"] = fold_n + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    # Only the first fold is trained: the loop exits after one iteration, so
    # `scores` and `models` each receive a single entry per run of this cell.
    break

In [None]:
# Summarise the collected fold scores (mean and standard deviation).
cv_mean, cv_std = np.mean(scores), np.std(scores)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(cv_mean, cv_std))

In [None]:
# Rank features by their mean importance over the recorded folds and keep the
# 50 highest. The stored importances are used as-is (no rescaling), so
# re-running this cell never modifies `feature_importance`.
mean_importance = (
    feature_importance[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)
)
cols = mean_importance[:50].index

best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(16, 12))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False), ax=ax)
ax.set_title('LGB Features (avg over folds)');

In [None]:
# Create the competition environment and the iterator that yields the test
# set one batch at a time together with its prediction frame.
env = janestreet.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    batch_features = test_df[columns]
    # Start from zero, then accumulate each model's positive-class probability.
    sample_prediction_df['action'] = 0
    for model in models:
        positive_proba = model.predict_proba(batch_features)[:, 1]
        sample_prediction_df['action'] = sample_prediction_df['action'] + positive_proba
    # Average over the models and threshold at 0.5 to obtain the 0/1 action.
    sample_prediction_df['action'] = sample_prediction_df['action'] / len(models)
    is_positive = sample_prediction_df['action'] > 0.5
    sample_prediction_df['action'] = is_positive * 1
    env.predict(sample_prediction_df)