# Packages

In [None]:
import gc
import warnings

import datatable as dt
import janestreet
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

# Defining functions

In [None]:
def print_score(pred, y_test):
    """Print accuracy, precision, recall, F1 and ROC AUC for a set of predictions.

    Parameters
    ----------
    pred : array-like
        Model predictions. The same values are also handed to
        ``roc_auc_score`` as the score argument, so if these are hard labels
        the printed AUC is the AUC of those thresholded labels.
    y_test : array-like
        Ground-truth labels.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. Precision and
    # recall are not symmetric, so the ground truth must be passed first.
    print('accuracy: {:.4}'.format(accuracy_score(y_test, pred)))
    print('precision: {:.4}'.format(precision_score(y_test, pred)))
    print('recall: {:.4}'.format(recall_score(y_test, pred)))
    print('f1: {:.4}'.format(f1_score(y_test, pred)))
    print('auc: {:.4}'.format(roc_auc_score(y_test, pred)))

def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to narrower dtypes and report the saving.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are converted to the narrowest
      signed / unsigned integer type (8, 16 or 32 bit) whose range contains
      the column's min and max, bounds inclusive.
    * Float columns wider than 32 bits are converted to ``float32`` when
      their min and max fit in the float32 range. float16 is deliberately not
      used as a target. Values may lose precision in this conversion.
    * Every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    Columns whose min/max are NaN (for example all-NaN or empty columns) fail
    the range checks and are therefore left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    # Candidate integer targets, narrowest first, keyed by numpy dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'object':
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable ints, ...) are not plain
        # numpy dtypes; min/max or astype may not be valid for them, so skip.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in int_candidates:
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates[col_type.kind]:
                # Never "downcast" to a type that is not actually smaller.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # NaN or infinite extrema fail these comparisons, leaving the column as is.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Guard the percentage against a zero starting footprint.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction} % ")
    return df

def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels, forwarded to ``precision_recall_curve``.
    pred_proba_c1 : array-like
        Scores for the positive class, forwarded to ``precision_recall_curve``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    prec, rec, thr = precision_recall_curve(y_test, pred_proba_c1)

    # Trim both curves to the number of thresholds so they line up with the x-axis.
    n_thr = len(thr)
    curves = (
        (prec, {'linestyle': '--', 'label': 'precision'}),
        (rec, {'label': 'recall'}),
    )

    plt.figure(figsize=(8, 6))
    for values, line_kwargs in curves:
        plt.plot(thr, values[0:n_thr], **line_kwargs)

    # Place a tick every 0.1 across the current x-range, rounded to 2 decimals.
    x_lo, x_hi = plt.xlim()
    tick_positions = np.arange(x_lo, x_hi, 0.1)
    plt.xticks(np.round(tick_positions, 2))

    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

# Loading files

In [None]:
# Read the training CSV with datatable, then hand it over to pandas.
path = '/kaggle/input/jane-street-market-prediction/'
train = dt.fread(f'{path}train.csv').to_pandas()

In [None]:
# Summarise the freshly loaded frame before any dtype changes are made.
train.info()

In [None]:
# Peek at the first rows of the loaded training data.
train.head()

# Preprocessing data

In [None]:
# Downcast column dtypes (reduce_memory_usage prints the before/after
# footprint), then show the summary again to confirm the new dtypes.
train = reduce_memory_usage(train)
train.info()

In [None]:
# (total rows, rows with a strictly positive weight)
(train.shape[0], int((train['weight'] > 0).sum()))

In [None]:
# Binary target: 1 where the response is strictly positive, 0 everywhere else.
train['action'] = (train['resp'] > 0.0).astype('int64')
# Names of the 130 model input columns: feature_0 ... feature_129.
features = [f'feature_{idx}' for idx in range(130)]

In [None]:
# Display the frame to inspect the newly added `action` column.
train

In [None]:
# Keep only the rows whose `date` is greater than 85, then drop the
# reference to the full frame so its memory can be reclaimed.
train_df = train.loc[train['date'].gt(85)]
del train

In [None]:
# Model inputs (the feature columns) and the binary target.
train_data, train_target = train_df[features], train_df['action']

In [None]:
# Sanity check: feature matrix and target should have the same number of rows.
print(train_data.shape, train_target.shape)

In [None]:
# Peek at the first rows of the feature matrix.
train_data.head()

In [None]:
# Class balance of the binary target.
train_target.value_counts()

# Modeling

### Model selection

In [None]:
# Hyperparameters for LGBMClassifier (scikit-learn API names).
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': -1,       # -1 means no depth limit; tree size is bounded by num_leaves
    'num_leaves': 300,
    'subsample': 0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default of 0 the `subsample` value is ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
#     'reg_lambda': 0.3,
#     'reg_alpha': 0.3
}

In [None]:
# Build the classifier from the `lgb_params` dictionary defined in the
# preceding cell; without unpacking it here the model would silently train
# with LightGBM's default hyperparameters.
lgb_clf = LGBMClassifier(n_jobs=-1, **lgb_params)
lgb_clf.fit(train_data, train_target)

In [None]:
# Pair each importance reported by the fitted model with its column name.
ftr_importances_values = lgb_clf.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index=train_data.columns)
# NOTE: despite the "top20" name, this keeps the 30 largest importances,
# which is what the plot title advertises.
ftr_top20 = ftr_importances.sort_values(ascending=False).head(30)

# Horizontal bar chart, most important feature first.
plt.figure(figsize=(8, 6))
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.title('Feature importances top 30')
plt.show()

# Submission

In [None]:
env = janestreet.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set

# Each iteration yields a test batch and the prediction frame to fill in.
for test_df, sample_prediction_df in iter_test:
    # Use bracket assignment so the `action` column is always written;
    # attribute-style assignment only sets a plain Python attribute when the
    # column does not already exist.
    sample_prediction_df['action'] = lgb_clf.predict(test_df[features])
    env.predict(sample_prediction_df)