In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns


import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [None]:
## Credit:: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to the narrowest dtype that holds their values.

    The frame is modified in place and also returned.

    * Signed-integer columns are cast to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to the smallest of float16/float32/float64 whose
      *range* contains the min and max. Only the range is checked, so float16
      can lose precision; pass ``use_float16=False`` to stop at float32.
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, unsigned int, datetime, category, extension
      dtypes, ...) is left untouched.

    A column is never widened: if no candidate at or below its current width
    fits (e.g. the column is all-NaN or contains inf), it keeps its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise (mutated in place).
    use_float16 : bool, default True
        Whether float16 is an allowed target for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64) if use_float16 else (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int ('i') and float ('f') columns are downcast;
        # comparing bool/datetime/unsigned values against finfo limits is either
        # meaningless or raises.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            if np.dtype(candidate).itemsize > col_type.itemsize:
                break  # never widen a column
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails every comparison,
            # so such a column simply keeps its dtype.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # `keep_date_col` was dropped: it is deprecated in recent pandas and only
    # ever applied when `parse_dates` combines several columns into one,
    # which is not the case here.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
## Load the competition files. train / example_test go through import_data
## (which also prints the memory report); the two small files are read as-is.
print('-' * 80, 'df_train', sep='\n')
df_train = import_data("../input/jane-street-market-prediction/train.csv")

print('-' * 80, 'df_test', sep='\n')
df_test = import_data("../input/jane-street-market-prediction/example_test.csv")

df_feature = pd.read_csv("../input/jane-street-market-prediction/features.csv")
df_sub = pd.read_csv("../input/jane-street-market-prediction/example_sample_submission.csv")

In [None]:
## (rows, columns) of the loaded train and example-test frames
df_train.shape, df_test.shape

In [None]:
pd.set_option('display.max_columns', None) ## Display every column (no truncation)
pd.set_option('display.max_rows', None) ## Display every row (no truncation)
pd.set_option('display.float_format', lambda x: '%.3f' % x) ## Fixed 3-decimal floats, no scientific notation

In [None]:
## Percentage of missing values per column; only columns with at least one
## missing value are displayed.
percent_missing = df_train.isna().sum() / len(df_train) * 100
missing_value_df = pd.DataFrame(
    {
        'column_name': df_train.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df[missing_value_df['percent_missing'] != 0]

In [None]:
# ## Converting the datatypes to save the memory
# df_train.feature_0 = df_train.feature_0.astype(np.int8)
# df_train.date = df_train.date.astype(np.int16)
# df_train.ts_id = df_train.ts_id.astype(np.int32)

# for i in df_train:
#     if df_train[i].dtype == np.float64:
#         if (((df_train[i] < 0.0001) & (df_train[i] > -0.0001)).mean()) < 0.001:
#             df_train[i] = df_train[i].astype(np.float32)
            
# df_train.info()

In [None]:
## Drop every row that is missing at least one of the features listed in
## `param` (per the original note this removes almost 5% of the data).
param = ['feature_129', 'feature_127', 'feature_125', 'feature_123', 'feature_121', 'feature_118', 'feature_117',
         'feature_110', 'feature_93', 'feature_59', 'feature_58', 'feature_56', 'feature_55', 'feature_45', 'feature_31',
         'feature_21', 'feature_3']
## A single dropna over the whole subset keeps exactly the rows that the
## column-by-column loop kept, without copying the frame once per column.
df_train = df_train.dropna(axis = 0, subset = param)

df_train.shape

In [None]:
## Null count per column, computed once and reused below
nulls = df_train.isnull().sum()
print(nulls[nulls > 0].sort_values(ascending = False))

## We observe that fields like feature_7, feature_8, feature_17, feature_18, feature_27, feature_28, feature_72, feature_78,
## feature_84, feature_90, feature_96, feature_102, feature_108, feature_114 still need imputation

## Correlation matrix of the columns that still contain nulls
nulls_lst = nulls[nulls > 0].index.tolist()

df_train[nulls_lst].corr().style.background_gradient(cmap = 'viridis')

We will drop several of the columns listed above, since they are highly correlated with one another (correlation of about 75%).

In [None]:
## Remove the feature columns judged redundant from the correlation matrix above
df_train = df_train.drop(
    columns = [
        'feature_8', 'feature_18', 'feature_27',
        'feature_72', 'feature_84', 'feature_96',
        'feature_102', 'feature_108', 'feature_114',
    ]
)
df_train.shape

In [None]:
## Keep a checkpoint copy of the frame and list the columns that still have nulls
temp = df_train.copy()
temp.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
## Reset df_train from the `temp` checkpoint so the cells below can be re-run
## from a known state (requires the cell that creates `temp` to have run).
df_train = temp.copy()

In [None]:
## Replace the remaining null values with each column's median
features = [c for c in df_train.columns if 'feature' in c]
## numeric_only=True keeps this working if the frame ever holds a non-numeric
## (e.g. category) column; for an all-numeric frame the result is unchanged.
f_median = df_train.median(numeric_only = True)
df_train = df_train.fillna(f_median)

## Removing rows where weights = 0
df_train = df_train.query('weight > 0').reset_index(drop = True)

## Adding target column in train dataset: 1 when resp is positive, else 0
df_train['action'] = np.where(df_train['resp'] > 0, 1, 0)

df_train.shape

In [None]:
## Sanity check: columns that still contain nulls after imputation
df_train.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
## Splitting the dataset into train & validation sets (85% / 15%)
X = df_train[features]
Y = df_train['action']

## A fixed random_state makes the split, and everything fitted on it,
## reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size = 0.15, random_state = 42)

In [None]:
## Standardise the features; the scaler is fitted on the training split only
scalar = StandardScaler()
x_train_norm = scalar.fit_transform(x_train)

## Project onto 50 principal components, also fitted on the training split only
pca = PCA(n_components = 50)
pca.fit(x_train_norm)
x_train_transform = pca.transform(x_train_norm)

## The validation split reuses the already-fitted scaler and PCA
x_val_norm = scalar.transform(x_val)
x_val_transform = pca.transform(x_val_norm)

In [None]:
## Wrap the PCA-transformed splits and their labels in XGBoost DMatrix objects
train_final = xgb.DMatrix(data = x_train_transform, label = y_train)
val_final = xgb.DMatrix(data = x_val_transform, label = y_val)

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost booster and return its validation accuracy.

    Reads the module-level ``train_final`` / ``val_final`` DMatrix objects and
    the ``y_val`` labels.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``,
        ``learning_rate``, ``gamma`` and ``tree_method``.

    Returns
    -------
    float
        Accuracy of the rounded predictions on the validation set.
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 5, 10)
    # Sampled under XGBoost's own parameter name so the value is really used
    # by the booster and can be forwarded unchanged to XGBClassifier.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-0, log=True)
    gamma = trial.suggest_int('gamma', 0, 10)
    tree_method = trial.suggest_categorical('tree_method', ['auto', 'exact','approx', 'hist', 'gpu_hist'])

    params = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'gamma': gamma,
        'tree_method': tree_method,
        'objective': 'binary:logistic'
    }

    # With the native API the number of trees is `num_boost_round`; an
    # 'n_estimators' entry in `params` is not a booster parameter, so training
    # would silently fall back to the default number of rounds.
    bst = xgb.train(params, train_final, num_boost_round=n_estimators)
    pred = bst.predict(val_final)
    # binary:logistic yields probabilities; round them to 0/1 class labels.
    y_pred = np.rint(pred)

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

## objective() returns an accuracy, so the study maximises it.
study = optuna.create_study(direction = 'maximize')
## Run at most 25 trials, with a timeout of 600 (seconds, per Optuna's API).
study.optimize(objective, n_trials = 25, timeout = 600)

In [None]:
## Take the hyperparameters of the best trial as the final configuration.
## Every key is forwarded verbatim to XGBClassifier in the next cell, so the
## key names must match XGBoost's parameter names to have any effect.
trial = study.best_trial
best_params = trial.params
## NOTE: this overrides whatever tree_method the study selected.
best_params['tree_method'] = 'gpu_hist'
best_params['objective'] = 'binary:logistic'

In [None]:
## Final model: built from best_params and fitted on the PCA-transformed
## training split (the validation split is not used here).
xgb_classifier = xgb.XGBClassifier(**best_params)
xgb_classifier.fit(x_train_transform, y_train)

In [None]:
# Plot how the objective value (validation accuracy) evolves over the study's trials
fig = optuna.visualization.plot_optimization_history(study)
fig.show();

In [None]:
# Plot the relative importance of each tuned hyperparameter for the objective value
fig = optuna.visualization.plot_param_importances(study)
fig.show();

In [None]:
# We impute the missing values with the medians
def fillna_npwhere(array, values):
    """Return ``array`` with its NaN entries replaced by the matching entries of ``values``.

    If the sum of ``array`` is not NaN, the input object itself is returned
    unchanged; otherwise a new array is built with ``np.where``, with
    ``values`` broadcast against ``array``.
    """
    # Cheap pre-check: one reduction instead of building a full boolean mask.
    if not np.isnan(array.sum()):
        return array

    nan_mask = np.isnan(array)
    return np.where(nan_mask, values, array)

In [None]:
## `janestreet` is the competition's submission API module
## (presumably only importable inside the Kaggle competition environment - confirm).
import janestreet
env = janestreet.make_env()
## Iterator consumed by the prediction loop below, which unpacks each item as
## a (test dataframe, sample prediction dataframe) pair.
iter_test = env.iter_test()

In [None]:
## Predict an action for every batch served by the competition iterator
for df_test, sample_prediction_df in iter_test:
    wt = df_test.iloc[0].weight
    if wt != 0:
        # Same pipeline as in training: median imputation -> scaling -> PCA -> classifier
        test_filled = fillna_npwhere(df_test[features].values, f_median[features].values)
        test_scaled = scalar.transform(test_filled)
        test_components = pca.transform(test_scaled)
        sample_prediction_df.action = xgb_classifier.predict(test_components)
    else:
        # Zero weight: the model is skipped and the action is set to 0
        sample_prediction_df.action = 0
    env.predict(sample_prediction_df)