In [None]:
# --- Imports and notebook-wide configuration ---
import os
import numpy as np
import matplotlib.pyplot as plt

# cudf is used by the data-loading cell to read the CSV files before they are
# converted to pandas DataFrames
import cudf

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

import pandas as pd
# allow up to 500 columns to be shown when a DataFrame is displayed
pd.set_option('display.max_columns', 500)

import xgboost as xgb
# log the installed XGBoost version
print("XGBoost version:", xgb.__version__)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and pandas copy warnings - confirm that is intended
warnings.filterwarnings("ignore")

# create the competition environment; `env` is used by the inference cells
# further down (env.iter_test() yields the test batches, env.predict() submits them)
import janestreet
print('Creating competition environment...', end='')
env = janestreet.make_env()
print('Finished.')

In [None]:
def _csv_via_gpu(path):
    """Parse a CSV file with cudf and return the result as a pandas DataFrame."""
    # the intermediate cudf frame is only referenced inside this call, so it
    # is released as soon as the pandas copy has been returned
    return cudf.read_csv(path).to_pandas()


print('Reading datasets...', end='')

# pandas-only alternative:
#train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
train = _csv_via_gpu('/kaggle/input/jane-street-market-prediction/train.csv')

#features_meta = pd.read_csv('/kaggle/input/jane-street-market-prediction/features.csv')
features_meta = _csv_via_gpu('/kaggle/input/jane-street-market-prediction/features.csv')

#example_test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')
#sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')
print('Finished.')

# report the (rows, columns) of everything that was loaded
print(f'train shape: {format(train.shape)}')
print(f'features_meta shape: {format(features_meta.shape)}')
#print(f'example_test shape: {format(example_test.shape)}')
#print(f'sample_prediction_df shape:{format(sample_prediction_df.shape)}')

# display the first rows of the training data
# print(train.head())

In [None]:
# preview the first rows of the raw training frame (rich notebook display)
train.head()

In [None]:
print('Preprocessing data...', end='')

# model inputs: every training column whose name contains the substring 'feature'
features = [name for name in train.columns if 'feature' in name]

# # adjust features to use top half based on importance
# features = ['feature_43', 'feature_42', 'feature_45', 'feature_41', 'feature_44', 'feature_63', 'feature_61', 'feature_69', 'feature_6', 'feature_64', 'feature_5', 'feature_62', 'feature_7', 'feature_11', 'feature_20', 'feature_39', 'feature_60', 'feature_83', 'feature_3', 'feature_37', 'feature_1', 'feature_40', 'feature_4', 'feature_38', 'feature_27', 'feature_77', 'feature_119', 'feature_28', 'feature_120', 'feature_68', 'feature_95', 'feature_90', 'feature_66', 'feature_55', 'feature_121', 'feature_89', 'feature_84', 'feature_107', 'feature_114', 'feature_113', 'feature_71', 'feature_8', 'feature_124', 'feature_49', 'feature_101', 'feature_125', 'feature_102', 'feature_78', 'feature_57', 'feature_67', 'feature_65', 'feature_108', 'feature_70', 'feature_31', 'feature_48', 'feature_126', 'feature_18', 'feature_26', 'feature_96', 'feature_86', 'feature_116', 'feature_127', 'feature_22', 'feature_92', 'feature_51', 'feature_58', 'feature_12', 'feature_33', 'feature_53', 'feature_17', 'feature_104', 'feature_24', 'feature_110', 'feature_72', 'feature_36', 'feature_21', 'feature_35', 'feature_32', 'feature_25', 'feature_59', 'feature_34', 'feature_2', 'feature_93', 'feature_10', 'feature_117', 'feature_87', 'feature_47', 'feature_128', 'feature_98', 'feature_80', 'feature_50', 'feature_54', 'feature_79', 'feature_129', 'feature_9', 'feature_23', 'feature_19', 'feature_111', 'feature_115', 'feature_56', 'feature_30', 'feature_73', 'feature_122', 'feature_105', 'weight', 'feature_74', 'feature_14', 'feature_123', 'feature_29', 'feature_109', 'feature_82', 'feature_112', 'feature_76', 'feature_16', 'feature_99', 'feature_88', 'feature_85', 'feature_106', 'feature_118', 'feature_91', 'feature_46', 'feature_81', 'feature_75', 'feature_94', 'feature_52', 'feature_13', 'feature_103', 'feature_15', 'feature_100', 'feature_97', 'feature_0']
# features = features[:65]

# only train on rows with a non-zero weight
train = train[train['weight'] != 0]

# Split off the model inputs and the target, then release the full frame
# straight away so the imputation below only has to copy the feature columns
# (keeps peak memory below the allowed maximum).
X_train = train.loc[:, features]
# binarize the target: action is 1 when resp > 0 and 0 otherwise.
# Building the Series directly avoids writing a new column into a filtered
# frame (chained assignment).
y_train = (train['resp'] > 0).astype(int).rename('action')
del train

# Impute missing feature values with the per-column mean of the training rows.
# DataFrame.fillna returns a new frame rather than modifying in place, so the
# result has to be assigned back for the imputation to take effect.
# f_mean stays available so the same values can be reused on unseen data.
f_mean = X_train.mean()
X_train = X_train.fillna(f_mean)

print('Finished.')

In [None]:
#   X G B o o s t
#
# create, configure and train the classifier using GPU
# TODO: other tree_methods ?

print('Creating classifier...', end='')

# Hyper-parameters are gathered in one mapping so they are easy to scan and tweak.
xgb_settings = {
    'n_estimators': 5000,
    'max_depth': 12,
    'learning_rate': 0.02,
    'subsample': 0.8,
    'colsample_bytree': 0.4,
    # NOTE(review): -1 is declared as the missing-value marker, but nothing in
    # this notebook encodes missing values as -1 - confirm this is intended
    'missing': -1,
    'eval_metric': 'auc',
    # GPU histogram tree builder; for a CPU run use tree_method='hist'
    # (optionally with nthread=4) instead
    'tree_method': 'gpu_hist',
}

# Earlier configuration, kept for reference:
#     n_estimators=400, max_depth=7, eta=0.5 (learning_rate), missing=None,
#     random_state=42, tree_method='gpu_hist', subsample=0.8, colsample_bytree=1,
#     (sampling_method='gradient_based', eval_metric='logloss'), verbosity=2 (info)

# the estimator is only configured here; it is fitted in a later cell
clf = xgb.XGBClassifier(**xgb_settings)
print('Finished.')

In [None]:
# NOTE(review): second import cell. numpy, pandas, matplotlib, xgboost,
# sklearn.preprocessing and warnings were already imported in the first cell;
# consider consolidating all imports there.
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import eli5

# LightGBM (as `lgbm`) is the library used for the model trained below
import lightgbm as lgbm
import xgboost as xgb

from sklearn.linear_model import LogisticRegression, LinearRegression
# train_test_split is used for the train/validation split in the next cell
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

import warnings
# repeats the warning filter already installed in the first cell
warnings.filterwarnings("ignore")

In [None]:
# hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable between runs
Xtrain, Xval, Ztrain, Zval = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

# wrap both partitions in LightGBM datasets (inputs + binary target)
train_set = lgbm.Dataset(data=Xtrain, label=Ztrain, silent=False)
valid_set = lgbm.Dataset(data=Xval, label=Zval, silent=False)

In [None]:
# LightGBM hyper-parameters.
# NOTE(review): 'objective' is 'regression' while the target built earlier is a
# 0/1 action, so modelL.predict() returns a continuous score - confirm that a
# binary objective was not intended.
# NOTE(review): the LightGBM docs list 'subsample' as an alias of
# 'bagging_fraction'; both are set here with different values (0.8 vs 1) -
# confirm which one is meant to apply.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    bagging_fraction=1,
    max_bin=5000,
    bagging_freq=20,
    colsample_bytree=0.6,
    metric='rmse',
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=10,
    scale_pos_weight=1,
    zero_as_missing=True,
    seed=0,
)

# train for at most 1000 rounds, evaluating on the hold-out set; stop once the
# validation metric has not improved for 50 rounds and log every 10th round
modelL = lgbm.train(
    params,
    train_set=train_set,
    valid_sets=valid_set,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
# bar chart of the LightGBM feature importances on a single 15x15 inch axes
fig, axes = plt.subplots(figsize=(15, 15))
lgbm.plot_importance(modelL, ax=axes, height=0.5)
plt.show()
plt.close()

In [None]:
# one row per input feature together with its LightGBM importance score;
# `features` is the column order the model was trained on
feature_score = pd.DataFrame({
    'feature': features,
    'score_lgb': modelL.feature_importance(),
})

In [None]:
# display the per-feature LightGBM importance table
feature_score

In [None]:
# perform test and create submissions file
print('Creating submissions file...', end='')
rcount = 0
for (test_df, prediction_df) in env.iter_test():
    X_test = test_df.loc[:, features]
    y_preds = modelL.predict(X_test)
    prediction_df.action = y_preds
    env.predict(prediction_df)
    rcount += len(test_df.index)
print(f'Finished processing {rcount} rows.')

In [None]:
fig =  plt.figure(figsize = (15,15))
axes = fig.add_subplot(111)
xgb.plot_importance(modelx,ax = axes,height = 0.5)
plt.show();plt.close()

In [None]:
# XGBoost
# train
print('Training classifier...', end='')
%time clf.fit(X_train, y_train)
print('Finished.')

In [None]:
# copy of the last prediction frame left over from the LightGBM submission loop.
# NOTE(review): the next cell rebinds `xg_df` as its loop variable, so this
# copy appears to be unused - confirm whether the cell is still needed.
xg_df = prediction_df.copy()

In [None]:
# perform test and create submissions file (XGBoost classifier)
# NOTE(review): env.iter_test() has already been iterated by the LightGBM
# submission loop earlier in the notebook; the competition API presumably
# allows only a single pass - confirm and keep only one of the two loops.
print('Creating submissions file...', end='')
rcount = 0
for (test_df, xg_df) in env.iter_test():
    # use exactly the columns, in the same order, that the model was trained on
    X_test = test_df.loc[:, features]
    # predict with the XGBoost classifier and fill the frame yielded for THIS
    # batch (xg_df), not the stale prediction_df left over from the earlier loop
    y_preds = clf.predict(X_test)
    xg_df.action = y_preds.astype(int)
    env.predict(xg_df)
    rcount += len(test_df.index)
print(f'Finished processing {rcount} rows.')

In [None]:
# #   E X P E R I M E N T S

# # adjust parameters to test fit

# # eval_metric = ['logloss', 'error', 'error@0.6']
# # sampling_method=['uniform', 'gradient_based']
# # param_grid = dict(eval_metric=eval_metric, sampling_method=sampling_method)

# #kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# grid_search = GridSearchCV(clf, param_grid, scoring="neg_log_loss", n_jobs=1, cv=10, verbose=3, refit=False)
# grid_result = grid_search.fit(X_train, y_train)

# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
# 	print("%f (%f) with: %r" % (mean, stdev, param))
# # # plot results
# # scores = np.array(means).reshape(len(eval_metric), len(sampling_method))
# # for i, value in enumerate(sampling_method):
# #     plt.plot(eval_metric, scores[i], label='sampling_method: ' + str(value))
# # plt.legend()
# # plt.xlabel('sampling_method')
# # plt.ylabel('Log Loss')
# # plt.savefig('best_params.png')

In [None]:
# # available importance_types = [‘weight’, ‘gain’, ‘cover’, ‘total_gain’, ‘total_cover’]
# f = 'gain'
# top = len(features)  # or set constant to view only the most important
# fdata = clf.get_booster().get_score(importance_type= f)
# fsorted = dict(sorted(fdata.items(), key=lambda item: item[1], reverse=True))
# plt.figure(figsize=(10,top * 0.25))
# plt.barh(range(top), list(fsorted.values())[:top], align='center')
# plt.yticks(range(top), list(fsorted.keys())[:top])
# plt.show()
# #print(list(fsorted.keys()))
# #print(fsorted)