In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! conda install -c rapidsai cudf

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as s
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
import joblib
from dask.distributed import Client
import pickle
import time
import warnings
# Install a blanket "ignore" filter: from here on no warning is shown,
# which also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
%%time
train  = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
features = pd.read_csv('../input/jane-street-market-prediction/features.csv')
example_test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')
sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')
print ("Data is loaded!")

In [None]:
# Report (rows, columns) for each of the four loaded frames.
print(f'train shape is {train.shape}')
print(f'features shape is {features.shape}')
print(f'example_test shape is {example_test.shape}')
print(f'sample_prediction_df shape is {sample_prediction_df.shape}')

### Missing Values Count

In [None]:
# Number of missing (NaN) entries in each column of the training frame.
missing_values_count = train.isnull().sum()
print (missing_values_count)
# np.prod replaces np.product, which is deprecated and removed in NumPy 2.0.
total_cells = np.prod(train.shape)
total_missing = missing_values_count.sum()
# Share of all cells (rows x columns) that are missing, as a percentage.
print ("% of missing data = ",(total_missing/total_cells) * 100)

### Fill Missing Values

In [None]:
%%time
train = train.fillna(train.mean())

### Create Target Column

In [None]:
%%time
train['action'] = ((train['resp'] > 0) & (train['weight'] > 0)).astype('int')

### Drop unwanted Columns

In [None]:
%%time
train = train.drop(labels='date',axis=1)
train = train.drop(labels='weight',axis=1)
train = train.drop(labels='resp_1',axis=1)
train = train.drop(labels='resp_2',axis=1)
train = train.drop(labels='resp_4',axis=1)
train = train.drop(labels='resp',axis=1)

In [None]:
# Separate the binary target from the remaining columns (the feature matrix).
y = train['action']
X = train.drop(columns='action')

### Carry out Feature Scaling

In [None]:
%%time
X = StandardScaler().fit_transform(X)

### Identify Correlation between features

In [None]:
%%time
# Correlation matrix
corrmat = train.corr()
fig = plt.figure(figsize = (16, 16))

s.heatmap(corrmat, vmax = 1, square = True,annot=True,vmin=-1)
plt.show()

### Carry out Data Splitting

In [None]:
%%time
np.random.seed(10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)

In [None]:
# Unbind the full-size objects; only the train/test splits are used from here on.
del X, y
del train, features, example_test, sample_prediction_df

## Training

In [None]:
%%time     
def fitXgboostModel(X_train, X_test, y_train, y_test,algo_name,cv):
    """Train an XGBoost classifier, report hold-out metrics and pickle the model.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Hold-out features and labels used for the printed metrics.
    algo_name
        Path of the file the fitted model is pickled to.
    cv
        Accepted so existing calls keep working; not used by this function.

    Returns
    -------
    None
        Results are printed, plotted and written to ``algo_name``.
    """
    classifier = XGBClassifier(
        n_estimators=500,
        max_depth=11,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.7,
        # NOTE(review): entries equal to -999 are treated as missing values;
        # confirm the inputs actually use that sentinel.
        missing=-999,
        random_state=2020,
        # GPU histogram tree builder; presumably needs a CUDA-capable runtime.
        tree_method='gpu_hist')
    xgboost_model = classifier.fit(X_train, y_train)
    pred = xgboost_model.predict(X_test)
    cm = confusion_matrix(y_test, pred)
    print(pred)

    # The context manager closes the file even if pickling raises, so the
    # handle is never leaked and the data is flushed to disk.
    with open(algo_name, 'wb') as model_file:
        pickle.dump(xgboost_model, model_file)

    print('Classification Report :',classification_report(y_test,pred))
    print('Accuracy Score : ' + str(accuracy_score(y_test,pred)))
    print('Confusion Matrix : \n', cm)

    # Horizontal bar chart with one bar per entry of feature_importances_;
    # the bar's position is that entry's index.
    plt.figure(figsize=(12,12))
    plt.barh(range(len(xgboost_model.feature_importances_)),
             xgboost_model.feature_importances_)
    plt.show()

In [None]:
%%time
fitXgboostModel(X_train, X_test, y_train, y_test,'xgboost_norm_3',cv=5)