In [1]:
# importing required libraries
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import matplotlib as plt 
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import xgboost as xgb


In [2]:
# Load the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))

In [3]:
# Show the column labels of the training frame.
train.columns

Index(['INCIDENT_ID', 'DATE', 'X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7',
       'X_8', 'X_9', 'X_10', 'X_11', 'X_12', 'X_13', 'X_14', 'X_15',
       'MULTIPLE_OFFENSE'],
      dtype='object')

In [4]:
# Name of the column that holds the incident date.
col = 'DATE'

# Sorting a dataframe by time: https://stackoverflow.com/a/49702492/4084039
# Parse the date column into datetimes and order the training rows chronologically.
# NOTE: the train/validation split later in this notebook uses train_test_split, which
# shuffles by default, so this ordering does not decide which rows end up in which partition.
train = (
    train
    .assign(**{col: pd.to_datetime(train[col])})
    .sort_values(by=col)
)

train.head(2)

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15,MULTIPLE_OFFENSE
19022,CR_1141,1991-01-01,0,22,22,7,3,8,3,7,2,2,0,2.0,72,142,34,1
13104,CR_2248,1991-01-02,0,33,32,2,1,7,1,1,6,1,249,1.0,72,142,34,1


In [5]:
# Sorting a dataframe by time: https://stackoverflow.com/a/49702492/4084039
# Same treatment as the training frame: parse the date column, then order rows chronologically.
test = (
    test
    .assign(**{col: pd.to_datetime(test[col])})
    .sort_values(by=col)
)

test.head(2)

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
3319,CR_3153,1991-01-02,1,36,34,2,1,15,10,1,6,1,249,1.0,92,93,34
6576,CR_1413,1991-01-03,0,25,25,9,0,3,5,0,5,1,316,1.0,18,142,34


In [6]:
# Number of training rows per value of the target column.
# The two classes are highly imbalanced (see the counts displayed below).
train['MULTIPLE_OFFENSE'].value_counts()

1    22788
0     1068
Name: MULTIPLE_OFFENSE, dtype: int64

In [7]:
# Percentage of each target class, derived from the data instead of hard-coded counts so the
# figures stay correct if Train.csv changes. The target is highly imbalanced: the large
# majority of incidents have MULTIPLE_OFFENSE = 1.
target_counts = train['MULTIPLE_OFFENSE'].value_counts()
n_total = int(target_counts.sum())
n_yes = int(target_counts.get(1, 0))  # rows with MULTIPLE_OFFENSE == 1
n_no = int(target_counts.get(0, 0))   # rows with MULTIPLE_OFFENSE == 0
print("MULTIPLE_OFFENSE = YES: {}% and MULTIPLE_OFFENSE = NO: {}%".format((n_yes/n_total)*100, ((n_no/n_total)*100)))

MULTIPLE_OFFENSE = YES: 95.52313883299799% and MULTIPLE_OFFENSE = NO: 4.476861167002013%


In [8]:
# Summarise the training frame: index range, per-column non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23856 entries, 19022 to 14034
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   INCIDENT_ID       23856 non-null  object        
 1   DATE              23856 non-null  datetime64[ns]
 2   X_1               23856 non-null  int64         
 3   X_2               23856 non-null  int64         
 4   X_3               23856 non-null  int64         
 5   X_4               23856 non-null  int64         
 6   X_5               23856 non-null  int64         
 7   X_6               23856 non-null  int64         
 8   X_7               23856 non-null  int64         
 9   X_8               23856 non-null  int64         
 10  X_9               23856 non-null  int64         
 11  X_10              23856 non-null  int64         
 12  X_11              23856 non-null  int64         
 13  X_12              23674 non-null  float64       
 14  X_13              

In [9]:
# Summarise the test frame: index range, per-column non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15903 entries, 3319 to 2057
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   INCIDENT_ID  15903 non-null  object        
 1   DATE         15903 non-null  datetime64[ns]
 2   X_1          15903 non-null  int64         
 3   X_2          15903 non-null  int64         
 4   X_3          15903 non-null  int64         
 5   X_4          15903 non-null  int64         
 6   X_5          15903 non-null  int64         
 7   X_6          15903 non-null  int64         
 8   X_7          15903 non-null  int64         
 9   X_8          15903 non-null  int64         
 10  X_9          15903 non-null  int64         
 11  X_10         15903 non-null  int64         
 12  X_11         15903 non-null  int64         
 13  X_12         15776 non-null  float64       
 14  X_13         15903 non-null  int64         
 15  X_14         15903 non-null  int64         
 16  X_

In [10]:
# Keep the test incident IDs aside: the INCIDENT_ID column is dropped from `test` further down,
# but the IDs are needed again when the submission file is written.
test_incidentID = test['INCIDENT_ID']

In [11]:
# Remove the identifier and date columns from both frames (in place) so that only the
# X_* feature columns, plus the target in `train`, remain.
for frame in (train, test):
    frame.drop(columns=['INCIDENT_ID', 'DATE'], inplace=True)

In [12]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every numeric column.
train.describe()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15,MULTIPLE_OFFENSE
count,23856.0,23856.0,23856.0,23856.0,23856.0,23856.0,23856.0,23856.0,23856.0,23856.0,23856.0,23674.0,23856.0,23856.0,23856.0,23856.0
mean,0.483778,24.791206,24.63745,4.276744,2.455609,6.154175,4.876509,0.97246,4.924128,1.244802,206.954519,0.974064,85.237383,72.674296,33.464747,0.955231
std,1.439738,15.240231,15.135093,2.944672,1.963095,4.471756,3.881931,1.453144,1.362625,1.119301,93.033348,1.167725,27.597226,43.29732,8.386834,0.2068
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,7.0,8.0,2.0,1.0,3.0,2.0,0.0,5.0,1.0,174.0,1.0,72.0,29.0,34.0,1.0
50%,0.0,24.0,24.0,4.0,3.0,5.0,4.0,1.0,5.0,1.0,249.0,1.0,98.0,62.0,34.0,1.0
75%,0.0,36.0,35.0,6.0,5.0,8.0,7.0,1.0,6.0,1.0,249.0,1.0,103.0,107.0,34.0,1.0
max,7.0,52.0,52.0,10.0,5.0,19.0,18.0,99.0,6.0,90.0,332.0,90.0,116.0,142.0,50.0,1.0


In [13]:
# Separate the label from the features: pop() removes MULTIPLE_OFFENSE from `train`
# in place and returns it as a Series, which becomes y.
y = train.pop("MULTIPLE_OFFENSE")
train.head(1)

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
19022,0,22,22,7,3,8,3,7,2,2,0,2.0,72,142,34


In [14]:
# `x` is the feature matrix. This binds a second name to the same DataFrame object as
# `train` (no copy is made).
x = train

In [15]:
# Hold out 20% of the labelled rows for validation and train on the remaining 80%.
# stratify=y keeps the class ratio identical in both partitions, which matters because the
# target is highly imbalanced.
# A fixed random_state makes the shuffle, and therefore every downstream score, repeatable
# across kernel restarts; without it each run produces a different split.
from sklearn.model_selection import train_test_split
RANDOM_STATE = 42
X_train, X_cv, y_train, y_cv = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

In [16]:
# Standardise every feature (centre and scale). The scaler statistics are learned from the
# training partition only and then reused unchanged for the validation and test partitions.
# NOTE: this cell rebinds `train` and `test` to the scaled frames, so running it a second
# time would scale `test` again.
scalar = StandardScaler()
train = pd.DataFrame(scalar.fit_transform(X_train))
test = pd.DataFrame(scalar.transform(test))
cv = pd.DataFrame(scalar.transform(X_cv))

In [17]:
# Wrap each partition in XGBoost's DMatrix container.
# Labels are attached to the train and cross-validation partitions; the test partition has none.
dtrain = xgb.DMatrix(data=train, label=y_train)
dcv = xgb.DMatrix(data=cv, label=y_cv)
dtest = xgb.DMatrix(data=test)

In [34]:
# Booster settings handed to xgb.train
param = dict(
    max_depth=2,                 # the maximum depth of each tree
    eta=0.3,                     # the training step for each iteration
    objective='multi:softprob',
    num_class=2,                 # the number of classes that exist in this dataset
    min_child_weight=3,
)
# Passed to xgb.train as its third positional argument
num_round = 240

In [35]:
# Train the booster with the settings in `param`, passing num_round as the third argument.
# Only dtrain is passed here; the cross-validation DMatrix (dcv) is not used during training.
# %time is an IPython line magic that reports how long the statement took.
%time bst = xgb.train(param, dtrain, num_round)

CPU times: user 7.8 s, sys: 182 ms, total: 7.98 s
Wall time: 2.35 s


In [36]:
# Predicted class for every train and cross-validation row: take the index of the largest
# value in each row of the booster's prediction output.
train_pred = bst.predict(dtrain).argmax(axis=1)
cv_pred = bst.predict(dcv).argmax(axis=1)

In [37]:
# Recall on the training and cross-validation partitions (the competition test set has no
# labels, so it cannot be scored here).
# recall_score treats label 1 as the positive class by default, and class 1 is the large
# majority in this data, so values close to 1.0 are expected and should be read with care.
from sklearn.metrics import recall_score
train_score = recall_score(y_true=y_train, y_pred=train_pred)
cv_score = recall_score(y_true=y_cv, y_pred=cv_pred)

In [38]:
# Report the recall measured on the training partition.
print(f"Recall score for the training datasets {train_score}")

Recall score for the training datasets 1.0


In [39]:
# Report the recall measured on the held-out cross-validation partition.
print(f"Recall score for the cross-validation datasets {cv_score}")

Recall score for the cross-validation datasets 0.9997806055287407


In [40]:
# Predicted class for every test row: index of the largest value in each row of the
# booster's prediction output.
test_pred = bst.predict(dtest).argmax(axis=1)

In [41]:
# Assemble the submission: one row per test incident with its predicted label, written to
# submission.csv without the DataFrame index.
sub = pd.DataFrame({
    'INCIDENT_ID': test_incidentID,
    'MULTIPLE_OFFENSE': test_pred,
})
sub.to_csv('submission.csv', index=False)