In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,classification_report
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings from the libraries used below will not be shown either.
warnings.filterwarnings('ignore')

In [None]:
# Load the training CSV of the customer-analytics dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/customer-analytics/Train.csv')

In [None]:
# Keep `data` as the untouched raw frame; preprocessing works on a deep copy.
preprocessData = data.copy(deep=True)
# Preview the first five raw rows.
data.head(n=5)

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
# Author's note from the original run: 12 columns and 10999 datapoints.
data.shape

In [None]:
# Per-column dtype and non-null count of the raw frame.
# Author's note from the original run: no column has missing values.
data.info()

In [None]:
# Descriptive summary statistics of the raw frame.
data.describe()

In [None]:
def takeDummy(col, frame=None):
    """One-hot encode a single categorical column, dropping the first level.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to read ``col`` from. When omitted, the notebook-level ``data``
        frame is used, so existing ``takeDummy(col)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named ``"<col>:<level>"``. The first level is
        dropped (``drop_first=True``) and therefore acts as the baseline.
    """
    source = data if frame is None else frame
    # get_dummies on a Series already returns a DataFrame; no re-wrapping needed.
    return pd.get_dummies(source[col], prefix=col, prefix_sep=':', drop_first=True)

In [None]:
# One-hot encode the four categorical columns
#     Warehouse_block, Mode_of_Shipment, Product_importance, Gender
# and join the indicator columns side by side in a single concat, instead of
# growing an empty frame with one copy-pasted concat per column.
categorical_cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
dataDummies = pd.concat([takeDummy(col) for col in categorical_cols], axis=1)
dataDummies.head()

In [None]:
# Swap the four raw categorical columns for their indicator columns.
# The frame is rebuilt from the untouched `data` rather than dropped in place
# from `preprocessData`, so re-running this cell does not raise a KeyError on
# columns that a previous run already removed.
preprocessData = pd.concat(
    [
        data.drop(columns=['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']),
        dataDummies,
    ],
    axis=1,
)

In [None]:
# Preview the first rows of the preprocessed frame to check the dummy columns were joined.
preprocessData.head()

In [None]:
# Gender split of the customers (the original run showed male and female to be
# almost evenly represented).
# value_counts() orders slices by frequency, so each label is looked up from
# the dummy's value (1 = 'Gender:M' set -> male, 0 -> female, as in the
# original labels) instead of being assigned by position.
gender_counts = preprocessData['Gender:M'].astype(int).value_counts()
gender_labels = [{0: 'female', 1: 'male'}[flag] for flag in gender_counts.index]
plt.pie(gender_counts, labels=gender_labels, autopct='%.2f%%')

In [None]:
# Class balance of the target column.
# value_counts() orders slices by frequency, not by class value, so each slice
# is labelled with the value it actually holds instead of a positional 'Y'/'N'.
# NOTE(review): the original labelled the larger class 'Y' (on time) purely by
# position; confirm against the dataset description which of the two values
# means "reached on time" before naming the slices.
target_counts = preprocessData['Reached.on.Time_Y.N'].value_counts()
target_labels = ['Reached.on.Time_Y.N = {}'.format(value) for value in target_counts.index]
plt.pie(target_counts, labels=target_labels, autopct='%.2f%%')

In [None]:
# Correlation heatmap over every column of the preprocessed frame.
# Author's reading of the original run: the dependent variable shows no
# dependence on the indicators derived from
#    Warehouse_block, Mode_of_Shipment, Product_importance, Gender
fig, ax = plt.subplots(figsize=(20, 20))
corr_matrix = preprocessData.corr()
sns.heatmap(corr_matrix, annot=True, square=True, cmap='RdBu', center=0, ax=ax)

In [None]:
# List the column labels of the preprocessed frame.
preprocessData.columns

In [None]:
# Keep only the numeric predictors plus the target for modelling.
# Per the author's correlation analysis, Warehouse_block, Mode_of_Shipment,
# Product_importance, Gender and ID correlate weakly with on-time delivery,
# so those fields are omitted.
model_columns = [
    'Customer_care_calls',
    'Customer_rating',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Discount_offered',
    'Weight_in_gms',
    'Reached.on.Time_Y.N',
]
datamodel = preprocessData[model_columns]

In [None]:
# Separate the dependent variable (y) from the independent variables (X).
y = datamodel['Reached.on.Time_Y.N']
X = datamodel.drop('Reached.on.Time_Y.N', axis=1)

In [None]:
# Hold out 33% of the rows for testing (a 67:33 train/test split), seeded so
# the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=123,
)

In [None]:
# Initialize and train the Extreme Gradient Boosting classifier.
# The previous learning_rate of 0.000001 shrinks every tree's contribution so
# much that the default number of boosting rounds barely moves the predictions
# away from the initial score, i.e. the ensemble is effectively untrained.
# 0.1 is a conventional starting value; random_state pins any sampling so the
# fit is reproducible.
model = XGBClassifier(learning_rate=0.1, max_depth=6, random_state=123)
model.fit(X_train, y_train)

In [None]:
# Class predictions of the fitted model for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted class matches the true class,
# reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_pct = accuracy * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Confirm the hold-out accuracy with 10-fold cross-validation on the full data.
# Folds are shuffled (with a fixed seed) so that any ordering of the rows in
# the CSV cannot skew individual folds; this is consistent with the shuffled
# train/test split used for the hold-out evaluation.
kfold = KFold(n_splits=10, shuffle=True, random_state=123)
results = cross_val_score(model, X, y, cv=kfold)
# The two figures are the mean and standard deviation of the per-fold
# accuracy, not an upper bound, so the label says so.
print("Cross-validated accuracy: %.2f%% (+/- %.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap.
# Author's reading of the original run: FP is lower and FN is higher, i.e.
# Type 1 error is lower -- the model predicts many on-time deliveries as
# delayed deliveries.
fig, ax = plt.subplots(figsize=(5, 3))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap='BuGn', ax=ax)

In [None]:
# ROC curve and AUC on the held-out test set.
# ROC analysis needs a continuous score: hard 0/1 predictions yield a single
# operating point rather than the full curve, so the predicted probability of
# the positive class is used instead.
y_score = model.predict_proba(X_test)[:, 1]
# These are test-set figures (the label previously said "train").
print("test auc is", roc_auc_score(y_test, y_score))
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Plot the curve against the chance diagonal.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0,1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# F1 score of the hold-out predictions for the positive class.
f1 = f1_score(y_test, y_pred)
print('The F1 score of the XGB model is  ==> {} '.format(f1))

In [None]:
# Per-class precision, recall and F1 on the hold-out set.
# Author's observation from the original run: the "reached on time" class has
# high precision, while the "not reached on time" class has high recall.
report = classification_report(y_test, y_pred)
print(report)