In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# from sklearn.inspection import plot_partial_dependence
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

The dataset is from Kaggle: [Customer Analytics](https://www.kaggle.com/prachi13/customer-analytics).  
Some of the columns:  
* Customer care calls: the number of calls the customer made to enquire about the shipment.  
* Reached on time: the target variable, where 1 indicates that the product has NOT reached on time and 0 indicates that it has reached on time.  

In [None]:
# Read the Kaggle customer-analytics training file into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/customer-analytics/Train.csv')

In [None]:
# Print each column's name, non-null count and dtype to check for missing values and wrong types.
df.info()

The summary shows that there are no missing values and that every column has an appropriate data type. 

In [None]:
# Remove the first column of the loaded file (dropped by position), then list
# the distinct values of every categorical feature so that typos or unexpected
# categories are easy to spot.
df = df.drop(columns=df.columns[0])

# All string-typed columns, plus Customer_rating and the target column, which
# are added explicitly.
cat_features = pd.concat(
    [
        df.select_dtypes(include='object'),
        df[['Customer_rating', 'Reached.on.Time_Y.N']],
    ],
    axis='columns',
)

# Column name -> array of its distinct values.
cat_vals = {name: cat_features[name].unique() for name in cat_features.columns}

print('Unique values for categorical features:\n')
for name, uniques in cat_vals.items():
    print(len(uniques), 'unique values of ', name, ':', uniques)

All categorical values look valid: there are no unexpected or misspelled categories. 

In [None]:
# Stacked histograms of the numeric features, split by the delivery outcome.
# The six panels differ only in the column plotted, so they are drawn in a
# loop rather than with six copy-pasted calls.
hist_columns = [
    'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
    'Prior_purchases', 'Discount_offered', 'Weight_in_gms',
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# axes.flat walks the 2x3 grid row by row, which matches the order of hist_columns.
for ax, column in zip(axes.flat, hist_columns):
    sns.histplot(ax=ax, data=df, x=column, hue='Reached.on.Time_Y.N', multiple="stack")
    ax.set_title(column)  # each panel stays readable when skimmed on its own
fig.tight_layout()
# Render the figure explicitly so the cell does not echo the repr of the last Axes.
plt.show()

There seem to be some interesting patterns in the histograms above:   
1. All of the products that reached on time had a discount of less than 10%, whereas discounts for products that did not reach on time range from 0 to 65%.  
2. None of the products weighing 2-4 kg reached on time, and these products all cost over 180 dollars.   

In [None]:
# Apply seaborn's default theme, then draw the pairwise relationships between
# the columns of df, coloured by the delivery outcome.
sns.set()
sns.pairplot(data=df, hue='Reached.on.Time_Y.N', height=2)
plt.show()

There seem to be some interesting patterns in the pair plot above:   
1. Products with the most customer care calls are relatively expensive (mostly over 230 dollars), light (0-2 kg), and have smaller discounts (mostly below 10%).  
2. Products weighing 4-6 kg only received discounts of less than 10%.   

In [None]:
# Encode the string-valued categorical features as integer codes.
#
# Series.map with an explicit int64 cast is used instead of DataFrame.replace:
# replace only produces a numeric column through silent object->int
# downcasting, which pandas has deprecated, and it would leave any unexpected
# label in place as a string. With map, an unknown label fails loudly.
# Re-running this cell without re-loading df therefore raises instead of
# silently duplicating columns.
def encode_column(frame, column, codes):
    """Return a one-column DataFrame with ``frame[column]`` mapped through ``codes``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the raw categorical column.
    column : str
        Name of the column to encode.
    codes : dict
        Mapping from each raw label to its integer code.

    Returns
    -------
    pandas.DataFrame
        Single int64 column named ``column``, indexed like ``frame``.

    Raises
    ------
    ValueError
        If the column contains a value that has no entry in ``codes``.
    """
    encoded = frame[column].map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, column].unique().tolist()
        raise ValueError(f'Unexpected values in {column!r}: {unknown}')
    return encoded.astype('int64').to_frame()


Warehouse_block_enc = encode_column(df, 'Warehouse_block', {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'F': 4})
Mode_of_Shipment_enc = encode_column(df, 'Mode_of_Shipment', {'Flight': 0, 'Ship': 1, 'Road': 2})
Product_importance_enc = encode_column(df, 'Product_importance', {'low': 0, 'medium': 1, 'high': 2})
# NOTE(review): Gender is encoded but not concatenated below, so it is dropped
# from df together with the other object columns -- confirm this is intended.
Gender_enc = encode_column(df, 'Gender', {'F': 0, 'M': 1})

# Keep the non-object columns and append the three encoded categorical columns.
df = pd.concat(
    [df.select_dtypes(exclude='object'), Warehouse_block_enc, Mode_of_Shipment_enc, Product_importance_enc],
    axis=1,
)
# df.head()
df.describe().T

In [None]:
# If there are too many subplots, use necessary scrollbar to make sure the plots are large enough
# from IPython.display import display, HTML
# CSS = """div.output_area img {max-width:None !important;max-height: None !important";}"""
# display(HTML('<style>{}</style>'.format(CSS)))

# sns.set()
# sns.pairplot(df, height=2)
# plt.show();

The three additional (encoded) categorical features do not show any interesting patterns. 

In [None]:
# Pairwise correlations between all columns of df, drawn as an annotated heatmap.
corrmatrix = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
ax = sns.heatmap(
    data=corrmatrix,
    ax=ax,  # the axes created above (also the current axes)
    vmax=0.8,
    square=True,
    annot=True,
    cmap="YlGnBu",
)

In [None]:
# --- Random forest, tuned with a grid search ---
# Separate the target from the predictors and hold out a test split.
y = df['Reached.on.Time_Y.N']
X = df.drop(columns=['Reached.on.Time_Y.N'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Search over tree depth and split criterion, using GridSearchCV's defaults
# for cross-validation and scoring.
param_grid = dict(max_depth=range(3, 10), criterion=['gini', 'entropy'])
rf = RandomForestClassifier(random_state=4)
model_rf = GridSearchCV(estimator=rf, param_grid=param_grid)
model_rf.fit(X_train, y_train)

# Classification metrics on the held-out split.
pred_test = model_rf.predict(X_test)
rf_report = classification_report(y_test, pred_test)
print('Classification Report of RandomForestClassifier: \n', rf_report)

# Mean ROC-AUC from cross-validating the whole grid search on the full data set.
scores = cross_val_score(model_rf, X, y, scoring='roc_auc')
print('cross validation score of RandomForestClassifier: %.8f' % scores.mean())

In [None]:
# --- XGBoost with default hyper-parameters ---
# Fit on the same training split used for the random forest.
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)

# Predict with model_xgb (not model_rf) so that the report printed under the
# "XGBClassifier" heading reflects this model's predictions.
pred_test = model_xgb.predict(X_test)
print('Classification Report of XGBClassifier: \n', classification_report(y_test, pred_test))

# Mean ROC-AUC from cross-validating a fresh XGBClassifier on the full data set.
scores = cross_val_score(model_xgb, X, y, scoring='roc_auc')
print('cross validation score of XGBClassifier: %.8f' % scores.mean())

In [None]:
# Plot the feature importances of the XGBoost model fitted above.
plot_importance(model_xgb)