In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Brand colors used as the default color cycle for the plots in this notebook.
custom_colors = ["#023e8a", "#0096c7","#90e0ef","#ff5400","#ffbd00"]
# sns.set_palette() returns None, so build the palette object first (keeping a
# usable reference in customPalette) and then register it as the default.
customPalette = sns.color_palette(custom_colors)
sns.set_palette(customPalette)

In [None]:
# Read the training CSV of the customer-analytics dataset and preview the first rows.
df = pd.read_csv(
    "../input/customer-analytics/Train.csv"
)
df.head()

In [None]:
df.loc[df.ID.duplicated(),:]

In [None]:
df.info()

In [None]:
#df['Reached.on.Time_Y.N']=df['Reached.on.Time_Y.N'].astype('object')

In [None]:
df.isnull().sum()

In [None]:
# Discard every row containing at least one missing value, then re-check the
# per-column null counts to confirm nothing is left.
df = df.dropna(axis=0, how="any")
df.isna().sum()

In [None]:
# The ID column is dropped as well before the analysis.
df = df.drop(columns=["ID"])

In [None]:
# Column names split by dtype: object-typed (text) columns vs. all the others.
# Note that float1 holds every non-object column, not only float ones.
object1 = df.select_dtypes(include=["object"]).columns
float1 = df.select_dtypes(exclude=["object"]).columns

In [None]:
# Show the distinct values of each object-typed column.
for column_name in object1:
    print(column_name, df[column_name].unique())

In [None]:
# Readable names for the binary delivery flag
def Reached_yn(i):
    """Translate the binary flag into a label.

    A value equal to 0 gives "On Time"; any other value gives "Delayed".
    """
    return "On Time" if i == 0 else "Delayed"
# Store the readable status next to the raw 0/1 flag and list the labels produced.
status_labels = df['Reached.on.Time_Y.N'].map(Reached_yn)
df['Reached'] = status_labels
df['Reached'].unique()

In [None]:
df.head()

In [None]:
df.describe()

Prior_purchases and Discount_offered change drastically above the 75th percentile. Let's analyze them further.

In [None]:
# Box plots of the two columns whose upper tail looked suspicious in describe().
_, box_ax = plt.subplots(figsize=(10, 5))
boxplot = df.boxplot(column=['Prior_purchases', 'Discount_offered'], ax=box_ax)

Prior purchases and discount offered have some outliers, which need to be treated so that the model is not affected.

In [None]:
# Print the upper quantiles of both columns to see where the tail starts.
tail_columns = ['Prior_purchases', 'Discount_offered']
for level in (0.75, 0.80, 0.85, 0.90, 0.95, 0.99):
    print(df[tail_columns].quantile(level))

In [None]:
# Number of rows at or above the chosen outlier thresholds.
many_prior_purchases = df['Prior_purchases'] >= 10
large_discount = df['Discount_offered'] >= 20
print(many_prior_purchases.sum())
print(large_discount.sum())

Since the number of outliers is very small for prior purchases, they can be removed directly from the data (note: no cell in this notebook actually drops them). Discount offered has around 15-20% outliers, so it needs some treatment.
We treat the 75th percentile of discount offered as the maximum
and replace higher discounts with it in the code below (the code uses 13 as the cap). We could also use the mean/median to treat them, but let's assume that these are genuinely high-discount orders and handle them accordingly.

In [None]:
# Cap Discount_offered: every discount of 13 or more is replaced by 13.
# The result is assigned back to the column instead of calling
# .mask(..., inplace=True) on the column selection, because an in-place call on
# a selected Series is not guaranteed to write through to df (with pandas
# Copy-on-Write it only modifies a temporary object).
DISCOUNT_CAP = 13
df['Discount_offered'] = df['Discount_offered'].clip(upper=DISCOUNT_CAP)
df

In [None]:
plt.figure(figsize=(20,6))
# Correlation is only defined for numeric columns. Selecting them explicitly
# avoids the error DataFrame.corr() raises on text columns in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr,annot=True,cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5)

plt.title('Correlation between Fields', fontsize=20,font="Serif")
plt.show()

It's clear that there is no strong correlation between the variables.

In [None]:
# Categorical data analysis: one pie chart of value shares per selected column.
Data = df[['Warehouse_block','Product_importance','Mode_of_Shipment','Reached.on.Time_Y.N','Gender']]
plt.figure(figsize=(20,12))
for position, column in enumerate(Data.columns, start=1):
    plt.subplot(3, 6, position)
    shares = Data[column].value_counts()
    shares.plot.pie(fontsize=10, autopct='%1.0f%%')

Observations: 
    1. Block F handles almost 1/3 of all orders.
    2. 91% of products are of medium or low importance.
    3. Ships deliver the major share of products.
    4. Almost 60% of products are delayed.
    5. No trend was found in purchases by gender.

In [None]:
df.groupby(['Mode_of_Shipment','Reached'])['Cost_of_the_Product'].sum()

In [None]:
# Build a pivot table to analyze shipments per mode and delivery status.
# Only the four columns needed for the summary are kept.
shipment=df.loc[:,['Mode_of_Shipment','Reached','Cost_of_the_Product','Weight_in_gms']]
# Bare expression in the middle of a cell: it is evaluated but not displayed,
# since only the last expression of a cell is rendered.
shipment
# One row per (Mode_of_Shipment, Reached) pair. 'Reached' is used both as an
# index level and as a value: np.size on it yields the number of rows in each
# group, while cost and weight are summed.
table = pd.pivot_table(shipment, index=['Mode_of_Shipment','Reached']
        ,values=['Reached','Cost_of_the_Product'
                 ,'Weight_in_gms']
                 ,aggfunc = {'Reached':np.size
                             ,'Cost_of_the_Product':np.sum
                             ,'Weight_in_gms':np.sum}     
                      )

# Share of each group in the total row count and in the total weight. The
# values are converted to strings with a trailing '%', so these two columns are
# for display only and are no longer numeric.
table['% Reached'] = (table.Reached / table.Reached.sum() * 100).astype(str) + '%'
table['% Weight'] = (table.Weight_in_gms / table.Weight_in_gms.sum() * 100).astype(str) + '%'
table

Ships account for most of the delayed deliveries, but Ship is also the mode that carries the bulk of the heavy products: about 36.3 + 31.4 = 67.7% of the total weight, and 7462 of the 10999 products.

In [None]:
# Three side-by-side count plots.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
importance_ax, block_ax, cost_ax = ax
# How many products of each importance level every shipment mode delivers
sns.countplot(data=df, x='Mode_of_Shipment', hue='Product_importance', ax=importance_ax)
# Which warehouse block delivers more products on time and which one delays more
sns.countplot(data=df, x='Warehouse_block', hue='Reached', ax=block_ax)
# Order counts per product cost, split by gender
sns.countplot(data=df, x='Cost_of_the_Product', hue='Gender', palette='tab10', ax=cost_ax)

In [None]:
# Delivery flag counts, broken down by each object-typed column (one panel each).
plt.figure(figsize=(13, 4))
for panel, category in enumerate(object1, start=1):
    plt.subplot(1, 4, panel)
    sns.countplot(data=df, x='Reached.on.Time_Y.N', hue=df[category])

In [None]:
# Warehouse block counts, broken down by each object-typed column (one panel each).
plt.figure(figsize=(13, 4))
for panel, category in enumerate(object1, start=1):
    plt.subplot(1, 4, panel)
    sns.countplot(data=df, x='Warehouse_block', hue=df[category])

These results indicate that warehouse block F contributes the most to product deliveries; 
it may simply be a bigger warehouse than the others.


In [None]:
# Weight vs. cost, drawn three times with a different coloring column per panel.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10, 10))

panels = [
    ('Product_importance', 'Weight vs cost by product importance'),
    ('Mode_of_Shipment', 'Weight vs cost by shipment mode'),
    ('Reached', 'Weight vs cost by Reached'),
]
for axis, (hue_column, panel_title) in zip(ax, panels):
    sns.scatterplot(data=df, x='Weight_in_gms', y='Cost_of_the_Product',
                    hue=hue_column, ax=axis)
    axis.title.set_text(panel_title)
    # Same horizontal window on every panel so they can be compared directly.
    axis.set_xlim(500, 8000)

plt.show()

1. There are a few clearly visible outliers: products with a low cost but a very heavy weight. In the plot colored by shipment mode (2nd scatter plot), they were shipped by Ship, plus one outlier shipped by Road.
2. In the plot colored by product importance (1st scatter plot), the outliers belong to medium-importance products.
3. In the plot colored by delivery status (3rd scatter plot), all of them are delayed products.

In [None]:
# Rows that are cheap (cost <= 180) yet heavy (weight > 6200 g).
is_cheap = df['Cost_of_the_Product'] <= 180.0
is_heavy = df['Weight_in_gms'] > 6200.0
Outliers = df[is_cheap & is_heavy]
Outliers

In [None]:
# Drop the cheap-but-heavy outliers observed in the scatter plots (in place).
cheap_and_heavy = (df['Cost_of_the_Product'] <= 180.0) & (df['Weight_in_gms'] > 6200.0)
df.drop(index=df.index[cheap_and_heavy], inplace=True)

In [None]:
df

In [None]:
 
# One-hot encoding of the multi-valued categorical columns
df_encoded = pd.get_dummies(df, columns=['Mode_of_Shipment','Warehouse_block','Product_importance'])

# Binary encoding of Gender. Series.map yields an integer column directly;
# replace() on an object column relies on implicit downcasting, which is
# deprecated in pandas 2.2. Values other than 'F'/'M' would become NaN.
df_encoded['Gender'] = df_encoded['Gender'].map({'F': 0, 'M': 1})
   

In [None]:
df_encoded.columns

## Model Building

In [None]:
# Model inputs: numeric fields, binary Gender and the one-hot encoded categories.
feature_columns = [
    'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
    'Prior_purchases', 'Gender', 'Discount_offered', 'Weight_in_gms',
    'Mode_of_Shipment_Flight', 'Mode_of_Shipment_Road', 'Mode_of_Shipment_Ship',
    'Warehouse_block_A', 'Warehouse_block_B', 'Warehouse_block_C',
    'Warehouse_block_D', 'Warehouse_block_F',
    'Product_importance_high', 'Product_importance_low', 'Product_importance_medium',
]
X = df_encoded.loc[:, feature_columns]
X.head()

In [None]:
# Target as a 1-D Series. scikit-learn estimators expect a 1-D target and emit
# DataConversionWarning when given a single-column DataFrame.
y = df_encoded['Reached.on.Time_Y.N']
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = splits

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
try:
    # plot_roc_curve was removed in scikit-learn 1.2 and is not used below.
    # Guarding the import keeps the name on old versions without breaking new ones.
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

dt = DecisionTreeClassifier(criterion = "gini",
            random_state = 100,max_depth=3, min_samples_leaf=7)
# Seeds are fixed so the reported scores are the same on every run.
ad = AdaBoostClassifier(random_state=0)
svm= SVC(random_state = 43, C = 10, gamma = 0.1, kernel ='rbf')
rf= RandomForestClassifier(random_state=0)
xg = XGBClassifier()


def _positive_class_scores(fitted_model, features):
    """Return continuous scores for the positive class of a fitted classifier.

    ROC AUC needs a ranking score, not hard 0/1 labels. Class-1 probabilities
    are preferred, then decision_function values; hard predictions are only
    the last resort for estimators offering neither.
    """
    if hasattr(fitted_model, "predict_proba"):
        return fitted_model.predict_proba(features)[:, 1]
    if hasattr(fitted_model, "decision_function"):
        return fitted_model.decision_function(features)
    return fitted_model.predict(features)


# Estimators expect a 1-D target; ravel() accepts both a Series and a
# single-column DataFrame.
y_train_1d = np.ravel(y_train)
y_all_1d = np.ravel(y)

models = [ dt,ad,rf]
for model in models:
    model.fit(x_train, y_train_1d)
    y_pred = model.predict(x_test)
    # Built-in round() is used because the metric functions may return plain
    # Python floats, which have no .round() method.
    scores = round(float(cross_val_score(model, X, y_all_1d, cv=5).mean()), 3)
    accuracy = round(float(metrics.accuracy_score(y_test, y_pred)), 3)
    ROC = round(float(metrics.roc_auc_score(y_test, _positive_class_scores(model, x_test))), 3)
    print(model, '\n', 'Accuracy:', accuracy,'\n', 'mean_CV_score:',scores, '\n' , 'ROC:', ROC,'\n')

Adaboost model has given good accuracy and ROC as well

In [None]:
ad.feature_importances_  

In [None]:
# Horizontal bar chart of the random forest importances, sorted ascending.
features = x_train.columns
importances = rf.feature_importances_
indices = importances.argsort()
bar_positions = range(len(indices))

plt.figure(1)
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, features[indices])
plt.title('Feature Importances')
plt.xlabel('Relative Importance')

In [None]:
# Horizontal bar chart of the decision tree importances, sorted ascending.
features = x_train.columns
importances = dt.feature_importances_
indices = importances.argsort()
bar_positions = range(len(indices))

plt.figure(1)
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, features[indices])
plt.title('Feature Importances')
plt.xlabel('Relative Importance')

In [None]:
# Horizontal bar chart of the AdaBoost importances, sorted ascending.
features = x_train.columns
importances = ad.feature_importances_
indices = importances.argsort()
bar_positions = range(len(indices))

plt.figure(1)
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, features[indices])
plt.title('Feature Importances')
plt.xlabel('Relative Importance')

## Creating Adaboost classifier - using random forest as weak learner here

In [None]:
# Create the AdaBoost classifier with the random forest as its base learner.
# The keyword was renamed from `base_estimator` to `estimator` in scikit-learn
# 1.2 and the old name was removed in 1.4. Older releases reject the new
# keyword with a TypeError, in which case the legacy name is used.
try:
    abc = AdaBoostClassifier(estimator=rf, n_estimators=50, learning_rate=1)
except TypeError:
    abc = AdaBoostClassifier(base_estimator=rf, n_estimators=50, learning_rate=1)

# Train the AdaBoost classifier. ravel() hands the estimator the 1-D target it
# expects, whether y_train is a Series or a single-column DataFrame.
model = abc.fit(x_train, np.ravel(y_train))

# Predict the response for the test dataset
y_pred = model.predict(x_test)


# Model accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Horizontal bar chart of the importances of the estimator currently bound to
# `model`, sorted ascending.
features = x_train.columns
importances = model.feature_importances_
indices = importances.argsort()
bar_positions = range(len(indices))

plt.figure(1)
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, features[indices])
plt.title('Feature Importances')
plt.xlabel('Relative Importance')

This method does not noticeably improve the accuracy or add new information.

### Decision Tree visualization

In [None]:
from sklearn import tree

In [None]:
# Text dump of the fitted decision tree. Passing the training column names
# makes the rules show real feature names instead of generic "feature_<n>".
text_representation = tree.export_text(dt, feature_names=list(x_train.columns))
print(text_representation)

In [None]:
# Plain lists of feature and class names for the tree plot.
Features = X.columns.tolist()
# class_names must follow the ascending order of the class labels (0, then 1).
# unique() on the label column returns values in order of first appearance,
# which can swap "On Time" and "Delayed" in the plot, so the names are derived
# from the sorted numeric flags instead.
target = [Reached_yn(flag) for flag in sorted(df['Reached.on.Time_Y.N'].unique())]
target

### Decision Tree with Gini Index

In [None]:
from sklearn.tree import plot_tree
# Large canvas so the node boxes of the tree stay readable.
fig = plt.figure(figsize=(25, 20))
tree_plot_options = dict(
    feature_names=Features,
    class_names=target,
    filled=True,
    rounded=True,
)
_ = tree.plot_tree(dt, **tree_plot_options)

In [None]:
fig.savefig("decistion_tree.png")

In [None]:
#Checking the confusion matrix
# NOTE(review): y_pred holds whatever was assigned last. Running the notebook
# top to bottom, that is the prediction from the AdaBoost-with-random-forest
# cell, not from the decision tree visualized above. Confirm which model this
# matrix is meant to describe.
print(confusion_matrix(y_test,y_pred))

# PCA

In [None]:
# Columns fed into PCA (numeric fields, binary Gender, one-hot categories).
pca_columns = [
    'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
    'Prior_purchases', 'Gender', 'Discount_offered', 'Weight_in_gms',
    'Mode_of_Shipment_Flight', 'Mode_of_Shipment_Road', 'Mode_of_Shipment_Ship',
    'Warehouse_block_A', 'Warehouse_block_B', 'Warehouse_block_C',
    'Warehouse_block_D', 'Warehouse_block_F',
    'Product_importance_high', 'Product_importance_low', 'Product_importance_medium',
]
PCA_set = df_encoded[pca_columns]

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every PCA input column; the scaled values are kept in x.
scaler = StandardScaler()
x = scaler.fit_transform(PCA_set)
# The frame displayed here is the unscaled input, not x.
PCA_set

In [None]:
# Generic names "feature0", "feature1", ... - one per column of the scaled matrix.
n_scaled_columns = x.shape[1]
feat_cols = ['feature{}'.format(position) for position in range(n_scaled_columns)]

In [None]:
# Scaled features as a DataFrame. The original name `set` shadowed Python's
# built-in set type for every later cell, so a descriptive name is used instead.
scaled_features_df = pd.DataFrame(x, columns=feat_cols)

In [None]:
from sklearn.decomposition import PCA
# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2)
pca_values = pca.fit_transform(X=x)

In [None]:
# Wrap the two projected components in a labeled DataFrame.
component_names = ['principal component 1', 'principal component 2']
principal_df = pd.DataFrame(pca_values, columns=component_names)

In [None]:
principal_df.tail()

## Customer Segmentation

In [None]:
Cust_segment = X

In [None]:
from sklearn.cluster import KMeans
# Within-cluster sum of squares (inertia) for k = 1..10, for the elbow plot.
# random_state makes the curve reproducible. n_init is spelled out because its
# default changed across scikit-learn versions.
wcss = []
for k in range(1,11):
    kmeans=KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=0)
    kmeans.fit(Cust_segment)
    wcss.append(kmeans.inertia_)

In [None]:
wcss

In [None]:
# Elbow curve: inertia against the number of clusters. The x values are derived
# from len(wcss) so they always match the number of fitted models.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.title("Elbow")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Within-cluster sum of squares")
plt.show()

In [None]:
# Final 3-cluster segmentation. A fixed random_state keeps the cluster ids
# (0/1/2) stable between runs, which the later label-based cells rely on.
# n_init is spelled out because its default changed across scikit-learn versions.
kmeans= KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=0)
cluster=kmeans.fit_predict(Cust_segment)
cluster

In [None]:
pd.Series(cluster).value_counts()

In [None]:
df['label']=cluster

In [None]:
df['label'].unique()

In [None]:
df

In [None]:
# The three scatters use unrelated axes (weight vs. discount, weight vs. cost,
# delivery flag vs. label). Drawing them on one shared axes overlays them into
# an unreadable plot, so each gets its own labeled panel.
cluster_fig, cluster_axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

colored_panels = [
    ('Weight_in_gms', 'Discount_offered'),
    ('Weight_in_gms', 'Cost_of_the_Product'),
]
for axis, (x_column, y_column) in zip(cluster_axes, colored_panels):
    # Points are colored by their cluster label.
    axis.scatter(data=df, x=x_column, y=y_column, c='label')
    axis.set_xlabel(x_column)
    axis.set_ylabel(y_column)
    axis.set_title(y_column + ' vs ' + x_column + ' by cluster')

cluster_axes[2].scatter(data=df, x='Reached.on.Time_Y.N', y='label')
cluster_axes[2].set_xlabel('Reached.on.Time_Y.N')
cluster_axes[2].set_ylabel('label')
cluster_axes[2].set_title('Cluster label vs delivery flag')

plt.show()

In [None]:
from mpl_toolkits import mplot3d


In [None]:
%matplotlib notebook
# 3D view of the clusters: prior purchases x delivery flag x discount offered,
# one color per cluster label.
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(111, projection='3d')
for cluster_id, colour in zip((0, 1, 2), ('blue', 'red', 'green')):
    members = df[df.label == cluster_id]
    ax.scatter(members.Prior_purchases, members["Reached.on.Time_Y.N"],
               members["Discount_offered"], c=colour, s=60)

ax.view_init(30, 185)

plt.show()

In [None]:
df[df.label == 1]

### High Payment Buyers Analysis

In [None]:
# Orders whose product cost is above 210.
is_high_payment = df['Cost_of_the_Product'].gt(210)
High_Payment_buyers = df.loc[is_high_payment]
High_Payment_buyers

In [None]:
# Categorical data analysis for the high-payment orders: one pie chart per column.
Data = High_Payment_buyers[['Product_importance','Reached','Gender','Prior_purchases']]
plt.figure(figsize=(25,15))
for position, column in enumerate(Data.columns, start=1):
    plt.subplot(3, 6, position)
    shares = Data[column].value_counts()
    shares.plot.pie(fontsize=10, autopct='%1.0f%%')

57% of orders are getting delayed for high payment buyers. 

In [None]:
sns.barplot(data=High_Payment_buyers,x='Prior_purchases',y='Cost_of_the_Product')

All the high-payment buyers are repeat customers: they are purchasing for at least the second time.

In [None]:
# High-payment orders that were delayed although the customer called support
# more than 3 times, ordered by number of calls and then by rating.
many_calls = High_Payment_buyers['Customer_care_calls'] > 3
was_delayed = High_Payment_buyers['Reached'] == 'Delayed'
Customer_with_bad_experience = (
    High_Payment_buyers[many_calls & was_delayed]
    .sort_values(by=['Customer_care_calls', 'Customer_rating'])
)

Customer_with_bad_experience['Customer_care_calls'].value_counts()

These customers called customer care more than 3 times, yet their orders were still delayed.