In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**The dataset used for model building contained 10999 observations of 12 variables. The data contains the following information:**

1. ID: ID Number of Customers.
2. Warehouse block: The company has a large warehouse that is divided into blocks such as A, B, C, D, E.
3. Mode of shipment: The company ships the products in multiple ways, such as Ship, Flight and Road.
4. Customer care calls: The number of calls made to enquire about the shipment.
5. Customer rating: The rating the company received from each customer. 1 is the lowest (Worst), 5 is the highest (Best).
6. Cost of the product: Cost of the Product in US Dollars.
7. Prior purchases: The Number of Prior Purchase.
8. Product importance: The company has categorized the product in the various parameter such as low, medium, high.
9. Gender: Male and Female.
10. Discount offered: Discount offered on that specific product.
11. Weight in gms: It is the weight in grams.
12. Reached on time: It is the target variable, where 1 Indicates that the product has NOT reached on time and 0 indicates it has reached on time

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training CSV from the Kaggle input directory into the working frame.
df = pd.read_csv(filepath_or_buffer="../input/customer-analytics/Train.csv")

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Report the size of the dataset: row count first, then column count.
print("Number of Rows:", len(df))
print("Number of Columns:", len(df.columns))

In [None]:
# Print column names, non-null counts and dtypes for a quick schema check.
df.info()

****Data Pre Processing****

*Renaming Column*

In [None]:
# Give the target column a shorter, dot-free name.
df = df.rename(columns={"Reached.on.Time_Y.N": "Arrival_Time"})

In [None]:
# Display the column index to confirm the rename took effect.
df.columns

*Checking for Null Values*

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Observation : There are no null values in the dataset.

*Removing unwanted Column 'ID'*

In [None]:
# Drop the 'ID' column, which the notebook treats as unwanted for the analysis.
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps this cell idempotent: re-running it after 'ID' is already gone no longer
# raises a KeyError.
df = df.drop(columns="ID", errors="ignore")

In [None]:
# Show the first two rows to confirm the 'ID' column is gone.
df.head(2)

# Visualisation

***Plotly Plots***

In [None]:
import plotly.express as px
from plotly import graph_objects as go
import plotly.figure_factory as ff
import plotly.offline as py 

In [None]:
# Tabulate how many shipments fall into each Arrival_Time class
# (per the data description: 0 = reached on time, 1 = NOT on time).
#
# rename_axis() + reset_index(name=...) name both output columns explicitly, so
# the result no longer depends on how the installed pandas labels the output of
# value_counts().reset_index(): pandas < 2.0 produces "index"/"Arrival_Time",
# pandas >= 2.0 produces "Arrival_Time"/"count". The previous rename mapping
# only matched the old labels and mislabelled the class column as "Count" on
# newer versions.
time_count = (
    df["Arrival_Time"]
    .value_counts()
    .rename_axis("Arrival Time")
    .reset_index(name="Count")
)
# Share of each class as a percentage of all shipments, rounded to 2 decimals.
time_count['Percentage'] = round((time_count['Count'] / time_count['Count'].sum() * 100), 2)

In [None]:
# Display the per-class count and percentage table.
time_count

1.How Many Shipments Have arrived On Time?

In [None]:
# Bar chart of shipment counts per arrival class; the percentage is the hover title.
fig = px.bar(
    data_frame=time_count,
    x="Arrival Time",
    y='Count',
    hover_name='Percentage',
    title='Arrival Time Distribution',
    width=500,
    height=500,
)
fig.show()

Only 40% of the products reached their destination on time; almost 60% were delivered late.

2.How many items are there in Each WareHouse?

In [None]:
# Count the number of items held in each warehouse block.
#
# rename_axis() + reset_index(name=...) name both output columns explicitly, so
# the table has the columns "Warehouse_block" and "Count" on every pandas
# version. The previous rename mapping relied on the pre-2.0 "index" label and,
# on pandas >= 2.0, renamed the block column itself to "Count".
whb_count = (
    df["Warehouse_block"]
    .value_counts()
    .rename_axis("Warehouse_block")
    .reset_index(name="Count")
)

In [None]:
# Display the per-block item counts.
whb_count

In [None]:
# Pie chart: one slice per warehouse block, sized by its item count.
fig = px.pie(
    data_frame=whb_count,
    names='Warehouse_block',
    values='Count',
    title="Items in Each WareHouse Block",
)
fig.show()

33% of the goods are stored in Warehouse F, while each of the other warehouses stores only 16.7%.

3.Plotly Sunburst

In [None]:
# Sunburst drilling down Gender -> shipment mode -> product importance ->
# warehouse block, with segments coloured by product cost.
fig = px.sunburst(
    data_frame=df,
    path=['Gender', 'Mode_of_Shipment', 'Product_importance', 'Warehouse_block'],
    color=df['Cost_of_the_Product'],
    color_continuous_scale='RdBu',
)
fig.show()

4.Plotly Box Plot

In [None]:
# Split the data by shipment mode, one frame per mode.
flight, ship, road = (
    df[df["Mode_of_Shipment"] == mode] for mode in ("Flight", "Ship", "Road")
)

# One box trace of product cost per shipment mode.
trace, trace1, trace2 = (
    go.Box(y=subset["Cost_of_the_Product"], fillcolor=colour, name=label)
    for subset, colour, label in (
        (flight, "aqua", "Flight"),
        (ship, "pink", "Ship"),
        (road, "teal", "Road"),
    )
)
data = [trace, trace1, trace2]

layout = go.Layout(
    title="Cost Distribution w.r.t Mode of Shipment",
    xaxis={"title": "Mode of Shipment"},
    yaxis={"title": "Cost of Product"},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Split the data by arrival class (0 = on time, 1 = not on time).
ontime, delay = (df[df["Arrival_Time"] == flag] for flag in (0, 1))

# One box trace of product cost per arrival class.
trace, trace1 = (
    go.Box(y=subset["Cost_of_the_Product"], fillcolor=colour, name=label)
    for subset, colour, label in (
        (ontime, "aqua", "Ontime"),
        (delay, "pink", "Delayed"),
    )
)
data = [trace, trace1]

layout = go.Layout(
    title="Cost Distribution w.r.t Arrival TIme",
    xaxis={"title": "Arrival Time"},
    yaxis={"title": "Cost of Product"},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

5.Histogram

In [None]:
# Distribution plot of product cost as a single group.
x = df["Cost_of_the_Product"]
hist_data, group_labels = [x], ['distplot']

fig = ff.create_distplot(hist_data, group_labels)
fig.show()

# Dummy Variables

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
# so the dummy columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True, prefix_sep="_")

In [None]:
# Show the first two rows to inspect the new dummy columns.
df.head(2)

# CORRELATION ANALYSIS

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 20))
sns.heatmap(
    data=df.corr(),
    annot=True,
    annot_kws={"size": 10},
    cmap="Blues",
    linewidths=1,
)

In [None]:
# Correlation of every column with the target, sorted ascending.
(
    df.corr()
    .loc[:, 'Arrival_Time']
    .sort_values()
    .reset_index()
)

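* Weight_in_gms: Negative correlation. Because Arrival_Time = 1 means the shipment was NOT on time, this indicates that heavier products are late less often — the opposite of what one would expect if heavier shipments were harder to handle.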
* Cost_of_the_Product: Weak negative correlation.
* Customer_care_calls: Weak negative correlation. The problems with shipment may require more calls.
* Prior_purchases: Weak negative correlation. Customer acquisition might be main strategy rather than customer retention. However, magnitude of correlation is too low to make a certain comment on it.
* Customer_rating: Weak Positive correlation.
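* Discount_offered: Positive correlation. Because Arrival_Time = 1 means the shipment was NOT on time, higher discounts are associated with more delays; this does not support the idea that high discounts go to more important customers whose shipments are prioritized to be completed on time.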
* Warehouse_block, Mode_of_Shipment, Product_importance and Gender have a very weak correlation with the Arrival Time of the shipment, so we can omit those fields from our modelling.

# DIMENSIONALITY REDUCTION (PCA)

In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [None]:
# Keep only the six numeric predictors used for PCA and modelling.
X = df.loc[:, [
    'Customer_care_calls',
    'Customer_rating',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Discount_offered',
    'Weight_in_gms',
]]

In [None]:
# Preview the selected feature matrix.
X.head()

#### PCA without normalization

In [None]:
# Fit a PCA on the raw (unscaled) features; no n_components is given, so the
# estimator's default is used.
pcs_w = PCA()
pcs_w.fit(X)

In [None]:
# Per-component summary for the unscaled PCA: standard deviation, share of
# variance explained, and the running total of that share.
pcs_w_Summary_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pcs_w.explained_variance_),
        "Proportion of Variance": pcs_w.explained_variance_ratio_,
        "Cumulative Proportion": pcs_w.explained_variance_ratio_.cumsum(),
    }
)

In [None]:
# Flip the summary so components are columns, label them PC1..PCn, and show
# the values rounded to three decimals.
pcs_w_Summary_df = pcs_w_Summary_df.T
pcs_w_Summary_df.columns = [f"PC{k}" for k in range(1, pcs_w_Summary_df.shape[1] + 1)]
pcs_w_Summary_df.round(3)

Without Normalization only one component is required to explain 99% variance.

### PCA with Normalization

In [None]:
# Normalise the features with sklearn's preprocessing.scale so that columns on
# large numeric scales do not dominate the PCA.
shipment_norm = preprocessing.scale(X)

In [None]:
# Fit a PCA on the normalised features; no n_components is given, so the
# estimator's default is used.
pcs = PCA()
pcs.fit(shipment_norm)

In [None]:
# Per-component summary for the normalised PCA: standard deviation, share of
# variance explained, and the running total of that share.
pcsSummary_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pcs.explained_variance_),
        "Proportion of Variance": pcs.explained_variance_ratio_,
        "Cumulative Proportion": pcs.explained_variance_ratio_.cumsum(),
    }
)

In [None]:
# Flip the summary so that each principal component becomes a column.
pcsSummary_df = pcsSummary_df.T

In [None]:
# Label the components PC1..PCn and show the summary rounded to three decimals.
pcsSummary_df.columns = [f"PC{k}" for k in range(1, pcsSummary_df.shape[1] + 1)]
pcsSummary_df.round(3)

 To explain 90% variance 5 components are required

# CLASSIFICATION

In [None]:
# Project the normalised features onto the leading principal components.
# X has six columns, so the previous n_components=6 kept every component and
# performed no dimensionality reduction at all. A float n_components makes PCA
# keep just enough components to explain that share of the variance, which ties
# this step to the 90% threshold examined in the PCA summary; a float threshold
# requires svd_solver="full".
pca = PCA(n_components=0.90, svd_solver="full")
pca_reduced = pca.fit_transform(shipment_norm)

In [None]:
# Check the (rows, components) shape of the projected data.
pca_reduced.shape

In [None]:
# Target vector: 1 = shipment did NOT arrive on time, 0 = arrived on time.
y = df.loc[:, 'Arrival_Time']

In [None]:
#importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    pca_reduced,
    y,
    test_size=0.33,
    random_state=42,
)

### 1. LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a default logistic regression (fit returns the estimator itself) and
# predict the held-out set.
lg = LogisticRegression().fit(xtrain, ytrain)
predictions_logg = lg.predict(xtest)

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report.
print(
    confusion_matrix(ytest, predictions_logg),
    classification_report(ytest, predictions_logg),
    sep="\n",
)

In [None]:
# 10-fold cross-validation of the logistic regression on the training split.
accuracies = cross_val_score(lg, xtrain, ytrain, cv=10)
print("10 Fold Cross Validation:", accuracies)
print("Mean Accuracy:", accuracies.mean())

### 2.NAIVE BAYES CLASSIFICATION

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian naive Bayes classifier (fit returns the estimator itself)
# and predict the held-out set.
nb = GaussianNB().fit(xtrain, ytrain)
ypred = nb.predict(xtest)

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report.
print(
    confusion_matrix(ytest, ypred),
    classification_report(ytest, ypred),
    sep="\n",
)

# 10-fold cross-validation of naive Bayes on the training split.
accuracies = cross_val_score(nb, xtrain, ytrain, cv=10)
print("10 Fold Cross Validation:", accuracies)
print("Mean Accuracy:", accuracies.mean())

### 3.RANDOM FOREST CLASSIFIER

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with a fixed seed (fit returns the estimator itself),
# then predict the held-out set.
rfc = RandomForestClassifier(random_state=0).fit(xtrain, ytrain)
y_predrfc = rfc.predict(xtest)

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report.
print(
    confusion_matrix(ytest, y_predrfc),
    classification_report(ytest, y_predrfc),
    sep="\n",
)

# 10-fold cross-validation of the random forest on the training split.
accuracies = cross_val_score(rfc, xtrain, ytrain, cv=10)
print("10 Fold Cross Validation:", accuracies)
print("Mean Accuracy:", accuracies.mean())

### 4.ADABOOST CLASSIFIER

In [None]:
from sklearn.ensemble  import AdaBoostClassifier

# Train AdaBoost with a fixed seed and predict the held-out set.
ada_obj = AdaBoostClassifier(random_state=42)
ada_obj.fit(xtrain, ytrain)
ada_classifier = ada_obj  # fit() returns the estimator itself, so both names share one object
y_pred_ada = ada_classifier.predict(xtest)

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report.
print(
    confusion_matrix(ytest, y_pred_ada),
    classification_report(ytest, y_pred_ada),
    sep="\n",
)

# 10-fold cross-validation of AdaBoost on the training split.
accuracies = cross_val_score(ada_classifier, xtrain, ytrain, cv=10)
print("10 Fold Cross Validation:", accuracies)
print("Mean Accuracy:", accuracies.mean())

## ROC TESTS

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, auc

# Classifiers to compare; each one was already fitted on (xtrain, ytrain) in
# the cell that created it.
models = [
    {'label': ' Logistic Regression', 'model': lg},
    {'label': 'Naive Bayes Classification', 'model': nb},
    {'label': 'Random Forest Classification', 'model': rfc},
    {'label': 'Adaboost Classification', 'model': ada_classifier},
]

# Open one figure for all curves. The former plt.clf() call ran before any
# figure existed, which made matplotlib create (and the notebook render) an
# extra empty figure.
plt.figure(figsize=(8, 6))
for m in models:
    # Reuse the fitted estimators instead of re-training them. The old
    # `.probability = True` assignment is an SVC-specific switch and had no
    # effect on these classifiers, so it is dropped.
    probas = m['model'].predict_proba(xtest)
    # Column 1 holds the probability of class 1 (shipment NOT on time).
    fpr, tpr, _ = roc_curve(ytest, probas[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], roc_auc))

# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=0, fontsize='small')
plt.show()

Random Forest Classifier gives the highest accuracy compared to the other models.