In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings - consider narrowing it.
warnings.simplefilter("ignore")

## 1. Data Extraction

In [None]:
# Path of the training CSV, relative to the notebook's working directory.
train_csv = "../input/customer-analytics/Train.csv"
data = pd.read_csv(filepath_or_buffer=train_csv)

In [None]:
# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

#### a. Data Exploration

The data contains the following information: ID: ID number of the customer. Warehouse block: The company has a large warehouse that is divided into blocks such as A, B, C, D, E. Mode of shipment: The company ships products in multiple ways, such as Ship, Flight and Road. Customer care calls: The number of calls made to enquire about the shipment. Customer rating: The rating the company received from each customer; 1 is the lowest (worst), 5 is the highest (best). Cost of the product: Cost of the product in US dollars. Prior purchases: The number of prior purchases. Product importance: The company has categorized each product by importance as low, medium, or high. Gender: Male and Female. Discount offered: Discount offered on that specific product. Weight in gms: The weight in grams. Reached on time: The target variable, where 1 indicates that the product has NOT reached on time and 0 indicates it has reached on time.

In [None]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame
data.info()

In [None]:
# Unpack (rows, columns) in one step and report both.
m, n = data.shape

print(f"Number of rows: {m}")
print(f"Number of columns: {n}")

#### b. Data Cleansing

In [None]:
# Count the missing entries in each column (isna is the same check as isnull)
data.isna().sum()

In [None]:
# Count fully duplicated rows and repeated values in the "ID" column
n_dup_rows = data.duplicated().sum()
n_dup_ids = data["ID"].duplicated().sum()
print("Number of duplicated data:", n_dup_rows)
print("Number of duplicated ID:", n_dup_ids)

#### c. Features and Label

In [None]:
# Drop the "ID" column; it is not used as a feature in the analysis below.
# Re-assigning with errors="ignore" (instead of inplace=True) keeps this cell
# idempotent: re-running it after "ID" is already gone no longer raises KeyError.
data = data.drop(columns=["ID"], errors="ignore")

In [None]:
data_div = data.copy()

# Convert the 0/1 target into a readable categorical label.
# The column is assigned back explicitly: calling `.replace(..., inplace=True)` on
# `data_div[col]` is a chained assignment that is not guaranteed to update the
# frame (it does not under pandas Copy-on-Write), and replacing values of a
# categorical Series is deprecated.
arrival_labels = {0: 'On_Time', 1: 'Not_On_Time'}
data_div['Reached.on.Time_Y.N'] = pd.Categorical(
    data_div['Reached.on.Time_Y.N'].map(arrival_labels),
    # keep the category order of the original codes: 0 -> On_Time, 1 -> Not_On_Time
    categories=list(arrival_labels.values()),
)

# separate features and target
label = data_div['Reached.on.Time_Y.N']
# the target is categorical now, so it is excluded from the numeric selection
numericals = data_div.select_dtypes(include=['number'])
categoricals = data_div.drop(['Reached.on.Time_Y.N'], axis=1).select_dtypes(exclude=['number'])

print("\nlabel column:", label.name)
print("__________________________\n")
print("numerical columns:\n", numericals.columns)
print("__________________________\n")
print("categorical columns:\n", categoricals.columns)

<font color="darkblue">
    <b><i>All features except "ID" will be used to explore correlations and insights, so that we avoid making too many assumptions up front</i></b>
</font>

## 2. Exploratory Data Analysis

### 2.1 Descriptive Statistics

In [None]:
# Descriptive statistics of the numeric feature columns.
numeric_summary = numericals.describe()
numeric_summary

In [None]:
# Descriptive statistics of the non-numeric feature columns.
categorical_summary = categoricals.describe()
categorical_summary

<font color="darkblue">
    <b><i>There is a significant difference between the mean and the median of 'Discount_offered' and 'Weight_in_gms', indicating skewed distributions</i></b>
</font>

### 2.2 Univariate Analysis

#### a. Countplot

In [None]:
# Number of orders in each arrival-time class.
class_counts = label.value_counts()
class_counts

In [None]:
# plot
# Pass the Series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which would no longer plot the class counts.
ax = sns.countplot(x=label, palette='RdBu')

# Annotate every bar with its share of all orders.
total = float(len(label))
label_offset = 150  # vertical gap (in counts) between the bar top and its text
for p in ax.patches:
    bar_height = p.get_height()
    if not bar_height > 0:
        # skip empty or NaN-height patches so no "nan%"/"0.0%" text is drawn
        continue
    percentage = '{:.1f}%'.format(100 * bar_height/total)
    x = p.get_x() + p.get_width()/2
    y = bar_height + label_offset
    ax.annotate(percentage, (x, y), ha='center', fontsize=15)

# settings
ax.set_xlabel('Arrival Time', fontsize = 13, labelpad = 20)
ax.set_ylabel('Frequency', fontsize = 13, labelpad = 20)
ax.set_title('Number of Product Orders that on Time and Delayed are almost Balance', fontsize = 16, pad = 30)
sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
for side in ('left', 'bottom'):
    ax.spines[side].set_color('lightgray')

<font color="darkblue">
    <b><i>The graph above shows that the 'On Time' and 'Not On Time' arrival categories have almost balanced order frequencies</i></b>
</font>

In [None]:
# Arrival-time counts per warehouse block, each bar labelled with its share of all orders.
fig, ax = plt.subplots(figsize=(10, 5))

# plot
total = float(len(data_div))

sns.countplot(x="Warehouse_block", data=data_div, hue='Reached.on.Time_Y.N', palette="Greens", ax=ax)
sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
for side in ('left', 'bottom'):
    ax.spines[side].set_color('lightgray')

for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        # Empty category/hue combinations and legend proxy patches have a NaN or
        # zero height; labelling them would draw stray "nan%"/"0.0%" text.
        continue
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 100,  # offset (in counts) so the text sits above the bar
            '{0:.1%}'.format(height/total),
            ha="center", fontsize=10)

# settings
ax.set_ylabel('Frequency')
ax.set_title('There is No Significant Different on Arrival Time Ratio Based on Warehouse Block', fontsize = 14, pad = 50);

<font color="darkgreen">
    <ul><b><i>
        <li>ratio D: 1.49</li>
        <li>ratio F: 1.48</li>
        <li>ratio A: 1.42</li>
        <li>ratio B: 1.52</li>
        <li>ratio C: 1.47</li>
    </i></b></ul>
</font>

In [None]:
# Arrival-time counts per mode of shipment, each bar labelled with its share of all orders.
fig, ax = plt.subplots(figsize=(10, 5))

# plot
total = float(len(data_div))

sns.countplot(x="Mode_of_Shipment", data=data_div, hue='Reached.on.Time_Y.N', palette="Greens", ax=ax)
sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
for side in ('left', 'bottom'):
    ax.spines[side].set_color('lightgray')

for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        # Empty category/hue combinations and legend proxy patches have a NaN or
        # zero height; labelling them would draw stray "nan%"/"0.0%" text.
        continue
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 100,  # offset (in counts) so the text sits above the bar
            '{0:.1%}'.format(height/total),
            ha="center", fontsize=10)

# settings
ax.set_ylabel('Frequency')
ax.set_title('There is No Significant Different on Arrival Time Ratio Based on Mode of Shipment', fontsize = 14, pad = 50);

<font color="darkgreen">
    <ul><b><i>
        <li>ratio Flight: 1.52</li>
        <li>ratio Ship: 1.48</li>
        <li>ratio Road: 1.42</li>
    </i></b></ul>
</font>

In [None]:
# Arrival-time counts per product importance level, each bar labelled with its share of all orders.
fig, ax = plt.subplots(figsize=(10, 5))

# plot
total = float(len(data_div))

sns.countplot(x="Product_importance", data=data_div, hue='Reached.on.Time_Y.N', palette="Greens", ax=ax)
sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
for side in ('left', 'bottom'):
    ax.spines[side].set_color('lightgray')

for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        # Empty category/hue combinations and legend proxy patches have a NaN or
        # zero height; labelling them would draw stray "nan%"/"0.0%" text.
        continue
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 100,  # offset (in counts) so the text sits above the bar
            '{0:.1%}'.format(height/total),
            ha="center", fontsize=10)

# settings
ax.set_ylabel('Frequency')
ax.set_title('Product Importance High has the Highest Arrival Time Ratio', fontsize = 14, pad = 50);

<font color="darkgreen">
    <ul><b><i>
        <li>ratio low: 1.45</li>
        <li>ratio medium: 1.44</li>
        <li>ratio high: 1.87</li>
    </i></b></ul>
</font>

In [None]:
# Arrival-time counts per gender, each bar labelled with its share of all orders.
fig, ax = plt.subplots(figsize=(10, 5))

# plot
total = float(len(data_div))

sns.countplot(x="Gender", data=data_div, hue='Reached.on.Time_Y.N', palette="Greens", ax=ax)
sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
for side in ('left', 'bottom'):
    ax.spines[side].set_color('lightgray')

for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        # Empty category/hue combinations and legend proxy patches have a NaN or
        # zero height; labelling them would draw stray "nan%"/"0.0%" text.
        continue
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 100,  # offset (in counts) so the text sits above the bar
            '{0:.1%}'.format(height/total),
            ha="center", fontsize=10)

# settings
ax.set_ylabel('Frequency')
ax.set_title('There is No Significant Different on Arrival Time Based on Gender', fontsize = 14, pad = 50);

<font color="darkgreen">
    <ul><b><i>
        <li>ratio female: 1.47</li>
        <li>ratio male: 1.49</li>
    </i></b></ul>
</font>

#### b. Boxplots

In [None]:
# One horizontal boxplot per numeric feature, side by side.
box_colours = ['grey','red','yellow','lightblue','lightgreen','purple']
fig = plt.figure(figsize=(20,3))
n_panels = len(numericals.columns)
for i, col in enumerate(numericals.columns):
    ax = fig.add_subplot(1, n_panels, i + 1)
    # `x=` is passed by keyword: newer seaborn treats a positional Series as `data`.
    # Colours are cycled so no column is silently dropped if there are more than six.
    sns.boxplot(x=numericals[col], color=box_colours[i % len(box_colours)], ax=ax)
# lay out once, after all panels exist, instead of on every iteration
fig.tight_layout()

<font color="darkblue">
    <ul><b><i>
        <li>The boxplots above show that 'Prior_purchases' and 'Discount_offered' have distributions affected by outliers, which are piled up at the right end</li>
         <li>Weight_in_gms has an asymmetric distribution shown by median line that is not in the middle of interquartile range. However, there are no outliers on the graph. It may be affected by the number of modes that is more than 1 and imbalanced. The number of modes indicates there are different groups of Weight</li>
         <li>'Customer_care_calls', 'Customer_rating' and 'Cost_of_the_Product' tend to have a roughly symmetric (approximately normal) distribution, shown by the median line lying in the middle of the IQR</li>
    </i></b></ul>
</font>

#### c. Distplots

In [None]:
# Histogram with KDE overlay for each numeric feature, side by side.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is the
# replacement recommended by seaborn for the same histogram + density-curve view.
dist_colours = ['grey','red','yellow','lightblue','lightgreen','purple']
fig = plt.figure(figsize=(20,3))
n_panels = len(numericals.columns)
for i, col in enumerate(numericals.columns):
    ax = fig.add_subplot(1, n_panels, i + 1)
    # Colours are cycled so no column is silently dropped if there are more than six.
    sns.histplot(x=numericals[col], kde=True, stat="density", kde_kws={"cut": 3},
                 color=dist_colours[i % len(dist_colours)], ax=ax)
# lay out once, after all panels exist, instead of on every iteration
fig.tight_layout()

<font color="darkblue">
    <ul><b><i>
        <li>The distribution plots above show that 'Prior_purchases' and 'Discount_offered' have a right-skewed (positively skewed) distribution, where the data are piled up at the left end. Only a very small portion of the data lies on the right side (outliers). This makes the mean greater than the median</li>
         <li>Weight_in_gms has a bimodal distribution. It is affected by 2 imbalanced modes/peaks. It also indicates there are 2 different types of Weight</li>
         <li>'Customer_care_calls', 'Customer_rating' and 'Cost_of_the_Product' tend to have a symmetric or normal distribution: the two halves are nearly identical when the distribution is folded at its center point</li>
          <li>More specifically, 'Customer_rating' is close to a uniform distribution: the frequencies are approximately the same for each rating, so the distribution is basically a flat line</li>
    </i></b></ul>
</font>

### 2.3 Multivariate Analysis

#### a. Pair Grid

In [None]:
from scipy.stats import pearsonr

def reg_coef(x,y,label=None,color=None,**kwargs):
    """Write the Pearson correlation of ``x`` and ``y`` in the centre of the current axes.

    Used below as a ``PairGrid`` mapping function. ``label``, ``color`` and any
    extra keyword arguments are accepted but not used. The axes frame is hidden
    so that only the ``r = ...`` text remains.
    """
    panel = plt.gca()
    pearson_r = pearsonr(x, y)[0]  # the second element (p-value) is not displayed
    text = 'r = {:.2f}'.format(pearson_r)
    panel.annotate(text, xy=(0.5,0.5), xycoords='axes fraction', ha='center', size=20)
    panel.set_axis_off()

# Pair grid of the numeric features: distributions on the diagonal, regression
# fits below it, and the Pearson r (via reg_coef) above it.
g = sns.PairGrid(data=numericals)
# `sns.distplot` is deprecated; histplot with a KDE overlay gives the same view
g.map_diag(sns.histplot, kde=True, stat="density", kde_kws={"cut": 3})
g.map_lower(sns.regplot)
g.map_upper(reg_coef);

<font color="darkblue">
    <ul><b><i>
        <li>There are no clear clusters shown on graph</li>
    </i></b></ul>
</font>

#### b. Pair plot + Hue

In [None]:
# Pairwise plots of the numeric columns, coloured by arrival-time class.
pair_kwargs = {"hue": "Reached.on.Time_Y.N", "palette": "RdBu"}
sns.pairplot(data_div, **pair_kwargs);

<font color="darkblue">
    <ul><b><i>
        <li>Product orders that arrived on time tend to have the lowest 'Discount_offered' and tend to have either the highest or the lowest 'Weight_in_gms'</li>
        <li>In the plots that pair 'Discount_offered' or 'Weight_in_gms' with the other features, the on-time and not-on-time categories tend to be well separated, indicating that these are good feature combinations</li>
    </i></b></ul>
</font>

#### c. Correlation Heatmap

In [None]:
# Pairwise Pearson correlation between the numeric features, shown as a table
pearson_table = numericals.corr(method="pearson")
pearson_table

In [None]:
# compute correlation
corr = numericals.corr(method = "pearson")

# mask for the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy
# and raises AttributeError on current releases.
mask = np.triu(np.ones_like(corr, dtype = bool))

# set figure size
f, ax = plt.subplots(figsize = (10,10))

# set colormap
cmap = sns.diverging_palette(220, 10, as_cmap = True)

# fixed -1..1 colour scale centred on 0 so the colours map directly onto r
sns.heatmap(corr, mask = mask, cmap = cmap,
            vmin = -1, vmax = 1, center = 0,
            linewidths = .5, cbar_kws = {"shrink": .5}, annot = True, ax = ax);

<font color="darkblue">
    <ul><b><i>
        <li>There are no redundant features and no strongly correlated features</li>
    </i></b></ul>
</font>