In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('classic')
%matplotlib inline

import warnings
warnings.filterwarnings(action='ignore')

# Loading Data.

An international e-commerce company wants to discover key insights from its customer database. It wants to use some of the most advanced machine learning techniques to study its customers. The company sells electronic products.

#### Feature description:

**ID:** ID Number of Customers.

**Warehouse block:** The company has a large warehouse that is divided into blocks such as A, B, C, D and E.

**Mode of shipment:** The company ships its products in multiple ways, such as Ship, Flight and Road.

**Customer care calls:** The number of calls made to enquire about the shipment.

**Customer rating:** The rating the company received from each customer. 1 is the lowest (worst), 5 is the highest (best).

**Cost of the product:** Cost of the Product in US Dollars.

**Prior purchases:** The number of prior purchases.

**Product importance:** The company has categorized each product by importance as low, medium or high.

**Gender:** Male and Female.

**Discount offered:** Discount offered on that specific product.

**Weight in gms:** It is the weight in grams.

**Reached on time:** It is the target variable, where 1 Indicates that the product has NOT reached on time and 0 indicates it has reached on time.

In [None]:
# Read the training CSV of the shipping dataset into a DataFrame.
data = pd.read_csv('../input/customer-analytics/Train.csv')

# Preview the first rows only, instead of rendering the whole frame as cell output.
data.head()

# Data Preprocessing.

#### Steps involved in Data Preprocessing:

**1)** Data cleaning

**2)** Removing outliers

**3)** Encoding categorical variables

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Number of distinct (non-missing) values in each column.
data.nunique(axis=0, dropna=True)

In [None]:
# Low-cardinality columns (plus the target); each one gets its own count plot.
cols = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating',
        'Prior_purchases', 'Product_importance', 'Gender', 'Reached.on.Time_Y.N']

# One panel per column, laid out on a 4x2 grid.
fig, axes = plt.subplots(4, 2, figsize=(20, 18), facecolor='#fbe7dd')

for ax, col in zip(axes.ravel(), cols):
    # Name the column through the `x` keyword: recent seaborn releases treat the
    # first positional argument as `data`, so passing a Series positionally
    # no longer plots the intended variable.
    sns.countplot(x=col, data=data, palette='icefire_r', ax=ax)

plt.show()

In [None]:
# The ID column is just a row index, so it is removed.
# errors='ignore' keeps this cell re-runnable once ID has already been dropped.
data = data.drop(columns='ID', errors='ignore')


# Assign the features as X and the target as y.

y = data['Reached.on.Time_Y.N']
X = data.drop(columns='Reached.on.Time_Y.N')

In [None]:
# Feature groups, one per kind of preprocessing.

# Encoded as one integer column each.
ordinal = [
    'Gender',
    'Product_importance'
]

# One-hot encoded.
nominal = [
    'Warehouse_block',
    'Mode_of_Shipment'
]

# Min-max scaled.
scal = [
    'Cost_of_the_Product',
    'Discount_offered',
    'Weight_in_gms'
]

# Explicit category order for every column in `ordinal`.
# With categories='auto' the encoder sorts labels alphabetically, which would
# rank Product_importance as high < low < medium instead of low < medium < high.
ordinal_categories = {
    # Two-valued column: sorted unique labels, i.e. the order 'auto' would pick.
    'Gender': sorted(X['Gender'].unique()),
    # Labels as listed in the feature description; the encoder raises at fit
    # time if the data contains a label that is not listed here.
    'Product_importance': ['low', 'medium', 'high'],
}


# Building pipelines

transformer_for_ordinal = Pipeline([
    ('ordinal', OrdinalEncoder(categories=[ordinal_categories[col] for col in ordinal]))
])
transformer_for_nominal = Pipeline([
    ('nominal', OneHotEncoder())
])

transformer_for_scal = Pipeline([
    ('scaler', MinMaxScaler())
])

In [None]:
# Combine the per-group pipelines into a single preprocessing step.
# Columns of X that belong to none of the three groups would be silently
# discarded by the default remainder='drop'; they are min-max scaled and kept
# instead so the models can use them.
# sparse_threshold=0 makes the transformer always return a dense array.
Transformer = ColumnTransformer(transformers=[
    ('ordinal', transformer_for_ordinal, ordinal),
    ('nominal', transformer_for_nominal, nominal),
    ('scal', transformer_for_scal, scal)
], remainder=MinMaxScaler(), sparse_threshold=0)

In [None]:
# Encode and scale the features. The result reuses X's index so that attaching
# the target below aligns row by row even when the index is not 0..n-1.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split, so the scaling statistics also include the rows later used for testing.
data_tran = pd.DataFrame(Transformer.fit_transform(X), index=X.index)
data_tran['y'] = y

# Preview the first rows of the transformed frame.
data_tran.head()

In [None]:
# Heatmap of the pairwise correlations between the transformed columns and the target.

plt.figure(figsize=(18, 9), facecolor='#fbe7dd')
correlations = data_tran.corr()
heatmap_options = dict(
    annot=True,
    vmin=-1,
    vmax=1,
    fmt='.2f',
    annot_kws={'size': 15},
    cmap='icefire',
)
sns.heatmap(correlations, **heatmap_options)
plt.show()

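The heatmap shows how strongly each transformed feature correlates with the target `y`. Next up, we will remove the features whose correlation with the target is negligible.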

In [None]:
# Pairwise Pearson correlations between all columns of the transformed frame.
corr = data_tran.corr(method='pearson')

In [None]:
# Keep the columns whose absolute correlation with the target exceeds 0.005,
# then remove the target itself so only features remain.
target_corr = corr['y']
kept_columns = target_corr[target_corr.abs() > 0.005].index

X_tran = data_tran[kept_columns].drop(columns='y')

X_tran

In [None]:
# Shuffle the rows and keep 75% for training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tran,
    y,
    train_size=0.75,
    shuffle=True,
    random_state=42,
)

# Testing the Models.


In [None]:
# Candidate classifiers, keyed by a left-padded display name so the printed
# scores line up in a column.
# The estimators that use randomness get a fixed seed so that re-running the
# notebook reproduces the same scores.
Models = {
    "                         Random Forest": RandomForestClassifier(random_state=42),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "                   Logistic Regression": LogisticRegression(),
    "                                   KNN": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=42)
}

In [None]:
# Models Evaluation: fit each classifier on the training split and print its
# accuracy on the test split as a percentage.

for label, clf in Models.items():
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)

    print(f"{label}: {accuracy * 100:1.2f}%")

Based on the scores above, Gradient Boosting appears to be the model with the best accuracy.