In [2]:
#import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [3]:
df=pd.read_csv("E_Commerce_Churn_cleaned.csv")

In [4]:
X = df.loc[:, df.columns != 'Churn']


In [5]:
X.head()

Unnamed: 0,CustomerID,PreferredLoginDevice,CityTier,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,Tenure_update,WarehouseToHome_update
0,50001,Phone,3,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,159.93,4.0,6.0
1,50002,Phone,1,UPI,Male,3.0,4,Mobile Phone,3,Single,7,1,15.0,0.0,1.0,0.0,120.9,,8.0
2,50003,Phone,1,Debit Card,Male,2.0,4,Mobile Phone,3,Single,6,1,14.0,0.0,1.0,3.0,120.28,,30.0
3,50004,Phone,3,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134.07,0.0,15.0
4,50005,Phone,1,CC,Male,,3,Mobile Phone,5,Single,3,0,11.0,1.0,1.0,3.0,129.6,0.0,12.0


In [6]:
Y = df['Churn']

In [7]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 10)
x_train.shape, x_test.shape

((3941, 19), (1689, 19))

<h3>Now we create some columtransformer for individual columns and after that we combine that all ColumnTransformer object into Machine Learning Pipeline.</h3>

In [8]:
#ColumnTransformer for one hot encoding 
num_features = df.select_dtypes(exclude="object").columns
cat_features = df.select_dtypes(include="object").columns

trf1=ColumnTransformer(transformers=[
    ('one_hot' , OneHotEncoder(sparse_output=False,handle_unknown="ignore" ) ,cat_features)
],remainder="passthrough")

In [9]:
trf1.fit_transform(x_train,y_train)

array([[  0.  ,   1.  ,   0.  , ..., 208.55,  13.  ,  10.  ],
       [  0.  ,   1.  ,   1.  , ..., 160.91,   2.  ,   8.  ],
       [  1.  ,   0.  ,   1.  , ..., 151.55,  12.  ,  14.  ],
       ...,
       [  0.  ,   1.  ,   1.  , ..., 144.04,   7.  ,  14.  ],
       [  0.  ,   1.  ,   0.  , ..., 148.4 ,   1.  ,  31.  ],
       [  0.  ,   1.  ,   1.  , ..., 186.33,  20.  ,  31.  ]])

In [10]:
#ColumnTransformer for handling Missing values

trf2=ColumnTransformer([
    ('KNNimputer' ,KNNImputer(n_neighbors=5),slice(32) )
    
],remainder='passthrough')

In [11]:
#ColumnTransformer for feature scalling

trf3=ColumnTransformer([
    ('scale' , StandardScaler() ,[] )
],remainder="passthrough")

In [12]:
#ColumnTransformer for feature selection


In [13]:
rf_classifier=RandomForestClassifier(random_state=10)
lr_classifier=LogisticRegression()
dt_classifier=DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf = 5)
Ab_classifier=AdaBoostClassifier(n_estimators=100)
sv_classifier=SVC()

In [14]:
pipeline_lr=Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",lr_classifier)
])

In [15]:
pipeline_rf=Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",rf_classifier)
])

In [16]:
pipeline_dt=Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",dt_classifier)
])

In [17]:
pipeline_Ab=Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",Ab_classifier)
])

In [18]:
pipeline_sv=Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",sv_classifier)
])

In [19]:
## LEts make the list of pipelines

pipelines = [pipeline_Ab,pipeline_dt,pipeline_lr,pipeline_rf,pipeline_sv]

In [20]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'AdaBoost', 1: 'Decision Tree', 2: 'Logistic Regression' , 3: 'RandomForest',4:'svc'}

# Fit the pipelines
for pipe in pipelines:
	pipe.fit(x_train, y_train)

In [21]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(x_test,y_test)))

AdaBoost Test Accuracy: 0.8851391355831854
Decision Tree Test Accuracy: 0.8691533451746596
Logistic Regression Test Accuracy: 0.8507992895204263
RandomForest Test Accuracy: 0.9473060982830077
svc Test Accuracy: 0.8176435760805211


In [22]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [23]:
for i,model in enumerate(pipelines):
    if model.score(x_test,y_test)>best_accuracy:
        best_accuracy=model.score(x_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with best accuracy  : {}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy  : RandomForest


In [24]:
for i,pipe in enumerate(pipelines ):
 cvscore=cross_val_score(pipe,x_train,y_train,cv=5,scoring="accuracy").mean()
 print (f"{pipe_dict[i]} cross val accuracy {cvscore}")

AdaBoost cross val accuracy 0.896726563857096
Decision Tree cross val accuracy 0.8759227448482626


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression cross val accuracy 0.8530849304845175
RandomForest cross val accuracy 0.9393545772133329
svc cross val accuracy 0.8376046270740447


<h3>Pipelines Perform Hyperparameter Tuning Using Grid SearchCV</h3>

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
# Create a pipeline
pipe = Pipeline([ 
     ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("classifier", RandomForestClassifier())
    ])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10,50,150,100,500, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(x_train,y_train)

50 fits failed out of a total of 3540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\High-Tech\Documents\AI\project_1\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\High-Tech\Documents\AI\project_1\venv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\High-Tech\Documents\AI\project_1\venv\lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\High-Tech\Documents\AI\project_1\venv\l