# **Import necessary libraries and Load data from database**

In [None]:
import logging

LOG_FILE = 'ecc_fe_log.log'
LOG_HANDLER_NAME = 'ecc_fe_file_handler'

# Root logger: records from every module (third-party DEBUG output included)
# propagate here and are written to LOG_FILE.
logger = logging.getLogger()

# Re-running this cell must not stack a second file handler on the root logger:
# that would write every message twice and leave the previous file handle open.
for stale_handler in [h for h in logger.handlers if h.get_name() == LOG_HANDLER_NAME]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

# mode='w' truncates the log file each time the handler is (re)created.
file_handler = logging.FileHandler(filename=LOG_FILE, mode='w')
file_handler.set_name(LOG_HANDLER_NAME)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

logger.setLevel(logging.DEBUG)
logger.info('FILE STARTS RUNNING!')

INFO:root:FILE STARTS RUNNING!


In [None]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scipy.stats as stats
from scipy.stats import chi2_contingency

from pymongo import MongoClient

warnings.filterwarnings('ignore')

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [None]:
# Progress marker, emitted through the root logger configured in the first cell.
logger.info('Libraries imported!')

INFO:root:Libraries imported!


In [None]:
# Connect to the MongoDB cluster.
# The connection string is taken from the MONGODB_URI environment variable so
# that real credentials never have to be written into the notebook. The fallback
# is the original placeholder URI, which must be filled in before it can work.
MONGODB_URI = os.environ.get(
    'MONGODB_URI',
    "mongodb+srv://<username>:<password>@mycluster.g3bp8fr.mongodb.net/?retryWrites=true&w=majority",
)
client = MongoClient(MONGODB_URI)

# The MongoClient constructor does not wait for a server connection; listing the
# databases is the first call that needs a server round-trip, so success is only
# logged once it has returned.
db_list = client.list_database_names()
logger.info('Database connected!')

# List of databases
db_list

INFO:root:Database connected!


['CHURN', 'loan_db', 'admin', 'local']

In [None]:
# Handle to the 'CHURN' database on the connected cluster.
db = client.CHURN

In [None]:
# Fetch every document of the 'ecomm_churn' collection. The projection
# {'_id': 0} asks the server to leave out MongoDB's '_id' field, so nothing has
# to be dropped afterwards (dropping '_id' from an empty result raises KeyError).
cursor = db.ecomm_churn.find({}, {'_id': 0})

df = pd.DataFrame(list(cursor))
df.head()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,50001,1,4.0,Mobile,3,6.0,DC,Female,3.0,3,Laptop,2,Single,9,1,11.0,1.0,1.0,5.0,159.93
1,50002,1,10.0,Mobile,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,120.9
2,50003,1,10.0,Mobile,1,30.0,DC,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120.28
3,50004,1,0.0,Mobile,3,15.0,DC,Male,2.0,4,Laptop,5,Single,8,0,23.0,0.0,1.0,3.0,134.07
4,50005,1,0.0,Mobile,1,12.0,CC,Male,3.0,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,129.6


In [None]:
# Progress marker: the collection has been read into `df`.
logger.info('Data loaded from database!')

INFO:root:Data loaded from database!


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(5630, 20)

# **Feature Engineering**

In [None]:
# Progress marker for the start of the feature-engineering section.
logger.info('Feature engineering starts!')

INFO:root:Feature engineering starts!


## **Outlier Handling**

Most of the outliers we observed during exploration of the data seem to represent natural variation in the population of the respective variable. Therefore, we leave them as they are, except for a few variables such as Tenure, DaySinceLastOrder, CashbackAmount and NumberOfAddress.

 We handle them one by one.

**Tenure**

According to the percentile distribution of 'Tenure', customers with Tenure > 21 have not churned, and the 99th percentile value is 31, which means the extreme values lie beyond that. So let's cap at the value 31, i.e., all values above 31 are set to 31.

In [None]:
# Cap Tenure at its 99th percentile. Series.clip is the vectorised form of the
# element-wise "cap if x > cap else x" rule and leaves NaN values untouched.
percentile = df.Tenure.quantile([0.99]).values
df['Tenure'] = df['Tenure'].clip(upper=percentile[0])

**DaySinceLastOrder**

Similarly for DaySinceLastOrder: according to the percentile distribution, the values after DaySinceLastOrder = 18 are extreme values [30, 31, 46]. The 99th percentile value is 15, so we cap at that value.

In [None]:
# Cap DaySinceLastOrder at its 99th percentile. Series.clip is the vectorised
# form of the element-wise "cap if x > cap else x" rule and leaves NaN untouched.
percentile = df.DaySinceLastOrder.quantile([0.99]).values
df['DaySinceLastOrder'] = df['DaySinceLastOrder'].clip(upper=percentile[0])

**CashbackAmount**
Here there are outliers both above the upper bound and below the lower bound, so we cap the values at the 1st and 99th percentiles.


In [None]:
# Cap CashbackAmount on both sides: values below the 1st percentile are raised to
# it and values above the 99th percentile are lowered to it. Series.clip does this
# in one vectorised call and leaves NaN values untouched.
percentile = df.CashbackAmount.quantile([0.01, 0.99]).values
df['CashbackAmount'] = df['CashbackAmount'].clip(lower=percentile[0], upper=percentile[1])

## **Feature selection using Statistical Test**

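We perform the following statistical tests to select the features that contribute most to our model: Welch's t-test for the numerical variables and the chi-square test of independence for the categorical variables, each against `Churn` at the 5% significance level.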

In [None]:
def num_stats(num_col, data=None, alpha=0.05):
  """Welch's t-test of a numerical column between the two Churn groups.

  The rows are split into Churn == 0 and Churn == 1, missing values of
  ``num_col`` are removed from each group, and the two samples are compared
  with ``scipy.stats.ttest_ind(equal_var=False)``. The p-value and the
  decision are printed.

  Parameters
  ----------
  num_col : str
      Name of the numerical column to test.
  data : pandas.DataFrame, optional
      Frame holding ``num_col`` and ``Churn``. Defaults to the notebook-level
      ``df`` so existing ``num_stats(col)`` calls keep working.
  alpha : float, optional
      Significance level below which the null hypothesis is rejected.

  Returns
  -------
  float
      The two-sided p-value of the test.
  """
  frame = df if data is None else data

  # Drop NaN first: with missing values present ttest_ind would return NaN, and
  # "NaN < alpha" is False, which would silently read as "do not reject".
  group_0 = frame.loc[frame['Churn'] == 0, num_col].dropna()
  group_1 = frame.loc[frame['Churn'] == 1, num_col].dropna()

  _, p_value = stats.ttest_ind(group_0, group_1, equal_var=False)

  print('P-value : ', p_value)
  if p_value < alpha:
    print('Reject null hypothesis')
  else:
    print('Do not reject null hypotheis')
  return p_value

In [None]:
def chisq_test(cat_col, df):
    """Chi-square test of independence between ``Churn`` and ``cat_col``.

    Cross-tabulates ``df['Churn']`` against ``df[cat_col]``, runs
    ``chi2_contingency`` on that table and prints the p-value followed by the
    decision at the 5% significance level. Nothing is returned.
    """
    contingency_table = pd.crosstab(index=df['Churn'], columns=df[cat_col])
    p_value = chi2_contingency(contingency_table)[1]

    # The p-value is reported for both outcomes; only the verdict line differs.
    print('P-Value :', p_value)
    if p_value < 0.05:
        verdict = 'Reject null hypothesis'
    else:
        verdict = 'Do not reject null hypotheis'
    print(verdict)
            

In [None]:
# Columns treated as numerical; each one is passed to num_stats below.
num_col_list = ['Tenure', 'WarehouseToHome', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
                'DaySinceLastOrder', 'CashbackAmount', 'HourSpendOnApp', 'NumberOfDeviceRegistered', 'NumberOfAddress']

In [None]:
# Run the t-test helper on every numerical column, printing the column name
# before each result and a separator line after it.
for col in num_col_list:
  print('Column Name : ', col)
  num_stats(col)
  print('---------------------------------------------')

Column Name :  Tenure
P-value :  2.008277595367716e-202
Reject null hypothesis
---------------------------------------------
Column Name :  WarehouseToHome
P-value :  7.60206018025374e-09
Reject null hypothesis
---------------------------------------------
Column Name :  OrderAmountHikeFromlastYear
P-value :  0.4237779148876236
Do not reject null hypotheis
---------------------------------------------
Column Name :  CouponUsed
P-value :  0.9159384616337447
Do not reject null hypotheis
---------------------------------------------
Column Name :  OrderCount
P-value :  0.06430615371620459
Do not reject null hypotheis
---------------------------------------------
Column Name :  DaySinceLastOrder
P-value :  3.1021436368382634e-39
Reject null hypothesis
---------------------------------------------
Column Name :  CashbackAmount
P-value :  3.03484589898161e-43
Reject null hypothesis
---------------------------------------------
Column Name :  HourSpendOnApp
P-value :  0.14300103213870466
Do n

In [None]:
# Columns treated as categorical: each one is passed to chisq_test below and the
# same list is later handed to pd.get_dummies for one-hot encoding.
cat_col_list = ['PreferredLoginDevice', 'CityTier', 'PreferredPaymentMode',
                'PreferedOrderCat', 'SatisfactionScore', 'MaritalStatus', 'Gender', 'Complain']

In [None]:
# Run the chi-square helper on every categorical column, printing the column
# name before each result and a separator line after it.
for col in cat_col_list:
  print('Column Name : ', col)
  chisq_test(col, df)
  print('---------------------------------------------')

Column Name :  PreferredLoginDevice
P-Value : 0.0001477040239947965
Reject null hypothesis
---------------------------------------------
Column Name :  CityTier
P-Value : 1.2612000812079956e-09
Reject null hypothesis
---------------------------------------------
Column Name :  PreferredPaymentMode
P-Value : 1.4978570960706217e-10
Reject null hypothesis
---------------------------------------------
Column Name :  PreferedOrderCat
P-Value : 3.11924340428766e-61
Reject null hypothesis
---------------------------------------------
Column Name :  SatisfactionScore
P-Value : 2.4233349782737515e-14
Reject null hypothesis
---------------------------------------------
Column Name :  MaritalStatus
P-Value : 1.073011277910542e-41
Reject null hypothesis
---------------------------------------------
Column Name :  Gender
P-Value : 0.030820940334890086
Reject null hypothesis
---------------------------------------------
Column Name :  Complain
P-Value : 2.6644609654641377e-78
Reject null hypothesis


### **Drop unwanted columns based on the test results**

In [None]:
# Drop the identifier column together with the numerical features that are being
# discarded after the t-tests. Re-assigning instead of inplace=True, combined with
# errors='ignore', keeps the cell safe to re-run once the columns are already
# gone; the redundant axis=1 is omitted because columns= already names the axis.
df = df.drop(
    columns=['CustomerID', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount'],
    errors='ignore',
)

## **Encoding Categorical Variables**

In [None]:
# One-hot encode every column in cat_col_list. The dtype is pinned to uint8 (the
# historical get_dummies default) so the indicator columns stay 0/1 on
# pandas >= 2.0 as well, where the default became bool.
df_encoded = pd.get_dummies(df, columns=cat_col_list, dtype=np.uint8)
df_encoded.head()

Unnamed: 0,Churn,Tenure,WarehouseToHome,NumberOfDeviceRegistered,NumberOfAddress,DaySinceLastOrder,CashbackAmount,PreferredLoginDevice_Computer,PreferredLoginDevice_Mobile,CityTier_1,...,SatisfactionScore_3,SatisfactionScore_4,SatisfactionScore_5,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Gender_Female,Gender_Male,Complain_0,Complain_1
0,1,4.0,6.0,3,9,5.0,159.93,0,1,0,...,0,0,0,0,0,1,1,0,0,1
1,1,10.0,8.0,4,7,0.0,120.9,0,1,1,...,1,0,0,0,0,1,0,1,0,1
2,1,10.0,30.0,4,6,3.0,120.28,0,1,1,...,1,0,0,0,0,1,0,1,0,1
3,1,0.0,15.0,4,8,3.0,134.07,0,1,0,...,0,0,1,0,0,1,0,1,1,0
4,1,0.0,12.0,3,3,3.0,129.6,0,1,1,...,0,0,1,0,0,1,0,1,1,0


In [None]:
# Progress marker for the end of the feature-engineering section.
logger.info('Feature Engineering ends!')

INFO:root:Feature Engineering ends!


# **Save the data in database**

In [None]:
# Handle to the 'CHURN' database, used below as the write target.
mydb = client.CHURN

In [None]:
# Collection 'ecomm_churn_encoded' in the CHURN database
mytb1 = mydb.ecomm_churn_encoded

# Convert df_encoded to a list with one dict per row
data = df_encoded.to_dict(orient='records')

# Replace the contents of ecomm_churn_encoded instead of appending to them: with
# a bare insert_many, every re-run of this cell would store another full copy of
# the rows.
mytb1.delete_many({})
mytb1.insert_many(data)

<pymongo.results.InsertManyResult at 0x7f68ac9a94f0>

In [None]:
# Final progress markers: the encoded data was written and the notebook is done.
logger.info('Modified data saved in database!')
logger.info('FILE ENDS RUNNING!')

INFO:root:Modified data saved in database!
INFO:root:FILE ENDS RUNNING!
