# E-Commerce Churn Prediction

# Introduction
Attracting a new customer can cost anywhere between 5 and 25 times more than retaining an existing one. Statistics show that increasing customer retention by 5% can grow a company’s profits by 25% to around 95% over a period of time. This notebook therefore builds a machine learning model for customer churn prediction.



<font color = 'blue'>
Content: 

1. [Load and Check Data](#1)
2. [Exploratory Data Analysis](#2)
    * [Analysis of Numerical and Categorical Variables](#3)
    * [Analysis of Numerical Variables by Target](#4)
    * [Analysis of Categorical Variables by Target](#5)
    * [Correlation](#6)
3. [Feature Engineering](#7)
    * [Missing Value](#8)
    * [Outliers](#9)
    * [Encoding](#10)
4. [Modelling](#11)
    * [Gradient Boosting](#12)
    * [Logistic Regression](#13)
    * [Random Forest](#14)
    * [Feature Selection](#15)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
import warnings
warnings.simplefilter(action="ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 170)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import os
# List every file available under the Kaggle input directory so the dataset
# path used below can be checked against what is actually mounted.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


<a id = "1"></a><br>
# Load and Check Data

In [None]:
df = pd.read_excel('/kaggle/input/ecommerce-customer-churn-analysis-and-prediction/E Commerce Dataset.xlsx', sheet_name='E Comm')

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    Prints, in order: the shape, the column dtypes, the first and last
    ``head`` rows, the per-column count of missing values, and a set of
    quantiles for the numeric columns.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of rows shown from the top and from the bottom.

    Returns:
        None. All information is written to standard output.
    """
    print("------------------- Shape -------------------")
    print(dataframe.shape)
    print("------------------- Types -------------------")
    print(dataframe.dtypes)
    print("------------------- Head --------------------")
    print(dataframe.head(head))
    print("------------------- Tail --------------------")
    print(dataframe.tail(head))
    print("------------------- NA ----------------------")
    print(dataframe.isnull().sum())
    print("------------------- Quantiles -------------------")
    # Quantiles are only meaningful for numeric data. Restricting the frame
    # first keeps object (string) columns from making DataFrame.quantile raise.
    numeric = dataframe.select_dtypes(include="number")
    if numeric.shape[1] > 0:
        print(numeric.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

check_df(df)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.describe(include='O') 

<a id = "2"></a><br>
# Exploratory Data Analysis

In [None]:
sns.countplot(x="Churn",data=df)
plt.show()

<a id = "3"></a><br>
## Analysis of Numerical and Categorical Variables

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of a DataFrame into categorical / numerical groups.

    Args:
        dataframe: The pandas DataFrame whose columns are classified.
        cat_th: A non-object column with fewer than ``cat_th`` distinct
            values is treated as categorical.
        car_th: An object column with more than ``car_th`` distinct values
            is treated as "categorical but cardinal" and kept out of
            ``cat_cols``.

    Returns:
        A tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists.
        A short summary of the group sizes is also printed.
    """
    all_columns = dataframe.columns

    # Partition by dtype first: object columns vs. everything else.
    object_cols = [name for name in all_columns if dataframe[name].dtypes == "O"]
    non_object_cols = [name for name in all_columns if dataframe[name].dtypes != "O"]

    # Low-cardinality non-object columns behave like categories;
    # high-cardinality object columns are set aside.
    num_but_cat = [name for name in non_object_cols if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
print(cat_cols)

In [None]:
print(num_cols)

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the frequency table of one categorical column.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the column to summarise.
        plot: When true, also draw a seaborn count plot of the column.

    Returns:
        None. The counts and their percentage of all rows are printed.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            "Ratio": 100 * counts / len(dataframe)})
    print(summary)
    print("##########################################")
    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show()


for col in cat_cols:
    cat_summary(df, col)

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of one numeric column.

    Args:
        dataframe: The pandas DataFrame holding the column.
        numerical_col: Name of the column to describe.
        plot: When true, also draw a 20-bin histogram of the column.

    Returns:
        None. The ``describe`` output, extended with a fixed list of
        percentiles, is printed.
    """
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    stats = column.describe(quantiles).T
    print(stats)

    if not plot:
        return

    column.hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show()


for col in num_cols:
    num_summary(df, col, plot=True)

In [None]:
df["Tenure"].value_counts().head()

<a id = "4"></a><br>
## Analysis of Numerical Variables by Target

In [None]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of a numeric column for each value of the target.

    Args:
        dataframe: The pandas DataFrame holding both columns.
        target: Name of the column to group by.
        numerical_col: Name of the numeric column that is averaged.
    """
    group_means = dataframe.groupby(target).agg({numerical_col: "mean"})
    print(group_means, end="\n\n\n")


for col in num_cols:
    target_summary_with_num(df, "Churn", col)

<a id = "5"></a><br>
## Analysis of Categorical Variables by Target

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print target mean, count and share for each level of a category.

    Args:
        dataframe: The pandas DataFrame holding both columns.
        target: Name of the target column that is averaged per level.
        categorical_col: Name of the categorical column to break down by.
    """
    level_counts = dataframe[categorical_col].value_counts()
    target_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": target_means,
                            "Count": level_counts,
                            "Ratio": 100 * level_counts / len(dataframe)})
    print(categorical_col)
    print(summary, end="\n\n\n")


for col in cat_cols:
    target_summary_with_cat(df, "Churn", col)

<a id = "6"></a><br>
## Correlation

In [None]:
# Pairwise correlations between the numeric features only. The matrix is
# computed once and reused for both the printed table and the heatmap.
# Calling df.corr() on the full frame is avoided on purpose: with object
# (string) columns still present it raises on pandas >= 2.0.
corr_matrix = df[num_cols].corr()
print(corr_matrix)

# Correlation Matrix
f, ax = plt.subplots(figsize=[18, 13])
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax, cmap="magma")
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

<a id = "7"></a><br>
# Feature Engineering

<a id = "8"></a><br>
## Missing Value

In [None]:
df.isnull().sum()

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values per column.

    Only columns that contain at least one missing value are listed,
    ordered from most to fewest missing entries.

    Args:
        dataframe: The pandas DataFrame to inspect.
        na_name: When true, return the names of the columns with missing
            values.

    Returns:
        The list of column names with missing values if ``na_name`` is
        true, otherwise None.
    """
    na_columns = [name for name in dataframe.columns if dataframe[name].isnull().sum() > 0]

    # Count once, then derive both the absolute and the relative view.
    na_counts = dataframe[na_columns].isnull().sum()
    n_miss = na_counts.sort_values(ascending=False)
    ratio = (na_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

na_columns = missing_values_table(df, na_name=True)

In [None]:
# Impute each column that has gaps with its own median. The result is
# assigned back to the column instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form works on an intermediate object,
# is deprecated in recent pandas, and leaves df untouched under Copy-on-Write.
median_imputed_cols = [
    "DaySinceLastOrder",
    "OrderAmountHikeFromlastYear",
    "Tenure",
    "OrderCount",
    "CouponUsed",
    "HourSpendOnApp",
    "WarehouseToHome",
]
for col in median_imputed_cols:
    df[col] = df[col].fillna(df[col].median())

In [None]:
df.isnull().sum()

<a id = "9"></a><br>
## Outliers

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    """Compute lower/upper outlier limits for a column.

    The limits are placed 1.5 times the inter-quantile range below the
    ``q1`` quantile and above the ``q3`` quantile.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the numeric column.
        q1: Lower quantile, as a fraction in [0, 1].
        q3: Upper quantile, as a fraction in [0, 1].

    Returns:
        A tuple ``(low_limit, up_limit)``.
    """
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name, q1=0.05, q3=0.95):
    """Report whether a column has any value outside its outlier limits.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the numeric column.
        q1: Lower quantile forwarded to ``outlier_thresholds``.
        q3: Upper quantile forwarded to ``outlier_thresholds``.

    Returns:
        True if at least one value lies below the lower or above the upper
        limit, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, q1=q1, q3=q3)
    column = dataframe[col_name]
    # Test the mask itself. Testing the truthiness of the selected rows would
    # miss outlier rows whose cell values all happen to be zero/False.
    is_outlier = (column > up_limit) | (column < low_limit)
    return bool(is_outlier.any())
    
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
def replace_with_thresholds(dataframe, variable, q1=0.05, q3=0.95):
    """Cap the values of a column at its outlier limits, in place.

    Values below the lower limit are raised to it and values above the
    upper limit are lowered to it. The limits come from
    ``outlier_thresholds``.

    Args:
        dataframe: The pandas DataFrame to modify; the column is replaced
            on this object.
        variable: Name of the numeric column to cap.
        q1: Lower quantile forwarded to ``outlier_thresholds``.
        q3: Upper quantile forwarded to ``outlier_thresholds``.

    Returns:
        None.
    """
    # Forward the caller's quantiles instead of silently using the defaults.
    low_limit, up_limit = outlier_thresholds(dataframe, variable, q1=q1, q3=q3)
    # clip() returns a column whose dtype can hold the (possibly fractional)
    # limits, so float limits are never written into an integer column.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)
    

In [None]:
for col in num_cols:
    replace_with_thresholds(df,col)

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

<a id = "10"></a><br>
## Encoding

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
def label_encoder(dataframe, binary_col):
    """Replace a column with its integer label encoding.

    Args:
        dataframe: The pandas DataFrame to modify; the column is
            overwritten on this object.
        binary_col: Name of the column to encode.

    Returns:
        The same DataFrame, with ``binary_col`` replaced by the output of
        a freshly fitted ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe


# Object-typed columns with exactly two distinct values are label-encoded.
binary_cols = [
    name for name in df.columns
    if df[name].dtypes == "O" and df[name].nunique() == 2
]
binary_cols

# Whatever remains in cat_cols (excluding the target) is left for one-hot
# encoding.
cat_cols = [
    name for name in cat_cols
    if not (name in binary_cols or name in ["Churn"])
]

for col in binary_cols:
    df = label_encoder(df, col)

In [None]:
# label_encoder is defined in the preceding cell, and cat_cols has already had
# the binary columns and the target removed there, so neither step is
# repeated here.
print(cat_cols)

# Encode Gender only if it is still an object column. If the binary-column
# loop in the preceding cell already converted it to integer codes, fitting
# another encoder on those codes would just reproduce them.
if df["Gender"].dtype == "O":
    df = label_encoder(df, "Gender")

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """One-hot encode the given columns with ``pandas.get_dummies``.

    Args:
        dataframe: The pandas DataFrame to encode.
        categorical_cols: Names of the columns to expand into dummies.
        drop_first: When true, drop the first level of every encoded
            column.

    Returns:
        A new DataFrame in which the listed columns are replaced by their
        dummy columns.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)


df = one_hot_encoder(df, cat_cols, drop_first=True)

In [None]:
print(df.shape)
df.head()

<a id = "11"></a><br>
# Modelling

<a id = "12"></a><br>
## Gradient Boosting

In [None]:
# Separate the target from the features.
y = df["Churn"]
X = df.drop(["Churn"], axis=1)

# Hold out 30% of the rows for evaluation; the split is reused by the later
# model cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

catboost_model = CatBoostClassifier(verbose=False, random_state=12345).fit(X_train, y_train)
y_pred = catboost_model.predict(X_test)
# Probability of the positive class, used for the ranking-based AUC metric.
y_proba = catboost_model.predict_proba(X_test)[:, 1]

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall and treats the predictions as ground truth for AUC.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 2)}")
print(f"Recall: {round(recall_score(y_test, y_pred),2)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 2)}")
print(f"F1: {round(f1_score(y_test, y_pred), 2)}")
print(f"Auc: {round(roc_auc_score(y_test, y_proba), 2)}")

<a id = "13"></a><br>
## Logistic Regression

In [None]:
# fit scaler on training data
# Learn the min/max of every feature from the training split only, so that no
# information from the test split enters the scaling.
norm = MinMaxScaler()
norm.fit(X_train)

# Apply the same fitted scaling to both splits.
X_train_norm = norm.transform(X_train)
X_test_norm = norm.transform(X_test)

In [None]:
lr = LogisticRegression(random_state=42)

print(type(lr))
# Train and evaluate on the min-max scaled matrices prepared in the previous
# cell; the unscaled X_train / X_test would leave that scaling step unused.
# cross_val_score fits its own clones of the estimator, so no fit is needed
# before it.
scores = cross_val_score(lr,X_train_norm,y_train,cv=5,scoring='roc_auc')
print ("CV score :",scores.mean())
lr.fit(X_train_norm, y_train)
pred = lr.predict(X_test_norm)
print(classification_report(y_test, pred))

<a id = "14"></a><br>
## Random Forest

In [None]:
rfc = RandomForestClassifier(random_state=42)

print(type(rfc))
# cross_val_score fits its own clones of the estimator, so the forest only
# needs to be trained once, right before predicting on the hold-out split.
scores = cross_val_score(rfc,X_train,y_train,cv=5, scoring='roc_auc')
print ("CV score :",scores.mean())
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))

<a id = "15"></a><br>
## Feature Selection

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names, aligned with ``importance``.
        model_type: Label placed at the start of the chart title.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    # Pair every feature name with its importance, most important first.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False)

    # Wide figure so that long feature names stay readable.
    plt.figure(figsize=(25, 10))
    # Seaborn bar chart: importance on x, one bar per feature on y.
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.show()


plot_feature_importance(catboost_model.get_feature_importance(), X.columns, 'CATBOOST')
