# Case Study: Telecom Churn

#### Problem Statement:
In the telecom industry, customers are able to choose from multiple service providers and actively switch from one operator to another. In this highly competitive market, the telecommunications industry experiences an average annual churn rate of 15-25%. Given that it costs 5-10 times more to acquire a new customer than to retain an existing one, customer retention has now become even more important than customer acquisition. To reduce customer churn, telecom companies need to predict which customers are at high risk of churn.

#### Business Objective:
In this project, we will analyse customer-level data of a leading telecom firm, build predictive models to identify customers at high risk of churn and identify the main indicators of churn.

In [None]:
# Suppressing the warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
%matplotlib inline

# Styling the plot
style.use('ggplot')
sns.set_style('whitegrid')

import math
import itertools

import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import StandardScaler, scale
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_curve, classification_report
from sklearn.metrics import recall_score, precision_score, roc_auc_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

import xgboost
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance

import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# Adjusting the views
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

In [None]:
# Defining some customized functions which we will use often during this case study

def get_variable_type(element):
    """Classify a column by its number of distinct values.

    Returns "Not Known" for a count of 0, "Categorical" for any other
    count below 20 and "Contineous" for a count of 20 or more. A value
    that compares false against every bound (e.g. NaN) falls through and
    yields None.
    """
    if element == 0:
        return "Not Known"
    # From here on element is known to be non-zero.
    if element < 20:
        return "Categorical"
    if element >= 20:
        return "Contineous"
    return None
    
def get_info(dataframe):
    """Build a per-column summary of ``dataframe``.

    The result is indexed by column name and holds the dtype (as a string),
    the non-null count, the null count, the null percentage and the number
    of unique values. It is then passed through ``predict_variable_type``,
    which appends the ``Variable_Type`` and ``frequency`` columns.
    """
    null_count = dataframe.isnull().sum()
    summary = pd.DataFrame({
        'Datatype': dataframe.dtypes.astype(str),
        'Non_Null_Count': dataframe.count(axis=0).astype(int),
        'Null_Count': null_count.astype(int),
        'Null_Percentage': null_count / len(dataframe) * 100,
        'Unique_Values_Count': dataframe.nunique().astype(int),
    })
    return predict_variable_type(summary)
            
def predict_variable_type(infodata_matrix):
    """Add ``Variable_Type`` and ``frequency`` columns to a column summary.

    ``Variable_Type`` is derived from ``Unique_Values_Count`` through
    ``get_variable_type`` and stored as a string. ``frequency`` is a
    constant integer 0; it only serves as a column that can be counted
    per group (``plot_data_type_pie_chat`` counts it per datatype).

    The frame is modified in place and also returned.
    """
    infodata_matrix["Variable_Type"] = (
        infodata_matrix["Unique_Values_Count"].apply(get_variable_type).astype(str)
    )
    # Assign the integer placeholder directly. Series.astype returns a new
    # object, so calling it without storing the result has no effect, and
    # "Null_Count - Null_Count" is just a roundabout (NaN-prone) zero.
    infodata_matrix["frequency"] = 0
    return infodata_matrix


def list_potential_categorical_type(dataframe, main):
    """Display the columns flagged as categorical with their unique values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Column summary indexed by column name; must provide the
        ``Variable_Type``, ``Datatype`` and ``Unique_Values_Count`` columns
        (the shape produced by ``get_info``).
    main : pandas.DataFrame
        The data whose columns ``dataframe`` describes; used to look up the
        actual unique values of each categorical column.

    Returns
    -------
    None
        The table is shown with ``display``; if nothing is flagged as
        categorical a message is printed instead.
    """
    print('Stats for potential categorical datatype columns')
    categorical = dataframe[dataframe["Variable_Type"] == "Categorical"]
    if len(categorical) == 0:
        # Plain print, matching get_potential_categorical_type: the
        # header_red helper used here before is not defined in this notebook.
        print("No Categorical columns in given dataset.")
        return

    # sort_values returns a new frame, which avoids sorting a filtered
    # selection in place.
    categorical = (
        categorical.filter(["Datatype", "Unique_Values_Count"])
        .sort_values(["Unique_Values_Count"], axis=0, ascending=False)
    )
    col_to_check = categorical.index.tolist()
    unique_values = pd.DataFrame({
        "index": col_to_check,
        "Unique_Values": [main[name].unique() for name in col_to_check],
    })
    # reset_index() exposes the column names as an "index" column to join on.
    categorical = pd.merge(categorical.reset_index(), unique_values,
                           how='inner', on='index')
    display(categorical.set_index("index"))

def plot_data_type_pie_chat(dataframe) : 
        """Draw a pie chart of the number of columns per datatype.

        ``dataframe`` must provide the ``Datatype`` and ``frequency`` columns;
        rows are grouped by ``Datatype`` and the ``frequency`` entries are
        counted per group.

        NOTE(review): ``go`` and ``py`` are not imported anywhere in the visible
        notebook, so calling this raises NameError as it stands. They are
        presumably ``plotly.graph_objs`` and ``plotly`` -- confirm and add the
        imports before using this function.
        """
        print('Stats for datatype Percentage Distribution')
        # One row per datatype; "frequency" becomes the number of columns of that type.
        dataframe_group = dataframe.groupby("Datatype").frequency.count().reset_index()
        dataframe_group.sort_values(["Datatype"], axis=0,ascending=False, inplace=True)
        trace = go.Pie(labels=dataframe_group["Datatype"].tolist(), values=dataframe_group["frequency"].tolist())
        layout = go.Layout(title="Datatype Percentage Distribution")
        fig = go.Figure(data=[trace], layout=layout)    
        py.offline.iplot(fig)
    
def get_potential_categorical_type(dataframe, main, unique_count):
    """Display the categorical columns having exactly ``unique_count`` values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Column summary indexed by column name; must provide the
        ``Variable_Type``, ``Datatype`` and ``Unique_Values_Count`` columns.
    main : pandas.DataFrame
        The data whose columns ``dataframe`` describes.
    unique_count : int
        Required number of distinct values.

    Returns
    -------
    None
        The table is shown with ``display``; a message is printed when no
        column matches.
    """
    # Both conditions must hold. Previously the "Categorical" selection was
    # overwritten by the unique-count selection and therefore never applied.
    is_match = (
        (dataframe["Variable_Type"] == "Categorical")
        & (dataframe["Unique_Values_Count"] == unique_count)
    )
    selected = dataframe[is_match]
    if len(selected) == 0:
        print("No Categorical columns in given dataset.")
        return

    selected = (
        selected.filter(["Datatype", "Unique_Values_Count"])
        .sort_values(["Unique_Values_Count"], axis=0, ascending=False)
    )
    col_to_check = selected.index.tolist()
    unique_values = pd.DataFrame({
        "index": col_to_check,
        "Unique_Values": [main[name].unique() for name in col_to_check],
    })
    # reset_index() exposes the column names as an "index" column to join on.
    selected = pd.merge(selected.reset_index(), unique_values,
                        how='inner', on='index')
    display(selected.set_index("index"))

def heatmap(x, y, dataframe):
    """Show an annotated correlation heatmap of ``dataframe``.

    ``x`` and ``y`` are forwarded to matplotlib as the figure size.
    """
    figure_size = (x, y)
    plt.figure(figsize=figure_size)
    correlations = dataframe.corr()
    sns.heatmap(correlations, annot=True, cmap="YlGnBu")
    plt.show()

# 1. Importing the Dataset

In [None]:
# Importing the dataset
telecom = pd.read_csv("../input/telecom-churn/telecom_churn_data.csv")
telecom.head()

In [None]:
# Shape
telecom.shape

In churn prediction, we assume that there are three phases of customer lifecycle:
- The 'Good' Phase
- The 'Action' Phase
- The 'Churn' Phase

# 2. Inspecting the DataFrame

In [None]:
# Columns containing recharge amount
amt_recharge_columns =  telecom.columns[telecom.columns.str.contains('rech_amt|rech_data')]
print(amt_recharge_columns)

`Inference:` Meaningful features:
- Total Recharge Amount
- Total Recharge for Data
- Maximum Recharge Amount
- Average Recharge Amount

In [None]:
# Missing Values Percentage
telecom_df_null_check = 100*telecom.loc[:,amt_recharge_columns].isnull().sum()/len(telecom)
df = pd.DataFrame(telecom_df_null_check)
df.rename(columns={0:'Null_Percentage'}, inplace=True)
df = pd.DataFrame(df.Null_Percentage)
display(df)

`Inference:`
- We can see that there are columns with more than 70% missing values.

In [None]:
telecom.loc[:,amt_recharge_columns].describe()

`Inference:`
- As the minimum value is 1, we can impute NA values by 0, considering no recharge was done.

In [None]:
# Replacing Null values with 0 (a missing count means no data recharge was done).
# fillna is used instead of replace(np.NaN, ...): the np.NaN alias was removed
# in NumPy 2.0.
total_rech_data_cols = ['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8']
telecom[total_rech_data_cols] = telecom[total_rech_data_cols].fillna(0.0)

In [None]:
# Replacing Null values with 0 (a missing average means no data recharge was done).
# fillna is used instead of replace(np.NaN, ...): the np.NaN alias was removed
# in NumPy 2.0.
av_rech_amt_data_cols = ['av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8']
telecom[av_rech_amt_data_cols] = telecom[av_rech_amt_data_cols].fillna(0.0)

In [None]:
# 'total_rech_amt_data' for calculating High Value customer process
telecom['total_rech_amt_data_6'] = telecom.av_rech_amt_data_6 * telecom.total_rech_data_6
telecom['total_rech_amt_data_7'] = telecom.av_rech_amt_data_7 * telecom.total_rech_data_7
telecom['total_rech_amt_data_8'] = telecom.av_rech_amt_data_8 * telecom.total_rech_data_8

In [None]:
# Average recharge amount for the Good Phase
telecom['total_avg_rech_amt_6_7_GPhase'] = (telecom.total_rech_amt_6 + telecom.total_rech_amt_data_6 \
                                               + telecom.total_rech_amt_7+ telecom.total_rech_amt_data_7)/2

# 3. Filtering High Value Customers
People who have done recharge of amount more than the 70th percentile of average recharge amount.

In [None]:
# create a filter for values greater than 70th percentile of total average recharge amount for good phase 
high_value_filter = telecom.total_avg_rech_amt_6_7_GPhase.quantile(0.7)
print('70th Percentile:', high_value_filter)

# .copy() makes this an independent frame: it is modified in place below
# (new 'churn' column, dropped columns), which on a plain slice of `telecom`
# triggers SettingWithCopyWarning and may not behave as intended.
telecom_high_val_cust = telecom[telecom.total_avg_rech_amt_6_7_GPhase > high_value_filter].copy()
print('Shape of Telecom High Value Customers:', telecom_high_val_cust.shape)

### Tag Churners and removal of attributes of churn phase

- Now tagging the churned customers (churn=1, else 0) based on the fourth month as follows: Those who have not made any calls and have not used mobile internet even once in the churn phase.
- The attributes we need to use to tag churners are:total_ic_mou_9, total_og_mou_9, vol_2g_mb_9, vol_3g_mb_9
- After tagging churners, remove all the attributes corresponding to the churn phase (all attributes having `_9` in their names, e.g. `total_ic_mou_9`).

In [None]:
high_val_cust_9 = ['total_ic_mou_9', 'total_og_mou_9', 'vol_2g_mb_9','vol_3g_mb_9']

In [None]:
# Missing Value Percentage
df = 100*telecom_high_val_cust.loc[:,high_val_cust_9].isnull().sum()/len(telecom_high_val_cust)
df = pd.DataFrame(df)
df.rename(columns={0:'Null_Percentage'}, inplace=True)
df = pd.DataFrame(df.Null_Percentage)
display(df)

In [None]:
# Initialising 'churn' variable as 0
telecom_high_val_cust['churn']= 0

In [None]:
# Using is_churned boolean flag for customers where above values are zero
is_churned = (telecom_high_val_cust.total_ic_mou_9 == 0) & \
             (telecom_high_val_cust.total_og_mou_9 == 0) & \
             (telecom_high_val_cust.vol_2g_mb_9 ==0) & \
             (telecom_high_val_cust.vol_3g_mb_9 ==0)

In [None]:
# Turning all the churned from 0 to 1
telecom_high_val_cust.loc[is_churned,'churn']=1

In [None]:
# Checking percentage of churned customers
print('Churn Percentage:', round((telecom_high_val_cust.churn.sum()/len(telecom_high_val_cust))*100, 2))

In [None]:
# Removing attributes containing '_9'
churn_month_columns =  telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('_9')]

telecom_high_val_cust.drop(churn_month_columns,axis=1,inplace=True)

# 4. Exploratory Data Analysis

In [None]:
list_potential_categorical_type(get_info(telecom_high_val_cust),telecom_high_val_cust)

In [None]:
# Dropping columns with one unique value
drop_col_with_unique_col =['circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 
                          'last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8', 
                          'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 
                          'std_og_t2c_mou_8',  'std_ic_t2o_mou_6', 
                          'std_ic_t2o_mou_7', 'std_ic_t2o_mou_8']

telecom_high_val_cust.drop(drop_col_with_unique_col,axis=1,inplace=True)

print('Shape:', telecom_high_val_cust.shape)

In [None]:
info_df = get_info(telecom_high_val_cust)

In [None]:
# Date Columns
info_df_count = info_df[info_df["Datatype"] == 'object']
date_col = info_df_count.index.tolist()
date_col

In [None]:
telecom_high_val_cust[date_col].head(5)

In [None]:
for col in date_col:
    telecom_high_val_cust[col] = pd.to_datetime(telecom_high_val_cust[col])

In [None]:
telecom_high_val_cust.head()

In [None]:
list_potential_categorical_type(get_info(telecom_high_val_cust),telecom_high_val_cust)

In [None]:
# Defining function for boxplots for months 6-8
def plot_box_chart(attribute):
    """Draw churn-split boxplots of ``attribute`` for months 6, 7 and 8.

    Reads the module-level ``telecom_high_val_cust`` frame and plots the
    columns ``attribute + "_6"``, ``"_7"`` and ``"_8"`` side by side,
    grouped by the ``churn`` column, with outliers hidden.
    """
    plt.figure(figsize=(20, 16))
    frame = telecom_high_val_cust
    # One subplot per month, filling the first row of a 2x3 grid.
    for slot, suffix in enumerate(("_6", "_7", "_8"), start=1):
        plt.subplot(2, 3, slot)
        sns.boxplot(data=frame, y=attribute + suffix, x="churn", hue="churn",
                    showfliers=False, palette=("plasma"))
    plt.show()

In [None]:
# Recharge Amount Columns
recharge_amnt_columns =  telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('rech_amt')]
recharge_amnt_columns.tolist()

In [None]:
# total recharge amount:
plot_box_chart('total_rech_amt')

`Inference:`
- We can see a drop of Recharge amount for the churned customers in the 8th month

In [None]:
# total recharge amount for data:
plot_box_chart('total_rech_amt_data')

`Inference:`
- We can see a drop in the recharge amount for data for the churned customers in the 8th month.

In [None]:
# maximum recharge amount:
plot_box_chart('max_rech_amt')

`Inference:`
- We can see a drop in the maximum recharge amount for the churned customers in the 8th month.

In [None]:
# Checking other recharge related variables:
other_recharge = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('rech_num|max_rech_data',regex=True)]

In [None]:
# Null Value Percentage
(telecom_high_val_cust.loc[:,other_recharge].isnull().sum()/len(telecom_high_val_cust))*100

In [None]:
telecom_high_val_cust.loc[:,['max_rech_data_6','max_rech_data_7','max_rech_data_8']].describe()

`Inference:`
- Minimum recharge value is 1. So, we can fill the missing values with 0 indicating that recharge didn't happen.

In [None]:
# Replacing NA's with 0
telecom_high_val_cust.loc[:,['max_rech_data_6','max_rech_data_7','max_rech_data_8']] \
= telecom_high_val_cust.loc[:,['max_rech_data_6','max_rech_data_7','max_rech_data_8']].replace(np.nan,0)

In [None]:
# Total recharge for Number:
plot_box_chart('total_rech_num')

`Inference:`
- We can see a drop in the total number of recharges for the churned customers in the 8th month.

In [None]:
# maximum recharge for data:
plot_box_chart('max_rech_data')

`Inference:`
- We can see a drop in the maximum data recharge for the churned customers in the 8th month.

In [None]:
# last day recharge amount columns
last_day_rech_amt = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('day')]
last_day_rech_amt.tolist()

In [None]:
# last day recharge amount:
plot_box_chart('last_day_rch_amt')

`Inference:`
- We can see a huge drop in the 8th month

In [None]:
usage_2g_and_3g = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('2g|3g',regex=True)]

In [None]:
# Missing Value Percentage
percentage_3g_2g_null_check = 100*telecom_high_val_cust.loc[:,usage_2g_and_3g].isnull().sum()/len(telecom_high_val_cust.loc[:,usage_2g_and_3g])
df = pd.DataFrame(percentage_3g_2g_null_check)
df.rename(columns={0:'Null_Percentage'}, inplace=True)
df = pd.DataFrame(df.Null_Percentage)
display(df)

`Inference:` Dropping the columns with more than 40% missing values

In [None]:
# Dropping
telecom_high_val_cust.drop(['count_rech_2g_6','count_rech_2g_7',
              'count_rech_2g_8','count_rech_3g_6','count_rech_3g_7','count_rech_3g_8'
               ,'arpu_3g_6','arpu_3g_7','arpu_3g_8','arpu_2g_6','arpu_2g_7','arpu_2g_8'],axis=1,inplace=True)

In [None]:
# 2G usage
plot_box_chart('vol_2g_mb')

In [None]:
# 3G usage
plot_box_chart('vol_3g_mb')

`Inference:`
- 2G and 3G usage for churned customers drops in the 8th month.

In [None]:
# 2G/3G service schemes
monthly_subcription_2g_3g = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('monthly_2g|monthly_3g',regex=True)]
monthly_subcription_2g_3g.tolist()

In [None]:
telecom_high_val_cust[monthly_subcription_2g_3g].info()

In [None]:
# Monthly subscription :
plot_box_chart('monthly_2g')

In [None]:
# Plotting mean bar chart since, boxplot is not suitable here.
def plot_mean_bar_chart(df, columns_list):
    """Plot and return the column means of churned vs. non-churned rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``churn`` column (0 = non-churn, 1 = churn).
    columns_list : list-like of str
        Columns whose means are compared.

    Returns
    -------
    pandas.DataFrame
        Two rows labelled 'Non Churn' and 'Churn', one column per entry of
        ``columns_list``. The same data is shown as a grouped bar chart.
    """
    df_0 = df[df.churn == 0].filter(columns_list)
    df_1 = df[df.churn == 1].filter(columns_list)

    # Row labels are given as lists: a set is unordered and is rejected as a
    # DataFrame index by pandas 2.0+.
    mean_df_0 = pd.DataFrame([df_0.mean()], index=['Non Churn'])
    mean_df_1 = pd.DataFrame([df_1.mean()], index=['Churn'])

    mean_bar = pd.concat([mean_df_0, mean_df_1])

    # Transpose so each feature is a bar group with one bar per churn class.
    mean_bar.T.plot.bar(figsize=(10, 5), rot=0)
    plt.show()

    return mean_bar

In [None]:
plot_mean_bar_chart(telecom_high_val_cust, monthly_subcription_2g_3g)

`Inference:`
- We can see a drop in monthly subscription for churned customers in 8th month.

In [None]:
# Volume based cost 
vbc_column = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('vbc_',regex=True)]
vbc_column.tolist()

In [None]:
# Renaming month named vbc columns to 6,7,8,9 format
telecom_high_val_cust.rename(columns={'jun_vbc_3g':'vbc_3g_6','jul_vbc_3g':'vbc_3g_7','aug_vbc_3g':'vbc_3g_8'
                         ,'sep_vbc_3g':'vbc_3g_9'}, inplace=True)

# Drop 9th month column as it is not needed
telecom_high_val_cust.drop('vbc_3g_9',axis=1,inplace=True)

In [None]:
vbc_column = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('vbc_3g',regex=True)]
vbc_column.tolist()

In [None]:
# Volume based cost :
plot_box_chart('vbc_3g')

In [None]:
plot_mean_bar_chart(telecom_high_val_cust, vbc_column)

`Inference:`
- We can see that volume based cost for 3G is much lower for Churned customers as compared to Non-Churn Customers

In [None]:
# Checking Service schemes with validity smaller than a month for 2G/3G
SC_2g_or_3g_col = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('sachet_2g|sachet_3g',regex=True)]

In [None]:
plot_mean_bar_chart(telecom_high_val_cust, SC_2g_or_3g_col)

`Inference:` We can see the drop in sachet services for churned customers in 8th month.

In [None]:
# Checking columns for average revenue per user
arpu_cols = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('arpu_')]

# Plotting arpu
plot_box_chart('arpu')

In [None]:
plot_mean_bar_chart(telecom_high_val_cust, arpu_cols)

`Inference:`
- We can see huge drops for Arpu in 8th month

In [None]:
# Minutes of Usage
mou_cols = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('mou')]
mou_cols

In [None]:
info_df = get_info(telecom_high_val_cust[mou_cols])
info_df.sort_values(["Null_Percentage"], axis=0,ascending=False, inplace=True)
col_to_display = ['Null_Percentage']
info_df[col_to_display]

`Inference:`
- Missing value in mou columns is more than 3%.

In [None]:
# Replacing null values by 0 for mou variables.
# fillna is used instead of replace(np.NaN, ...): the np.NaN alias was removed
# in NumPy 2.0.
telecom_high_val_cust.loc[:,mou_cols] = telecom_high_val_cust.loc[:,mou_cols].fillna(0)

In [None]:
mou_og_cols6 = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('.*_og_.*mou_6',regex=True)]

In [None]:
telecom_high_val_cust.loc[:,mou_og_cols6].describe()

In [None]:
# Correlation matrix
heatmap(18,12,telecom_high_val_cust.filter(mou_og_cols6))

`Inference:`
- We can see that total_og_mou_6, std_og_mou_6 and loc_og_mou_6 have high correlations with some variables and they should be inspected.

In [None]:
# Dropping some columns
list_total_og_cols = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('total_og_mou|std_og_mou|loc_og_mou',regex=True)]
telecom_high_val_cust.drop(list_total_og_cols,axis=1,inplace=True)
print('Columns Dropped:')
list_total_og_cols.tolist()

In [None]:
# Checking incoming mou variables
mou_ic_cols6 = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('.*_ic_.*mou_6',regex=True)]
telecom_high_val_cust.loc[:,mou_ic_cols6].describe()

In [None]:
# Correlation matrix for mou 6th month
heatmap(18,12,telecom_high_val_cust.filter(mou_ic_cols6))

`Inference:`
- We can see that total_ic_mou_6, std_ic_mou_6 and loc_ic_mou_6 have high correlations with some variables and they should be inspected.

In [None]:
# Dropping columns ic_mou since they are a combination of other variables present in the data.
list_total_ic_cols = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('total_ic_mou|std_ic_mou|loc_ic_mou',regex=True)]
telecom_high_val_cust.drop(list_total_ic_cols,axis=1,inplace=True)
print('Columns Dropped:')
list_total_ic_cols.tolist()

In [None]:
# Offnet Usage
offnet_usage_service_col = telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('offnet.*mou',regex=True)]

In [None]:
# Offnet mou for churned v/s non-churned
plot_box_chart('offnet_mou')

In [None]:
plot_mean_bar_chart(telecom_high_val_cust, offnet_usage_service_col)

`Inference:` We can see the drop in the 8th month.

In [None]:
# Onnet
onnet_usage_service =  telecom_high_val_cust.columns[telecom_high_val_cust.columns.str.contains('^onnet.*mou',regex=True)]
onnet_usage_service

In [None]:
# Onnet
plot_box_chart('onnet_mou')

In [None]:
plot_mean_bar_chart(telecom_high_val_cust, onnet_usage_service)

In [None]:
# Tenure Analysis
tenure_data = telecom_high_val_cust.copy()

In [None]:
plt.figure(figsize=(14,8))
# Age on network: aon - number of days the customer is using the operator T network
tenure_data['tenure'] = tenure_data['aon']/30
tenure_data['tenure'].head()
ax = sns.distplot(tenure_data['tenure'], hist=True, kde=False, 
             bins=int(180/5), color = 'red', 
             hist_kws={'edgecolor':'yellow'},
             kde_kws={'linewidth': 4})
ax.set_ylabel('Number of Customers')
ax.set_xlabel('Tenure in Months')
ax.set_title('Customers Vs Tenure')
plt.show()

In [None]:
# Binning
# The last edge is open-ended (np.inf): with an upper edge of 61 every tenure
# above 61 months fell outside all bins and became NaN instead of
# '5 Yrs and above'.
tn_range = [0, 6, 12, 24, 60, np.inf]
tn_label = ['0-6 Months', '6-12 Months', '1-2 Yrs', '2-5 Yrs', '5 Yrs and above']
tenure_data['tenure_range'] = pd.cut(tenure_data['tenure'], tn_range, labels=tn_label)
tenure_data['tenure_range'].head()

In [None]:
plt.figure(figsize=(14,8))
sns.countplot(x = 'tenure_range', hue = 'churn',data = tenure_data,palette=("gist_rainbow"))
plt.show()

In [None]:
# Checking columns with more than 30% missing values
info_df = get_info(telecom_high_val_cust)
info_df = info_df[info_df["Null_Percentage"] > 30]
info_df.sort_values(["Null_Percentage"], axis=0,ascending=False, inplace=True)
col_to_display = ['Null_Percentage']
info_df[col_to_display]

In [None]:
# Dropping columns with more than 30% missing values
# info_df is indexed by column name, so its index is the list of labels to drop.
# (Passing the sub-DataFrame itself only worked because iterating a frame
# yields its column names.)
is_more_30 = info_df.index
telecom_high_val_cust.drop(columns=is_more_30, inplace=True)
print('Shape:', telecom_high_val_cust.shape)

In [None]:
# Missing Value Percentage
info_df = get_info(telecom_high_val_cust)
info_df = info_df[info_df["Null_Percentage"] > 0]
info_df.sort_values(["Null_Percentage"], axis=0,ascending=False, inplace=True)
col_to_display = ['Null_Percentage']
info_df[col_to_display]

In [None]:
# Exploring Numeric Fields
num_col = ['og_others_8', 'ic_others_8', 'og_others_6','ic_others_6', 'og_others_7', 'ic_others_7']

In [None]:
# Replacing Missing Value with 0
for i in num_col:
    telecom_high_val_cust.loc[telecom_high_val_cust[i].isnull(),i]=0

In [None]:
# Missing Value Percentage
info_df = get_info(telecom_high_val_cust)
info_df = info_df[info_df["Null_Percentage"] > 0]
info_df.sort_values(["Null_Percentage"], axis=0,ascending=False, inplace=True)
col_to_display = ['Null_Percentage']
info_df[col_to_display]

# 5. Feature Engineering

In [None]:
telecom_df_final = telecom_high_val_cust.copy()
print('Shape:', telecom_df_final.shape)

In [None]:
# The helper column used for the high-value filter and the date columns are
# not used as model features.
telecom_df_final.drop(["total_avg_rech_amt_6_7_GPhase"],axis=1,inplace=True)
telecom_df_final.drop(telecom_df_final.filter(regex='date_').columns,axis=1,inplace=True)
print (telecom_df_final.shape)

# Base names of the month-6 / month-7 features. The pattern is anchored to the
# end of the name so that only a real "_6" / "_7" suffix is stripped by [:-2].
col_list = telecom_df_final.filter(regex='_[67]$').columns.str[:-2]
for col in col_list.unique():
    print(col)
    avg_col_name = "avg_"+col+"_av67"
    col_6 = col+"_6"
    col_7 = col+"_7"
    # Good-phase average of the two months
    telecom_df_final[avg_col_name] = (telecom_df_final[col_6]  + telecom_df_final[col_7])/ 2

In [None]:
# Drop the individual month-6 / month-7 columns now that their averages exist.
# Labels are taken from .columns (not the filtered DataFrame itself) and the
# pattern is anchored so that only names ending in _6 / _7 are removed.
col_list_to_drop = telecom_df_final.filter(regex='_[67]$').columns
telecom_df_final.drop(columns=col_list_to_drop, inplace=True)
print ('Shape:', telecom_df_final.shape)

`Inference:`
- We have merged most of the columns of 6th and 7th month since they have similar performance and also to reduce the number of features.

# 6. Modelling

## 6.1. PCA: Principal Component Analysis

In [None]:
telecom_pca_df = telecom_df_final.copy()

In [None]:
# Defining some customised functions

def scale_data(X):
    """Standardise every column of ``X`` with a freshly fitted StandardScaler.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature frame.

    Returns
    -------
    pandas.DataFrame
        The scaled values with the same column names and the same index as
        ``X``.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Keep X's index: fit_transform returns a bare array, and rebuilding the
    # frame without the index would leave it on a fresh 0..n-1 index that no
    # longer aligns by label with the target taken from the same rows.
    return pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

def get_scree_plot(X):
    """Fit a PCA on ``X``, plot the cumulative explained variance, return the PCA.

    The PCA uses the randomized SVD solver with ``random_state=101`` and keeps
    the default number of components.
    """
    fitted_pca = PCA(svd_solver='randomized', random_state=101)
    fitted_pca.fit(X)

    plt.figure(figsize=(8, 6))
    cumulative_variance = np.cumsum(fitted_pca.explained_variance_ratio_)
    plt.plot(cumulative_variance)
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()
    return fitted_pca
    
    
def resampling_data(X, Y, feature_number, train_size=0.7, test_size=0.3):
    """Split the data, oversample the training part with SMOTE and apply PCA.

    Parameters
    ----------
    X, Y
        Features and binary target (``Y.sum()`` is used as the churn count).
    feature_number : int
        Number of principal components to keep.
    train_size, test_size : float
        Forwarded to ``train_test_split`` (``random_state=1``).

    Returns
    -------
    tuple
        ``(x_tr_pca, x_test_pca, y_tr, y_test)``: the SMOTE-resampled and
        PCA-projected training features, the PCA-projected test features,
        the resampled training target and the untouched test target.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, train_size=train_size, random_state=1)
    print('Shape of train dataset before PCA:', str(x_train.shape))
    print('Percentage of Churned customer data:', str(round(100*Y.sum()/len(Y), 2)))
    print('Aplying SMOTE to normalize imbalace.')

    # SMOTE (Synthetic Minority Over-sampling Technique) is applied to the
    # training split only, so the test split keeps its original distribution.
    # The `kind` argument and `fit_sample` no longer exist in current
    # imbalanced-learn; plain SMOTE() with fit_resample is the equivalent.
    smote = SMOTE()
    x_tr, y_tr = smote.fit_resample(x_train, y_train)
    print('Shape of train dataset after SMOTE: ', str(x_tr.shape))

    # PCA is fitted on the resampled training data and reused for the test data.
    pca = IncrementalPCA(n_components=feature_number)
    x_tr_pca = pca.fit_transform(x_tr)
    x_test_pca = pca.transform(x_test)

    # Report the shape of the projected data (the pre-SMOTE, pre-PCA
    # x_train shape was printed here before).
    print('Shape of train dataset after PCA:', str(x_tr_pca.shape))

    return x_tr_pca, x_test_pca, y_tr, y_test

In [None]:
Y = telecom_pca_df["churn"]
X = telecom_pca_df.drop(["churn","mobile_number"],axis=1)

In [None]:
print('Customer Churn Percentage:', round((Y.sum()/len(Y))*100,2), '%')

In [None]:
# Churn Distribution
pie_chart = telecom_pca_df['churn'].value_counts()*100.0 /len(telecom_pca_df)
ax = pie_chart.plot.pie(autopct='%.1f%%', labels = ['No', 'Yes'],figsize =(8,6), fontsize = 14 )                                                                           
ax.set_ylabel('Churn',fontsize = 12)
ax.set_title('Churn Distribution', fontsize = 12)
plt.show()

In [None]:
# Scaling the data
X_scaled = scale_data(X)
print(X_scaled.shape)
X_scaled.head(5)

In [None]:
pca = get_scree_plot(X_scaled) # scree plot

col = list(X_scaled.columns)
df_pca = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'PC3':pca.components_[2],'Feature':col})
df_pca.head(10)

In [None]:
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

`Inference:` We can see that 60 components are enough to explain 95% of the variance.

In [None]:
# Fixed random_state (same value as the other splits in this notebook) so the
# split, and therefore the reported scores, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, train_size=0.7, random_state=100)
X_train.shape

In [None]:
# Defining a custom function to print model metrics
def print_model_metrics(y_test, y_pred, model_name):
    """Plot the confusion matrix and tabulate the scores of one model.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    model_name : str
        Label stored in the 'Model' column.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'Model', 'Accuracy', 'Precision', 'Recall',
        'AUC' and 'F1' (scores rounded to two decimals). The same row is
        also recorded through ``add_to_global_summary``.
    """
    print('Model Stats Score Summary: ')
    cp = confusion_matrix(y_test, y_pred)
    plt.figure()
    confusion_matrix_plot(cp)
    plt.show()

    accuracy = round(accuracy_score(y_test, y_pred), 2)
    recall = round(recall_score(y_test, y_pred), 2)
    precision = round(precision_score(y_test, y_pred), 2)
    # Named roc_auc so the local does not shadow sklearn's imported `auc`.
    # NOTE(review): this is computed from predicted labels, not probabilities.
    roc_auc = round(roc_auc_score(y_test, y_pred), 2)
    f1 = round(f1_score(y_test, y_pred), 2)

    # Values are listed in the same order as the column names; precision and
    # recall were previously stored under each other's heading.
    columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'AUC', 'F1']
    data = [[model_name, accuracy, precision, recall, roc_auc, f1]]
    df = pd.DataFrame(data, columns=columns)
    add_to_global_summary(df)
    return df

def confusion_matrix_plot(cm):
    """Render the confusion matrix ``cm`` as an annotated blue heat map.

    Draws onto the current matplotlib figure; the caller is responsible for
    creating and showing it. Axis ticks are labelled with the classes 0 and 1.
    """
    class_labels = [0, 1]
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()

    tick_positions = np.arange(len(class_labels))
    plt.xticks(tick_positions, class_labels, rotation=0)
    plt.yticks(tick_positions, class_labels)

    # Cells darker than half the maximum get white text for readability.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            plt.text(col, row, count, horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Accumulates one row of scores per evaluated model.
model_summary = pd.DataFrame()


def add_to_global_summary(df):
    """Append the rows of ``df`` to the module-level ``model_summary`` frame.

    The global is rebound to a new frame with a fresh 0..n-1 index; ``df``
    itself is left untouched.
    """
    global model_summary
    if model_summary.empty:
        # Nothing recorded yet: start from a re-indexed copy of df.
        model_summary = df.reset_index(drop=True)
    else:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        model_summary = pd.concat([model_summary, df], ignore_index=True)

## 6.2. Logistic Regression Model

In [None]:
# Training the model
lr = LogisticRegression()
model = lr.fit(X_train,y_train)
#Making prediction on the test data
pred_probs_test = model.predict_proba(X_test)[:,1]

In [None]:
# Logistic Regression ROC AUC (computed from the predicted churn probabilities;
# this value is not an accuracy, so it is labelled as what it is)
print('ROC AUC:', metrics.roc_auc_score(y_test, pred_probs_test))

## 6.3. RandomForest Model

In [None]:
# Creating a backup copy
telecom_tree_df = telecom_df_final.copy()

In [None]:
telecom_tree_df.head()

In [None]:
# Splitting the data into X and y
y = telecom_tree_df["churn"]
X = telecom_tree_df.drop(["churn","mobile_number"],axis=1)

In [None]:
# Splitting into Train and Test
X_train, X_test, y_train, y_test = train_test_split(scale_data(X), y, train_size=0.70, random_state=100)

In [None]:
# Fitting the model on the training data
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

In [None]:
# Predictions
y_pred_test = rfc.predict(X_test)

In [None]:
print_model_metrics(y_test,y_pred_test,"RandomForest (Default)")

#### Hyperparameter Tuning

In [None]:
# GridSearch
param_grid = {
    'max_depth': [12,18],
    'n_estimators': [200],
    'max_features': [40],
    'min_samples_leaf': [10, 20],
    'min_samples_split': [10,20,30]
}

# Initializing the model
rf = RandomForestClassifier()
# Instantiate the grid search model
rf_grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1,verbose = 1,return_train_score=True)

`Note:` The parameter values above were arrived at after some earlier experimentation. That search is not shown here because it takes a long time to run.

In [None]:
# Fitting the grid search to the data
rf_grid_search.fit(X_train, y_train)

In [None]:
# Accuracy Score
print('Accuracy Score:',round(rf_grid_search.best_score_, 2))

#### Building and Evaluating Final model for Random Forest

In [None]:
rfc = RandomForestClassifier(max_depth=18,
                             max_features=40,
                             min_samples_leaf=10,
                             min_samples_split=20,
                             n_estimators=200,
                             n_jobs = -1)
rfc.fit(X_train,y_train)

In [None]:
# Predictions
y_pred_test = rfc.predict(X_test)

In [None]:
# Confusion Matrix
print_model_metrics(y_test,y_pred_test,'RandomForest (Hyper)')

## 6.4. XGBOOST

In [None]:
# Creating a backup copy to run on this model
telecom_xgboost_df = telecom_df_final.copy()

In [None]:
# Splitting into X and y
y = telecom_xgboost_df["churn"]
X = telecom_xgboost_df.drop(["churn","mobile_number"],axis=1)

In [None]:
# Splitting into Train and Test
X_train, X_test, y_train, y_test = train_test_split(scale_data(X), y, train_size=0.7, random_state=100)

In [None]:
# Fitting the Training data on XGBOOST model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred_test = xgb_model.predict(X_test)

In [None]:
# Model Metrics and Accuracy Score
print_model_metrics(y_test, y_pred_test ,'XGBoost (Default)')

#### Hyperparameter Tuning

In [None]:
# KFold 
folds = 5

# Hyperparameters
param_grid = {'learning_rate': [0.1,0.2,0.3], 
             'subsample': [0.3,0.4,0.5]}          


# Model Initialization
xgb_model = XGBClassifier(max_depth=2, n_estimators=200)

# GridSearchCV()
model_cv = GridSearchCV(estimator = xgb_model, 
                        param_grid = param_grid, 
                        scoring= 'accuracy', # accuracy
                        cv = folds, 
                        n_jobs = -1,
                        verbose = 1,
                        return_train_score=True)  

In [None]:
# Fitting the model
model_cv.fit(X_train, y_train)    

In [None]:
# Cross-Validation Results
cv_results_xgboost = pd.DataFrame(model_cv.cv_results_)
cv_results_xgboost['param_learning_rate'] = cv_results_xgboost['param_learning_rate'].astype('float')

In [None]:
# Accuracy Score
print('Accuracy Score:', round(model_cv.best_score_, 2))

In [None]:
# Customized function for plotting XGBOOST
def plot_for_xboost(param_grid, cv_results, score_label='Accuracy'):
    """Plot mean train/test CV score against learning rate, one panel per subsample.

    Parameters
    ----------
    param_grid : dict
        Must contain the list of tried values under ``'subsample'``.
    cv_results : pandas.DataFrame
        Grid-search results with the columns ``param_subsample``,
        ``param_learning_rate``, ``mean_test_score`` and ``mean_train_score``.
    score_label : str, optional
        Y-axis label naming the plotted metric. Defaults to 'Accuracy'
        because the grid search in this notebook is run with
        ``scoring='accuracy'``; pass another label if a different scorer
        was used.
    """
    subsamples = param_grid['subsample']
    plt.figure(figsize=(18, 5))
    for n, subsample in enumerate(subsamples):
        # subplot 1/n
        plt.subplot(1, len(subsamples), n + 1)
        subset = cv_results[cv_results['param_subsample'] == subsample]

        plt.plot(subset["param_learning_rate"], subset["mean_test_score"])
        plt.plot(subset["param_learning_rate"], subset["mean_train_score"])
        plt.xlabel('learning_rate')
        plt.ylabel(score_label)
        plt.title("subsample={0}".format(subsample))
        plt.ylim([0.60, 1])
        plt.legend(['test score', 'train score'], loc='lower right')
        plt.xscale('log')

In [None]:
param_grid1 = {'learning_rate': [0.1,0.2,0.3], 'subsample': [0.3,0.4,0.5]}  
plot_for_xboost(param_grid1,cv_results_xgboost)

`Inference:`
- Results show that subsample size of 0.5 and learning_rate of 0.3 seems optimal.

In [None]:
# Chosen Hyperparameters
# 'objective': 'binary:logistic' is the binary classification objective
# (logistic loss, probability output).
params = {'learning_rate': 0.3,
          'max_depth': 2,
          'n_estimators': 200,
          'subsample': 0.5,
          'objective': 'binary:logistic'}

# fit model on training data
# The values are unpacked into keyword arguments. XGBClassifier has no `params`
# argument, so passing the dict as params=... (with list values) left
# learning_rate, subsample and objective at their defaults.
model = XGBClassifier(**params)
model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred_test = model.predict(X_test)

In [None]:
print_model_metrics(y_test, y_pred_test,'XGBoost (Hyper Tuned)')

In [None]:
# Plotting Feature Importances
plt.bar(range(len(model.feature_importances_)), model.feature_importances_)
plt.show()

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(x = 'tenure_range', hue = 'churn',data = tenure_data,palette=("gist_rainbow"))
plt.show()

print('Important Indicators:')
col = list(X_scaled.columns)
df_pca = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'PC3':pca.components_[2],'Feature':col})
df_pca.head(15)

# 7. Recommendations

`Inference:`
- Few 'High Value Customers' are churning, but no new high-value customer has been onboarded in the last 6 months, which the company should look into.
- Customers with less than 4 years of tenure are more likely to churn.
- Average Revenue per user is the most important feature.
- Incoming and Outgoing Calls in 8th month should be focused on because if their usage is decreased, they are more likely to churn.
- 2G/3G services should be improved.

In [None]:
model_summary

`Inference:`
- RandomForest produces a good accuracy of 0.95.
- XGBoost after tuning also produces an accuracy of 0.95.
- We should prefer XGBOOST with parameters tuned for future predictions of the model.