# Customer Churn Prediction using Telecom dataset

The Orange Telecom's Churn Dataset consists of cleaned customer activity data (features), along with a churn label specifying whether a customer canceled the subscription.

The data can be downloaded from the following link: https://www.kaggle.com/datasets/mnassrib/telecom-churn-datasets

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve, plot_precision_recall_curve

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Single seed reused as random_state by the split, resamplers, and models in this notebook
seed = 42

In [None]:
# CSV locations; the file names suggest an 80% / 20% split of the dataset
train_data_path, test_data_path = (
    'Orange_telecom/churn-bigml-80.csv',
    'Orange_telecom/churn-bigml-20.csv',
)

# Read both files into DataFrames: `data` for modelling, `test_data` held aside
data, test_data = (pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path))

In [None]:
# Preview the first five rows of the training data
data.head(n=5)

## Data Wrangling

### Drop columns that are not likely to be used in the prediction

In [None]:
# Count the distinct values found in every column
data.nunique(axis=0)

- Since the **State** column has 51 unique values, encoding it would only increase the dimensionality of the dataset. Hence it can be dropped.
- **Area code** also seems to be irrelevant to the target variable, assuming that no geographic factor influences it.

In [None]:
# Remove the two location columns; they are not used as predictors
data = data.drop(columns=['State', 'Area code'])

### Check the data types of each column and appropriately encode categorical columns

In [None]:
# Print a summary of the DataFrame's columns and their data types
data.info()

In [None]:
# Show the distinct values held by each object-dtype (categorical) column
cat_columns = data.select_dtypes(include='object').columns
for col, col_values in data[cat_columns].items():
    print(f"\n Column: {col} \n")
    print(col_values.unique())

- Encode yes and no values as 1 and 0 respectively in **International plan** and **Voice mail plan** columns.
- Convert the **Churn** column from boolean to integer data type

In [None]:
# Map the Yes/No plan flags to 1/0 using one shared lookup table
yes_no_encoding = {'Yes': 1, 'No': 0}
for plan_col in ('International plan', 'Voice mail plan'):
    data[plan_col] = data[plan_col].map(yes_no_encoding)

In [None]:
# Cast the Churn target to integers
data = data.astype({'Churn': int})

## Exploratory Data Analysis

### Distribution of Churn variable

In [None]:
# Bar chart of how many customers fall in each Churn class
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=data, x='Churn', ax=ax)
plt.show()

The target variable seems to be highly imbalanced with data of mostly non-churned customers.

### Split the train data into train and validation sets

Since the data is imbalanced, a stratified train/validation split is performed.

In [None]:
# Separate the target from the predictors
y = data['Churn']
X = data.drop(columns='Churn')

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class ratio
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)

### Distribution of predictor variables

In [None]:
# The two binary plan flags are treated as categorical; every other predictor as numerical
cat_cols = ['International plan', 'Voice mail plan']
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

In [None]:
# One count plot per categorical predictor
for column in cat_cols:
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.countplot(data=X, x=column, ax=ax)
    plt.show()

In [None]:
# Histogram with a KDE overlay for every numerical predictor.
# sns.distplot is deprecated (since seaborn 0.11); histplot with kde=True and
# stat="density" draws the equivalent density-scaled figure.
for column in num_cols:
    plt.figure(figsize=(10,8))
    sns.histplot(X[column], kde=True, stat="density")
    plt.show()

- It seems that most of the numerical variables are normally distributed

## Train Classification Models

### Random Forest Classifier

In [None]:
# Candidate forest sizes, kept in ascending order so that a curve drawn from
# the per-size scores has a monotonic "number of trees" axis.
num_trees = [50, 100, 125, 150, 200, 250, 300]

# Out-of-bag score of a random forest fitted on the training split for each size
rfc_oob_score = []
for val in num_trees:
    rfc_model = RandomForestClassifier(n_estimators=val, oob_score=True, random_state=seed)
    rfc_model.fit(X_train, y_train)
    rfc_oob_score.append(rfc_model.oob_score_)

In [None]:
# OOB score per forest size; points are plotted by list position and the
# ticks are relabelled with the corresponding tree counts
plt.plot(rfc_oob_score)
plt.xlabel('No. of trees')
plt.ylabel('OOB Score')
plt.title('Random Forest Classifier')
plt.xticks(range(len(num_trees)), num_trees)
plt.show()

### Extra Tree Classifier

In [None]:
# Out-of-bag score of an extra-trees ensemble for each candidate size.
# bootstrap=True is passed explicitly together with oob_score=True.
etc_oob_score = []
for n_trees in num_trees:
    etc_model = ExtraTreesClassifier(
        n_estimators=n_trees,
        bootstrap=True,
        oob_score=True,
        random_state=seed,
    )
    etc_oob_score.append(etc_model.fit(X_train, y_train).oob_score_)

In [None]:
# OOB score per ensemble size for the extra-trees model; ticks are relabelled
# with the tree counts
tick_positions = range(len(num_trees))
plt.plot(etc_oob_score)
plt.title('Extra Tree Classifier')
plt.ylabel('OOB Score')
plt.xlabel('No. of trees')
plt.xticks(tick_positions, num_trees)
plt.show()

In [None]:
# Overlay both OOB curves on one set of axes for a side-by-side comparison
for oob_scores, model_label in ((rfc_oob_score, 'RFC'), (etc_oob_score, 'ETC')):
    plt.plot(oob_scores, label=model_label)
plt.xticks(range(len(num_trees)), num_trees)
plt.xlabel('No. of trees')
plt.ylabel('OOB Score')
plt.legend()
plt.show()

In [None]:
# Random forest with 200 trees, fitted on the training split
rfc_model = RandomForestClassifier(random_state=seed, n_estimators=200)
rfc_model.fit(X=X_train, y=y_train)

In [None]:
# Class predictions of the fitted forest for the validation split
y_val_pred = rfc_model.predict(X=X_val)

In [None]:
# Per-class metrics report for the validation predictions
print('\n Classification Report: \n')
print(classification_report(y_val, y_val_pred))

In [None]:
# Annotated heatmap of the validation confusion matrix (rows: true, columns: predicted)
class_names = ['Not Churned', 'Churned']
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

plt.figure(figsize=(8, 8))
s = sns.heatmap(
    val_conf_matrix,
    annot=True,
    fmt="d",
    xticklabels=class_names,
    yticklabels=class_names,
)
s.set(xlabel='Predicted Labels', ylabel='True Labels')
plt.show()

Since the target variable is highly imbalanced, it's better to evaluate the model based on the F1-score rather than the accuracy.

In [None]:
# ROC curve of the random forest on the validation split.
# The axes of the figure created here are passed explicitly: without ax=...,
# plot_roc_curve opens its own default-sized figure and leaves this one empty.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2; on newer
# versions use RocCurveDisplay.from_estimator instead.
plt.figure(figsize=(10,8))
plot_roc_curve(rfc_model, X_val, y_val, ax=plt.gca())
plt.show()

In [None]:
# Precision-recall curve of the random forest on the validation split.
# The axes of the figure created here are passed explicitly: without ax=...,
# plot_precision_recall_curve opens its own default-sized figure and leaves
# this one empty.
# NOTE(review): plot_precision_recall_curve was removed in scikit-learn 1.2; on
# newer versions use PrecisionRecallDisplay.from_estimator instead.
plt.figure(figsize=(10,8))
plot_precision_recall_curve(rfc_model, X_val, y_val, ax=plt.gca())
plt.show()

In [None]:
# Pair each training column with the forest's importance score, most important first
feat_imp = (
    pd.DataFrame({'Importance': rfc_model.feature_importances_, 'Feature': X_train.columns})
    .sort_values(by='Importance', ascending=False)
)
feat_imp.head()

In [None]:
# Horizontal bar chart of the feature importances, in the table's sorted order
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=feat_imp, x='Importance', y='Feature', orient='h', ax=ax)
plt.show()

In [None]:
# Preparing pipeline to deal with Class Imbalance:
# SMOTE oversampling, then random undersampling, then a random forest
# classifier. Steps are given as (name, estimator) tuples.
pipeline = Pipeline(steps=[('smote', SMOTE(random_state=seed)),
                           ('underSample', RandomUnderSampler(random_state=seed)),
                           ('classifier', RandomForestClassifier(random_state=seed))]
                   )

# Search space over the resampling ratios and the forest hyper-parameters.
# 'sqrt' replaces 'auto' for max_features: for classifiers 'auto' meant
# sqrt(n_features); it was deprecated in scikit-learn 1.1 and removed in 1.3.
param_dict = {'smote__sampling_strategy':[0.2,0.3],
              'underSample__sampling_strategy':[0.6, 0.7],
              'classifier__n_estimators':[50, 75, 100, 150, 200],
              'classifier__max_depth':[None, 5, 7, 15, 20],
              'classifier__max_features':['sqrt', 0.5, 0.75]}

# Candidates are ranked by recall
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_dict, scoring='recall')

In [None]:
# Run the grid search over param_dict on the training split
grid_search.fit(X_train, y_train)

In [None]:
# Display the pipeline refitted with the best parameter combination
grid_search.best_estimator_

In [None]:
# Display the parameter combination with the best cross-validated recall
grid_search.best_params_

In [None]:
# Display the mean cross-validated recall of the best parameter combination
grid_search.best_score_

In [None]:
# Recall (the configured scoring metric) of the best pipeline on the validation split
grid_search.score(X_val, y_val)