![](https://miro.medium.com/max/1200/1*WqId29D5dN_8DhiYQcHa2w.png)

**We will be looking at a churn-modeling dataset. I will walk through undersampling and oversampling techniques. Please upvote if you like my notebook. :)**

# Import Libraries

In [None]:
#basic fundamental libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Stratified sampling, SMOTE for oversampling technique, make_pipeline for pipeline creation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline

#color text + statistics libraries
from scipy.stats import chi2_contingency
import colorama
from colorama import Fore
import scipy.stats as stats
import statsmodels.api as sma

from sklearn.tree import DecisionTreeClassifier

#best parameter finding libraries
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV 

#metrics
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score, accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

# Read Dataset

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory into `data`.
data = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# Preview the first rows of the raw data.
data.head()

# Data Preparation

In [None]:
# Drop the rows whose TotalCharges is a single blank string; those blanks are
# what make the column load as text. `.copy()` detaches the filtered result so
# the column assignments below write to a real DataFrame instead of a slice of
# the original (which triggers SettingWithCopyWarning and may not stick).
data = data[data['TotalCharges'] != ' '].copy()

# Collapse the "no service" levels into a plain "No" so each add-on column is
# a two-level Yes/No feature.
data['MultipleLines'] = data['MultipleLines'].replace("No phone service", "No")
for col in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
            'TechSupport', 'StreamingTV', 'StreamingMovies']:
    data[col] = data[col].replace("No internet service", "No")

# SeniorCitizen is stored as 0/1; turn it into a Yes/No category like the
# other categorical columns.
data['SeniorCitizen'] = data['SeniorCitizen'].replace({1: "Yes", 0: "No"})

# With the blank rows gone the column can be parsed as a number.
data['TotalCharges'] = data['TotalCharges'].astype(float)

# customerID is dropped before modelling.
data.drop('customerID', axis=1, inplace=True)

In [None]:
# Column dtypes and non-null counts after the clean-up above.
data.info()

# Categorical Features: Significance Check

In [None]:
# Names of all columns whose dtype is the generic "object" (the string-valued
# categorical features at this point in the notebook).
cat_cols = [name for name, dtype in data.dtypes.items() if dtype == "object"]
cat_cols

# Chi-Square Significance Test For Categorical Variables

In [None]:
def chisq(df, name, target=None):
    """Chi-square test of independence between one feature and the target.

    Builds a contingency table of ``df`` against ``target`` and prints, in
    green or red, whether the null hypothesis of independence is rejected at
    the 5% level, together with the p-value.

    Args:
        df: The feature column to test (despite the name, a single column is
            what the notebook passes in).
        name: Label used for the feature in the printed message.
        target: Column to test against. Defaults to the notebook-level
            ``data.Churn``, which is what the original implementation used.
    """
    if target is None:
        target = data.Churn
    # chi2_contingency returns (statistic, p-value, degrees of freedom,
    # expected frequencies). Bind the third value to its own name so it does
    # not overwrite the `df` parameter.
    _stat, p, _dof, _expected = chi2_contingency(pd.crosstab(df, target))
    if p < 0.05:
        print(Fore.GREEN +'We can reject the Null Hypothesis for {} category {}'.format(name, p))
    else:
        print(Fore.RED +'We fail to reject the Null Hypothesis for {} category {}'.format(name, p))

In [None]:
# Run the chi-square check on every categorical column in turn.
for column_name in cat_cols:
    chisq(data[column_name], column_name)

**Analysis**
- Removing those insignificant columns

In [None]:
# Remove the two columns the notebook treats as insignificant after the
# chi-square check; the frame is modified in place.
data.drop(['gender', 'PhoneService'], axis = 1, inplace = True)

In [None]:
# Encode the target as 1 (churned) / 0 (stayed). This is done on `data`
# itself on purpose: the rest of the notebook filters on data['Churn'] == 1/0.
# `replace` keeps the cell safe to re-run, and the explicit astype(int) pins
# the dtype rather than relying on pandas downcasting the replaced column.
data["Churn"] = data["Churn"].replace({"Yes": 1, "No": 0}).astype(int)

# `temp` is just another name for `data` (no copy is made); it is kept so
# anything that refers to `temp` still works.
temp = data

fig = px.parallel_categories(temp, dimensions=['SeniorCitizen', 'Partner', 'Dependents',
                                               'Contract', 'PaperlessBilling', 'PaymentMethod'],
                color="Churn", color_continuous_scale=px.colors.sequential.Inferno,
                labels={'SeniorCitizen':'Senior Citizen', 'Partner':'Partner',
                        'Dependents':'Dependents', 'Contract':'Contract',
                        'PaperlessBilling':'Paperless Billing', 'PaymentMethod':'PaymentMethod'})
fig.show()

# Numerical Features

In [None]:
# Names of the columns stored as 64-bit integers or 64-bit floats.
num_cols = [name for name, dtype in data.dtypes.items()
            if dtype == "int64" or dtype == "float64"]
num_cols

In [None]:
# Histograms of the frame's numeric columns, laid out on a 2x4 grid.
data.hist(figsize = (20,10), layout = (2,4))

# Skewness

In [None]:
# Skewness per numeric column, most positively skewed first. The frame still
# holds string columns here, so restrict to numeric ones explicitly; recent
# pandas no longer drops non-numeric columns silently in reductions.
data.select_dtypes(include="number").skew().sort_values(ascending = False)

**Analysis**
- Total Charges column is positively skewed.

In [None]:
# Square-root transform TotalCharges to reduce its positive skew, then
# re-check the skewness of the numeric columns (string columns are excluded
# explicitly so the reduction also works on recent pandas).
data['TotalCharges'] = np.sqrt(data['TotalCharges'])
data.select_dtypes(include="number").skew().sort_values(ascending = False)

In [None]:
# Same histograms again, now with the square-root-transformed TotalCharges.
data.hist(figsize = (20,10), layout = (2,4))

In [None]:
# Pairwise correlation of the numeric columns. The string-valued columns are
# excluded explicitly: DataFrame.corr() raises on them in recent pandas
# instead of skipping them.
corr = data.select_dtypes(include="number").corr()
plt.figure(figsize = (10,10))
ax = sns.heatmap(corr, vmin = -1, center = 0, annot = True, cmap = 'mako')

# Outlier Analysis

In [None]:
# tenure: flag values outside the 1.5 * IQR band and remove those rows.
tenure_data = data['tenure'].values

# First and third quartile, then the band derived from them.
q25, q75 = np.percentile(tenure_data, [25, 75])
tenure_iqr = q75 - q25
tenure_cut_off = 1.5 * tenure_iqr
tenure_lower = q25 - tenure_cut_off
tenure_upper = q75 + tenure_cut_off

# One boolean mask serves both the report and the row removal.
outside_band = (tenure_data < tenure_lower) | (tenure_data > tenure_upper)
tenure_outliers = list(tenure_data[outside_band])

print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
print('iqr: {}'.format(tenure_iqr))
print('Cut Off: {}'.format(tenure_cut_off))
print('tenure Lower Band: {}'.format(tenure_lower))
print('tenure Upper band: {}'.format(tenure_upper))
print('Feature tenure Outliers Total: {}'.format(len(tenure_outliers)))
print('tenure outliers:{}'.format(tenure_outliers))

data = data.drop(index=data.index[outside_band])

**Analysis**
- No outliers in Tenure Column

In [None]:
# MonthlyCharges: flag values outside the 1.5 * IQR band and remove those rows.
MonthlyCharges_data = data['MonthlyCharges'].values

# First and third quartile, then the band derived from them.
q25, q75 = np.percentile(MonthlyCharges_data, [25, 75])
MonthlyCharges_iqr = q75 - q25
MonthlyCharges_cut_off = 1.5 * MonthlyCharges_iqr
MonthlyCharges_lower = q25 - MonthlyCharges_cut_off
MonthlyCharges_upper = q75 + MonthlyCharges_cut_off

# One boolean mask serves both the report and the row removal.
outside_band = (MonthlyCharges_data < MonthlyCharges_lower) | (MonthlyCharges_data > MonthlyCharges_upper)
MonthlyCharges_outliers = list(MonthlyCharges_data[outside_band])

print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
print('iqr: {}'.format(MonthlyCharges_iqr))
print('Cut Off: {}'.format(MonthlyCharges_cut_off))
print('MonthlyCharges Lower Band: {}'.format(MonthlyCharges_lower))
print('MonthlyCharges Upper band: {}'.format(MonthlyCharges_upper))
print('Feature MonthlyCharges Outliers Total: {}'.format(len(MonthlyCharges_outliers)))
print('MonthlyCharges outliers:{}'.format(MonthlyCharges_outliers))

data = data.drop(index=data.index[outside_band])

**Analysis**
- No outliers in MonthlyCharges Column

In [None]:
# TotalCharges: flag values outside the 1.5 * IQR band and remove those rows.
TotalCharges_data = data['TotalCharges'].values

# First and third quartile, then the band derived from them.
q25, q75 = np.percentile(TotalCharges_data, [25, 75])
TotalCharges_iqr = q75 - q25
TotalCharges_cut_off = 1.5 * TotalCharges_iqr
TotalCharges_lower = q25 - TotalCharges_cut_off
TotalCharges_upper = q75 + TotalCharges_cut_off

# One boolean mask serves both the report and the row removal.
outside_band = (TotalCharges_data < TotalCharges_lower) | (TotalCharges_data > TotalCharges_upper)
TotalCharges_outliers = list(TotalCharges_data[outside_band])

print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
print('iqr: {}'.format(TotalCharges_iqr))
print('Cut Off: {}'.format(TotalCharges_cut_off))
print('TotalCharges Lower Band: {}'.format(TotalCharges_lower))
print('TotalCharges Upper band: {}'.format(TotalCharges_upper))
print('Feature TotalCharges Outliers Total: {}'.format(len(TotalCharges_outliers)))
print('TotalCharges outliers:{}'.format(TotalCharges_outliers))

data = data.drop(index=data.index[outside_band])

**Analysis**
- No outliers in TotalCharges Column

# Dummy Encoding and Standardization

In [None]:
# Re-collect the object-dtype columns (some were dropped since the first
# pass) and one-hot encode them, omitting the first level of each.
is_object = data.dtypes == "object"
cat_cols = data.columns[is_object].tolist()
data = pd.get_dummies(data, columns=cat_cols, prefix_sep='_', drop_first=True)

**Normalise numerical columns**

In [None]:
# Standardise the three numeric columns (zero mean, unit variance).
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
sc = StandardScaler()

# Build the scaled frame on data's own index. Rows were removed earlier, so
# `data` no longer has a gap-free 0..n-1 index; a frame with a fresh default
# index would be aligned by label in concat(axis=1), attaching scaled values
# to the wrong rows and creating NaN-filled rows.
values_std = pd.DataFrame(sc.fit_transform(data[numeric_features]),
                          columns=numeric_features, index=data.index)

# Swap the raw columns for the scaled ones (appended at the end, as before).
data = pd.concat([data.drop(columns=numeric_features), values_std], axis=1)

# Final Dataframe

In [None]:
# Preview the encoded and scaled frame.
data.head()

In [None]:
# Column names after dummy encoding.
data.columns

# Model Building

In [None]:
# Bar chart of the number of rows per Churn class. The column is passed as the
# keyword `x`: recent seaborn releases take `data` as the first positional
# argument, so a bare positional 'Churn' is no longer read as the x variable.
sns.countplot(x='Churn', data=data)
plt.title('Class Distributions \n (0: Ruke hue employees || 1: Chale gye employees)', fontsize=14)

**Analysis**
- I will set aside a hold-out test set from the original dataframe.
- Before applying random undersampling or oversampling, I want to check whether the original dataframe is imbalanced.

In [None]:
# Share of each class as a percentage of all rows currently in `data`.
class_counts = data['Churn'].value_counts()
total_rows = len(data)
print('No Churn: 0', round(class_counts[0] / total_rows * 100, 2), '% of the dataset')
print('Churn: 1', round(class_counts[1] / total_rows * 100, 2), '% of the dataset')

# Discard rows with any missing value, then separate target and features.
data.dropna(inplace=True)
y = data['Churn']
X = data.drop(columns='Churn')

sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Only the last of the five stratified splits is kept: it serves as the
# "original" train / hold-out partition for the rest of the notebook.
*_, (train_index, test_index) = sss.split(X, y)
original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

In [None]:
# Class counts in the training part of the stratified split.
original_ytrain.value_counts() #perfectly distributed 73:27 ratio

In [None]:
# Class counts in the hold-out part of the stratified split.
original_ytest.value_counts()

**Undersampling Technique (Variable name: new_df)**

In [None]:
# Random undersampling: keep every churn row and an equally sized random
# subset of the non-churn rows.
# NOTE(review): rows are drawn from the full `data`, which also contains the
# rows held out as original_Xtest, so the two sets can overlap.

# Shuffle first so the non-churn rows kept below are a random subset; the
# seed makes the selection reproducible across runs.
data = data.sample(frac=1, random_state=42)

churn_df = data.loc[data['Churn'] == 1]
# Match the minority-class size instead of hard-coding a row count, so the
# 1:1 balance holds whatever number of churn rows survived preprocessing.
non_churn_df = data.loc[data['Churn'] == 0][:len(churn_df)]

normal_distributed_df = pd.concat([churn_df, non_churn_df])

# Shuffle the combined rows so the two classes are interleaved.
new_df = normal_distributed_df.sample(frac=1, random_state=42)
new_df['Churn'] = new_df['Churn'].astype(int)
new_df.head()

In [None]:
# Correlation heatmap of the balanced (undersampled) frame, without cell
# annotations.
plt.figure(figsize=(10, 10))
corr = new_df.corr()
ax = sns.heatmap(corr, cmap='mako', center=0, vmin=-1)

In [None]:
# Features / target of the balanced frame, split 80/20 with a fixed seed and
# handed on as plain NumPy arrays.
y = new_df['Churn']
X = new_df.drop(columns='Churn')
X_train, X_test, y_train, y_test = (
    part.to_numpy()
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

In [None]:
# Models to benchmark, keyed by display name (only a default decision tree).
classifiers = {"DecisionTreeClassifier": DecisionTreeClassifier()}

In [None]:
# Baseline: fit each classifier and report its mean 5-fold CV accuracy on
# the balanced training set.
for classifier in classifiers.values():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    # Scale to a percentage before rounding; rounding first and multiplying
    # afterwards reintroduces float noise (e.g. 56.99999999999999).
    mean_accuracy_pct = round(training_score.mean() * 100, 2)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", mean_accuracy_pct, "% accuracy score")

In [None]:
# Exhaustive grid search over decision-tree hyperparameters.
# 'auto' is left out of max_features: for classification trees it was an
# alias of 'sqrt' and newer scikit-learn releases reject it, so listing it
# only duplicated work (or made fits fail).
tree_params = {"criterion": ["gini", "entropy"], "splitter": ["best", "random"],
               "max_depth": list(range(2,8,1)), "min_samples_leaf": list(range(5,10,1)),
               "max_features": ['sqrt', 'log2']}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train, y_train)

# Tree refitted with the best parameter combination found by the search.
tree_clf = grid_tree.best_estimator_
print('Best Estimators: ', tree_clf)

In [None]:
# Mean 5-fold CV accuracy of the tuned tree, shown as a percentage with two
# decimals.
tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
mean_pct = round(tree_score.mean() * 100, 2)
print('DecisionTree Classifier Cross Validation Score', str(mean_pct) + '%')

In [None]:
# Decision tree with fixed hyperparameters, fitted on the balanced training
# set. (The values are written out here rather than taken from `tree_clf`.)
# max_features='sqrt' is what 'auto' meant for classification trees; 'auto'
# itself is rejected by newer scikit-learn releases.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=6, max_features='sqrt',
                       min_samples_leaf=5)
tree.fit(X_train, y_train)

In [None]:
# Evaluate on the test split of the balanced (undersampled) data.
y_pred_x = tree.predict(X_test)
print(Fore.RED + "Prone To Overfitting Results:\n")
print(Fore.RED + classification_report(y_test, y_pred_x))

In [None]:
# Evaluate the same tree on the hold-out split of the original data.
# NOTE(review): the undersampled training rows were drawn from the full
# `data`, so some of them may also be in original_Xtest - confirm before
# treating these numbers as an unbiased estimate.
y_pred = tree.predict(original_Xtest)
print(Fore.GREEN + "Accurate Results using Undersampling Technique:\n")
print(Fore.GREEN + classification_report(original_ytest, y_pred))

# Oversampling Technique during Cross Validation

In [None]:
# Row counts of the original train / hold-out partitions.
n_xtrain, n_ytrain = len(original_Xtrain), len(original_ytrain)
n_xtest, n_ytest = len(original_Xtest), len(original_ytest)
print('Length of X (train): {} | Length of y (train): {}'.format(n_xtrain, n_ytrain))
print('Length of X (test): {} | Length of y (test): {}'.format(n_xtest, n_ytest))

In [None]:
tree_over = DecisionTreeClassifier()
# Search space for the randomized search. 'auto' is left out of max_features:
# for classification trees it was an alias of 'sqrt' and newer scikit-learn
# releases reject it.
tree_params = {"criterion": ["gini", "entropy"], "splitter": ["best", "random"],
               "max_depth": list(range(2,8,1)), "min_samples_leaf": list(range(5,10,1)),
               "max_features": ['sqrt', 'log2']}
# Four random parameter combinations are tried.
random_grid_tree = RandomizedSearchCV(DecisionTreeClassifier(), tree_params, n_iter=4)

# Split the original training partition again; every iteration overwrites the
# previous one, so only the last fold's train / validation split is kept.
for train_index, test_index in sss.split(original_Xtrain, original_ytrain):
    original_Xtrain1, original_Xtest1 = original_Xtrain.iloc[train_index], original_Xtrain.iloc[test_index]
    original_ytrain1, original_ytest1 = original_ytrain.iloc[train_index], original_ytrain.iloc[test_index]

In [None]:
# Pipeline: SMOTE oversampling of the minority class, followed by the
# randomized hyperparameter search.
# NOTE(review): SMOTE is the step *before* the search, so the search's own
# cross-validation folds are cut from already-resampled data. Placing SMOTE
# inside the estimator being searched would resample per fold, but would also
# change what `best_estimator_` is - confirm before changing.
pipeline_creation = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), random_grid_tree)
model = pipeline_creation.fit(original_Xtrain1, original_ytrain1)
# Show the tree selected by the search.
random_grid_tree.best_estimator_

In [None]:
# Score the selected tree on the validation fold held out from the original
# training partition.
best_est = random_grid_tree.best_estimator_
prediction = best_est.predict(original_Xtest1)
print(Fore.GREEN + "Accurate Results using Oversaampling using SMOTE Technique:\n")
print(Fore.GREEN + classification_report(original_ytest1, prediction))

# Cost Complexity Pruning with Decision Tree

- As alpha increases, more of the tree is pruned, which increases the total impurity of the leaves of the decision tree.

In [None]:
# Cost-complexity pruning path computed with best_est's settings on the
# un-resampled training fold: the effective alphas and the total leaf
# impurity at each of them.
path = best_est.cost_complexity_pruning_path(original_Xtrain1, original_ytrain1)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
# Step plot of total leaf impurity against effective alpha; the final alpha
# is left out of the plot.
fig, ax = plt.subplots()
ax.set_title("Total Impurity vs effective alpha for training set")
ax.set_ylabel("total impurity of leaves")
ax.set_xlabel("effective alpha")
ax.plot(ccp_alphas[:-1], impurities[:-1], drawstyle="steps-post", marker='o')

**Now, we will train using effective alphas.**
**Note**
- The last value of ccp_alphas prunes the whole tree down to a single node, so we exclude it by slicing the list with ccp_alphas[:-1].

In [None]:
# Fit one tree per effective alpha on the training fold, keeping them in the
# same order as ccp_alphas. The other hyperparameters are fixed values
# written out here.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        ccp_alpha=ccp_alpha,
        max_depth=7,
        max_features='log2',
        min_samples_leaf=7,
    ).fit(original_Xtrain1, original_ytrain1)
    clfs.append(clf)

last_tree_nodes = clfs[-1].tree_.node_count
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(last_tree_nodes, ccp_alphas[-1]))

**Analysis**
- We will remove the last tree from the clfs list, because that tree consists of only one node.

# Graph: Accuracy vs alpha for training and testing sets

In [None]:
# Leave out the final alpha and its tree, as the notebook text says: the
# largest alpha prunes the tree down to (nearly) a single node, which only
# stretches the x-axis. Local slices are used so ccp_alphas / clfs stay
# intact and the cell can be re-run.
plot_alphas, plot_clfs = ccp_alphas[:-1], clfs[:-1]

# Accuracy of every remaining pruned tree on the training fold and on the
# validation fold.
train_scores = [clf.score(original_Xtrain1, original_ytrain1) for clf in plot_clfs]
test_scores = [clf.score(original_Xtest1, original_ytest1) for clf in plot_clfs]

fig, ax = plt.subplots(figsize = (15,10))

ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(plot_alphas, train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(plot_alphas, test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
plt.show()

# The End