In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('churn.csv')

In [3]:
df

Unnamed: 0,churn,accountlength,internationalplan,voicemailplan,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,No,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1
1,No,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1
2,No,137,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0
3,No,84,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,No,75,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,No,50,no,yes,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2
4996,Yes,152,no,no,0,184.2,90,31.31,256.8,73,21.83,213.6,113,9.61,14.7,2,3.97,3
4997,No,61,no,no,0,140.6,89,23.90,172.8,128,14.69,212.4,97,9.56,13.6,4,3.67,1
4998,No,109,no,no,0,188.8,67,32.10,171.7,92,14.59,224.4,89,10.10,8.5,6,2.30,0


Only 'Internationplan' and 'Voicemailplan' are categorical with 2 distinct values

All columns are usefull, noone can be rejected

In [4]:
df.columns

Index(['churn', 'accountlength', 'internationalplan', 'voicemailplan',
       'numbervmailmessages', 'totaldayminutes', 'totaldaycalls',
       'totaldaycharge', 'totaleveminutes', 'totalevecalls', 'totalevecharge',
       'totalnightminutes', 'totalnightcalls', 'totalnightcharge',
       'totalintlminutes', 'totalintlcalls', 'totalintlcharge',
       'numbercustomerservicecalls'],
      dtype='object')

In [5]:
Y=df.churn
X=df.loc[:,'accountlength':'numbercustomerservicecalls']

No null values in the dataset

In [6]:
df.isnull().sum()

Standardising the data

In [7]:
categorical_columns = ['internationalplan', 'voicemailplan']
numerical_columns = [col for col in df.columns if col not in categorical_columns and col != 'churn']

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [9]:
df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})
df['internationalplan'] = df['internationalplan'].map({'yes': 1, 'no': 0})
df['voicemailplan'] = df['voicemailplan'].map({'yes': 1, 'no': 0})

In [10]:
df

Unnamed: 0,churn,accountlength,internationalplan,voicemailplan,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,0,0.698941,0,1,1.273145,1.573802,0.502824,1.574074,-0.064032,-0.060077,-0.063849,0.876999,-0.446928,0.876286,-0.094809,-0.584236,-0.095509,-0.436676
1,0,0.169849,0,1,1.346973,-0.346802,1.158422,-0.347082,-0.101621,0.141693,-0.101089,1.068992,0.154374,1.069818,1.245227,-0.584236,1.245982,-0.436676
2,0,0.925695,0,0,-0.572549,1.171125,0.704546,1.171286,-1.571562,0.494791,-1.572084,-0.748012,0.204483,-0.746737,0.701969,0.229917,0.695971,-1.202236
3,0,-0.409634,1,0,-0.572549,2.210292,-1.463971,2.210457,-2.744745,-0.614946,-2.745155,-0.069110,-0.547145,-0.069377,-1.326194,1.044069,-1.329681,0.328885
4,0,-0.636388,1,0,-0.572549,-0.252163,0.654116,-0.252115,-1.035419,1.100103,-1.034426,-0.267041,1.056327,-0.267307,-0.058592,-0.584236,-0.055264,1.094445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,-1.266260,0,1,2.380562,1.028239,1.360145,1.028291,0.442435,1.301873,0.443551,1.922073,0.805785,1.923114,-0.131027,0.229917,-0.135754,0.328885
4996,1,1.303619,0,0,-0.572549,0.072577,-0.505789,0.072080,1.111130,-1.371585,1.111550,0.261434,0.655459,0.260505,1.607399,-0.991312,1.608185,1.094445
4997,0,-0.989116,0,0,-0.572549,-0.736489,-0.556220,-0.736771,-0.550715,1.402758,-0.550301,0.237683,-0.146277,0.238513,1.209010,-0.177160,1.205738,-0.436676
4998,0,0.220239,0,0,-0.572549,0.157937,-1.665694,0.158313,-0.572477,-0.413175,-0.573577,0.475200,-0.547145,0.476029,-0.638067,0.636993,-0.632105,-1.202236


Splitting the data into train and test set

In [11]:
from sklearn.model_selection import train_test_split

X = df.drop('churn', axis=1)
y = df['churn']

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (4000, 17)
X_test shape: (1000, 17)
y_train shape: (4000,)
y_test shape: (1000,)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming you have already split the data into X_train, X_test, y_train, and y_test

# Initialize the logistic regression model
logreg = LogisticRegression()

# Train the model on the training data
logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Output the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.872
Precision: 0.5964912280701754
Recall: 0.2446043165467626
F1 Score: 0.346938775510204
ROC-AUC Score: 0.6089455961363314


Random Forest Algorithm

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming you have already split the data into X_train, X_test, y_train, and y_test

# Initialize the Random Forest model
random_forest = RandomForestClassifier(random_state=42)

# Train the model on the training data
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Output the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.96
Precision: 0.9304347826086956
Recall: 0.7697841726618705
F1 Score: 0.8425196850393699
ROC-AUC Score: 0.8802463255876136


Using KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming you have already split the data into X_train, X_test, y_train, and y_test

# Initialize the KNN model with k=5 (you can choose a different value for k)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training data
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Output the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.898
Precision: 0.8627450980392157
Recall: 0.31654676258992803
F1 Score: 0.4631578947368421
ROC-AUC Score: 0.6542083406445575


Using SVM

In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming you have already split the data into X_train, X_test, y_train, and y_test

# Initialize the SVM model
svm = SVC(kernel='rbf', random_state=42)

# Train the model on the training data
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Output the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.92
Precision: 0.9154929577464789
Recall: 0.4676258992805755
F1 Score: 0.6190476190476191
ROC-AUC Score: 0.7303286290827964


Bagging Decision trees

In [29]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming you have already split the data into X_train, X_test, y_train, and y_test

# Initialize the Decision Tree classifier as the base estimator
base_estimator = DecisionTreeClassifier(random_state=42)

# Initialize the BaggingClassifier with Decision Trees
bagging_classifier = BaggingClassifier(estimator=base_estimator, n_estimators=300, random_state=42)

# Train the model on the training data
bagging_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Output the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.955
Precision: 0.9196428571428571
Recall: 0.7410071942446043
F1 Score: 0.8207171314741035
ROC-AUC Score: 0.8652771162860651


Boosted Decision Trees

In [27]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming you have already split the data into X_train, X_test, y_train, and y_test

# Initialize the Decision Tree classifier as the base estimator
base_estimator = DecisionTreeClassifier(random_state=42)

# Initialize the AdaBoostClassifier with Decision Trees
adaboost_classifier = AdaBoostClassifier(estimator=base_estimator, n_estimators=300, random_state=42)

# Train the model on the training data
adaboost_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = adaboost_classifier.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Output the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.93
Precision: 0.7593984962406015
Recall: 0.7266187050359713
F1 Score: 0.7426470588235294
ROC-AUC Score: 0.8447263095446987


The Best accuracy we got is : 96 %