In [130]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDOneClassSVM
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import silhouette_score,silhouette_samples
from yellowbrick.cluster import SilhouetteVisualizer
import pickle
import json
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library diagnostics (for example
# solver convergence or deprecation warnings) — consider narrowing it by category.
warnings.filterwarnings('ignore')

In [131]:
# Read the input CSV; index_col=False keeps pandas from using the first column as the index.
DATA_PATH = 'telco_data_clean.csv'
df = pd.read_csv(DATA_PATH, index_col=False)

# Preview the first rows.
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [132]:
# Kendall's tau between the 'gender' column and the 'churn' target.
gender_tau = df['gender'].corr(df['churn'], method='kendall')
print(f"tau-correlation: {gender_tau:.2f}")

tau-correlation: -0.01


In [133]:
# Kendall's tau of every column (including 'churn' itself) against 'churn'.
# NOTE(review): head() shows most columns holding text labels; confirm how those
# labels are ordered for the rank correlation before reading much into
# multi-category columns.
churn_target = df['churn']
for col_name in df.columns:
  tau = df[col_name].corr(churn_target, method='kendall')
  print(f"{col_name} tau-correlation: {tau:.2f}")

gender tau-correlation: -0.01
senior_citizen tau-correlation: 0.15
partner tau-correlation: -0.15
dependents tau-correlation: -0.16
tenure tau-correlation: -0.31
phone_service tau-correlation: 0.01
multiple_lines tau-correlation: 0.04
internet_service tau-correlation: -0.03
online_security tau-correlation: -0.29
online_backup tau-correlation: -0.19
device_protection tau-correlation: -0.18
tech_support tau-correlation: -0.28
streaming_tv tau-correlation: -0.04
streaming_movies tau-correlation: -0.04
contract tau-correlation: -0.39
paperless_billing tau-correlation: 0.19
payment_method tau-correlation: 0.09
monthly_charges tau-correlation: 0.15
total_charges tau-correlation: -0.19
churn tau-correlation: 1.00


In [134]:
# Columns to keep in the working frame; 'churn' is the prediction target.
columns = [
    'senior_citizen',
    'partner',
    'dependents',
    'tenure',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'contract',
    'paperless_billing',
    'monthly_charges',
    'total_charges',
    'churn',
]

In [135]:
# Drop every column that is not listed in `columns`.
unselected = [name for name in df if name not in columns]
df = df.drop(columns=unselected)

In [136]:
# Display the remaining column labels to confirm the selection.
df.columns

Index(['senior_citizen', 'partner', 'dependents', 'tenure', 'online_security',
       'online_backup', 'device_protection', 'tech_support', 'contract',
       'paperless_billing', 'monthly_charges', 'total_charges', 'churn'],
      dtype='object')

In [137]:
# Numeric features (standardised later).
numCol = ['tenure', 'monthly_charges', 'total_charges']

# Categorical features (one-hot encoded later).
catCol = [
    'senior_citizen',
    'partner',
    'dependents',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'contract',
    'paperless_billing',
]

In [138]:
# Target is the 'churn' column; the features are every other column.
y = df['churn']
X = df.drop(columns=['churn'])

In [139]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Report the shape of each feature split.
for label, features in (('Train Size :', X_train), ('Test Size :', X_test)):
  print(label, features.shape)

Train Size : (5625, 12)
Test Size : (1407, 12)


In [140]:
# Fit the scaler on the training numeric columns only, then apply the same
# transformation to both splits.
sc = StandardScaler().fit(X_train[numCol])
X_train_num, X_test_num = (
    sc.transform(features[numCol]) for features in (X_train, X_test)
)

In [141]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split only and then applied to both splits.
# handle_unknown='ignore' makes transform() emit an all-zero group for a category
# that was not present during fit instead of raising, so the test split (or later
# unseen data) cannot crash the encoder. Output is unchanged when every category
# was seen during fit.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe = ohe.fit(X_train[catCol])
# transform() returns a sparse matrix; densify so it can be concatenated with the
# scaled numeric array.
X_train_cat = ohe.transform(X_train[catCol]).toarray()
X_test_cat = ohe.transform(X_test[catCol]).toarray()

In [142]:
# Final design matrices: scaled numeric columns first, one-hot columns after.
X_train_final = np.concatenate(
    (X_train_num, X_train_cat),
    axis=1,
)
X_test_final = np.concatenate(
    (X_test_num, X_test_cat),
    axis=1,
)

In [159]:
# Resample the training set so the minority class is oversampled with SMOTE.
# NOTE(review): the recorded output below this cell shows 1495 rows per class
# (2990 total), fewer than the 5625 training rows reported earlier. Oversampling
# cannot shrink the data, so that output looks stale — re-run from a fresh kernel
# and confirm which sampler produced the downstream scores.
smote = SMOTE(
    sampling_strategy='minority',
    random_state=42,
)
X_train_balanced, y_train_balanced = smote.fit_resample(
    X_train_final,
    y_train,
)

# Class counts after resampling.
y_train_balanced.value_counts()


No     1495
Yes    1495
Name: churn, dtype: int64

In [160]:
# Logistic regression with default settings, trained on the resampled training set.
logreg = LogisticRegression()
logreg.fit(
    X_train_balanced,
    y_train_balanced,
)

In [161]:
# k-nearest-neighbours classifier with default settings, trained on the
# resampled training set.
knn = KNeighborsClassifier()
knn.fit(
    X_train_balanced,
    y_train_balanced,
)

In [162]:
# Support-vector classifier with default settings, trained on the resampled
# training set.
svm = SVC()
svm.fit(
    X_train_balanced,
    y_train_balanced,
)

In [163]:
# Decision tree trained on the resampled training set.
# random_state is fixed because tree fitting permutes features at each split;
# without a seed, repeated runs can produce different trees and scores.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_balanced, y_train_balanced)

In [164]:
# Random forest trained on the resampled training set.
# random_state is fixed because bootstrapping and feature sub-sampling are
# random; without a seed, every run yields a different forest and different scores.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_balanced, y_train_balanced)

In [165]:
def performance_check(clf, X, y, pos_label='Yes'):
  """Return the precision of a fitted classifier on one data split.

  Parameters
  ----------
  clf : fitted estimator exposing ``predict(X)``.
  X : feature matrix passed to ``clf.predict``.
  y : true labels for the rows of ``X``.
  pos_label : label treated as the positive class when computing precision.
      Defaults to ``'Yes'``, which preserves the original behaviour.

  Returns
  -------
  The value returned by ``precision_score`` for ``pos_label``.
  """
  y_pred = clf.predict(X)

  return precision_score(y, y_pred, pos_label=pos_label)

In [166]:
# Predict once per split and score those stored predictions, rather than
# predicting a second time inside a helper. The predictions stay available for
# any later cell.
# Train precision is measured on the resampled training data, so its class mix
# differs from the test split.
y_pred_train_logreg = logreg.predict(X_train_balanced)
y_pred_test_logreg = logreg.predict(X_test_final)
print('Precision - Train : ', precision_score(y_train_balanced, y_pred_train_logreg, pos_label='Yes'))
print('Precision - Test  : ', precision_score(y_test, y_pred_test_logreg, pos_label='Yes'))

Precision - Train :  0.7390499691548427
Precision - Test  :  0.5042158516020236


In [167]:
# Predict once per split and score those stored predictions, rather than
# predicting a second time inside a helper. The predictions stay available for
# any later cell.
# Train precision is measured on the resampled training data, so its class mix
# differs from the test split.
y_pred_train_knn = knn.predict(X_train_balanced)
y_pred_test_knn = knn.predict(X_test_final)
print('Precision - Train : ', precision_score(y_train_balanced, y_pred_train_knn, pos_label='Yes'))
print('Precision - Test  : ', precision_score(y_test, y_pred_test_knn, pos_label='Yes'))

Precision - Train :  0.7937219730941704
Precision - Test  :  0.47157190635451507


In [168]:
# Predict once per split and score those stored predictions, rather than
# predicting a second time inside a helper. The predictions stay available for
# any later cell.
# Train precision is measured on the resampled training data, so its class mix
# differs from the test split.
y_pred_train_svm = svm.predict(X_train_balanced)
y_pred_test_svm = svm.predict(X_test_final)
print('Precision - Train : ', precision_score(y_train_balanced, y_pred_train_svm, pos_label='Yes'))
print('Precision - Test  : ', precision_score(y_test, y_pred_test_svm, pos_label='Yes'))

Precision - Train :  0.7423312883435583
Precision - Test  :  0.49745331069609505


In [169]:
# Predict once per split and score those stored predictions, rather than
# predicting a second time inside a helper. The predictions stay available for
# any later cell.
# Train precision is measured on the resampled training data, so its class mix
# differs from the test split.
y_pred_train_dt = dt.predict(X_train_balanced)
y_pred_test_dt = dt.predict(X_test_final)
print('Precision - Train : ', precision_score(y_train_balanced, y_pred_train_dt, pos_label='Yes'))
print('Precision - Test  : ', precision_score(y_test, y_pred_test_dt, pos_label='Yes'))

Precision - Train :  0.9993270524899058
Precision - Test  :  0.4249146757679181


In [170]:
# Predict once per split and score those stored predictions, rather than
# predicting a second time inside a helper. The predictions stay available for
# any later cell.
# Train precision is measured on the resampled training data, so its class mix
# differs from the test split.
y_pred_train_rf = rf.predict(X_train_balanced)
y_pred_test_rf = rf.predict(X_test_final)
print('Precision - Train : ', precision_score(y_train_balanced, y_pred_train_rf, pos_label='Yes'))
print('Precision - Test  : ', precision_score(y_test, y_pred_test_rf, pos_label='Yes'))

Precision - Train :  0.9953271028037384
Precision - Test  :  0.4936708860759494
