In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive so the dataset stored there can be read from Colab
drive.mount('/content/drive')
file_path = '--------------------------'
# Load dataset
df = pd.read_csv(file_path)

# First look at data
print("Dataset shape:", df.shape)
display(df.head())

# Data types and null values.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line after the report).
print("\nInfo:")
df.info()

# Summary statistics (include="all" also summarises non-numeric columns)
print("\nSummary statistics:")
display(df.describe(include="all"))

# Missing values check
print("\nMissing values per column:")
print(df.isnull().sum())

# Numerical variables: names of the int64 / float64 columns
num = list(df.select_dtypes(include=['int64','float64']).keys())

# Categorical variables: names of the object-dtype columns
cat = list(df.select_dtypes(include='O').keys())

print("Categorical features: ",cat)
print("Numerical: ",num)

# value_counts of the categorical columns
for i in cat:
    print(df[i].value_counts())

In [None]:
# Drop exact duplicate rows. Assigning the result (instead of inplace=True)
# keeps the step safe to re-run.
# NOTE(review): this runs while cx_id is still present, so rows that differ
# only in cx_id are not treated as duplicates - confirm that is intended.
df = df.drop_duplicates()

# Remove cx_id; errors="ignore" lets the cell be re-executed after the
# column is already gone instead of failing with a KeyError.
df = df.drop(columns="cx_id", errors="ignore")

# Ensure numeric columns are clean: entries that cannot be parsed become NaN
numeric_cols = ['tenure', 'monthly_bill', 'total_bill']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype("float")

print("Missing values per column after cleansing:\n")
print(df.isnull().sum())

# Encoding categorical variables
# First convert all values to trimmed, lowercase text so that the different
# spellings of yes/no compare equal.
senior_text = df['senior_citizen'].astype(str).str.strip().str.lower()

# Replace the yes/no spellings with 1/0; any other value is passed through.
senior_codes = {'yes': 1, 'y': 1, 'no': 0, 'n': 0}
senior_flag = senior_text.map(lambda value: senior_codes.get(value, value))

# astype(str) above turns missing values into the literal text 'nan', which a
# plain astype(int) cannot parse. Coerce to numeric so that such entries (and
# any other unrecognised text) become NaN instead of raising.
senior_flag = pd.to_numeric(senior_flag, errors='coerce')

# Impute the unresolved entries with the most frequent value so the column
# can be stored as plain integers.
unresolved = int(senior_flag.isna().sum())
if unresolved:
    senior_mode = senior_flag.mode()
    if senior_mode.empty:
        raise ValueError("senior_citizen has no usable values to encode")
    print(f"senior_citizen: imputing {unresolved} missing/unrecognised value(s) with the mode")
    senior_flag = senior_flag.fillna(senior_mode.iloc[0])

# Finally, ensure integers
df['senior_citizen'] = senior_flag.astype(int)

# A bare `df.head()` in the middle of a cell displays nothing, so show the
# preview explicitly.
display(df.head())

# Categorical variables still present after the encoding above
cat = list(df.select_dtypes(include='O').keys())
# value_counts of the categorical columns
for i in cat:
    print(df[i].value_counts())

In [None]:
# Check again how many missing values
print("Missing values before filling:\n", df.isnull().sum())

# Fill numeric columns with the column MEAN (not the median), rounded to the
# number of decimals given for each column below.
mean_round_digits = {'tenure': 0, 'monthly_bill': 2, 'total_bill': 2}
for numeric_col, digits in mean_round_digits.items():
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean().round(digits))

# Fill categorical (object-dtype) columns with their most frequent value.
# NOTE(review): if 'churn' is still an object column at this point it is
# imputed here as well, i.e. missing targets receive the majority label -
# confirm that this is acceptable.
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    col_modes = df[col].mode()
    if col_modes.empty:
        # A column with no non-missing value has no mode; indexing [0] on the
        # empty result would raise, so leave such a column untouched.
        continue
    df[col] = df[col].fillna(col_modes.iloc[0])

# Check again after filling
print("\nMissing values after filling:\n", df.isnull().sum())


In [None]:
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Separate the target column from the predictors; the predictors are one-hot
# encoded with the first level of every categorical column dropped.
y = df['churn']
X = pd.get_dummies(df.drop(columns=['churn']), drop_first=True)

# Hold out 20% for validation; stratifying on y keeps the original class
# ratio in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)
print("Class distribution BEFORE balancing (train):", Counter(y_train))

# Randomly oversample the training part only. The balanced features are kept
# in X_train_bal, while y_train is replaced by the balanced labels
# (X_train itself is not reassigned in this cell).
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train)
y_train = y_train_bal
print("Class distribution AFTER balancing (train):", Counter(y_train))
df.head()

In [None]:
#Standard libraries for data analysis:----------------------

import numpy as np
from sklearn.preprocessing import StandardScaler


#sklearn modules for Model Selection--------------------------------------

from sklearn import svm, tree, linear_model, neighbors
from sklearn import naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#sklearn modules for Model Evaluation & Improvement---------------------------

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn import metrics

from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import make_scorer, recall_score


In [None]:
# Feature Scaling
#
# The scaler is fitted on the balanced training features only and then applied
# to the validation features. StandardScaler returns a bare array, so the
# column names and the row index are re-attached from the frame that was
# scaled. (Assigning a frame's own `.columns.values` / `.index.values` back to
# itself would be a no-op and leave the features named 0..n-1.)

sc_X = StandardScaler()
X_train = pd.DataFrame(
    sc_X.fit_transform(X_train_bal),
    columns=X_train_bal.columns,
    index=X_train_bal.index,
)

# NOTE: X_valid is overwritten with its scaled version, so re-running this
# cell without re-running the train/validation split scales it a second time.
X_valid = pd.DataFrame(
    sc_X.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

# Convert y_valid to numerical labels: 'Yes' -> 1, anything else -> 0.
# Accepting an already-encoded 1 keeps the mapping stable when the cell is
# executed again (otherwise every label would silently become 0).
y_valid = y_valid.apply(lambda x: 1 if x in ('Yes', 1) else 0)


In [None]:
# Check each Model for Train data set
# Compare Baseline Classification Algorithms - First Iteration
# Using Accuracy and ROC AUC Mean Metrics

from sklearn import model_selection

# Candidate baseline models as (display name, unfitted estimator) pairs
models = [
    ('Logistic Regression', LogisticRegression(solver='liblinear', random_state=0,
                                               class_weight='balanced')),
    ('SVC', SVC(kernel='linear', random_state=0)),
    ('Kernel SVM', SVC(kernel='rbf', random_state=0)),
    ('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
    ('Gaussian NB', GaussianNB()),
    ('Decision Tree Classifier',
     DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('Random Forest', RandomForestClassifier(
        n_estimators=100, criterion='entropy', random_state=0)),
]

# Evaluating Model Results:
acc_results = []
auc_results = []
names = []
# columns of the table that is populated with the performance results
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD',
       'Accuracy Mean', 'Accuracy STD']

# 10-fold cross-validation, stratified and shuffled.
# The training set was oversampled without being shuffled afterwards (the
# resampled minority rows are expected to sit at the end of the frame), so
# contiguous unshuffled folds can contain a single class, which leaves ROC AUC
# undefined for those folds. Stratifying keeps both classes in every fold.
# NOTE(review): the folds are still drawn from already-oversampled data, so a
# duplicated row can land in both the training and the test part of a split;
# the scores are therefore optimistic.
kfold = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

rows = []
for name, model in models:
    # One cross-validation pass yields both metrics on identical splits
    # (instead of fitting every model twice, once per metric).
    cv_scores = model_selection.cross_validate(
        model, X_train, y_train, cv=kfold, scoring=['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']

    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    # Means / standard deviations are reported in percent with two decimals
    rows.append([name,
                 round(cv_auc_results.mean()*100, 2),
                 round(cv_auc_results.std()*100, 2),
                 round(cv_acc_results.mean()*100, 2),
                 round(cv_acc_results.std()*100, 2)])

# Build the table once from the collected rows
model_results = pd.DataFrame(rows, columns=col)

model_results.sort_values(by=['ROC AUC Mean'], ascending=False)

In [None]:
# Check each Model for Test data set
#
# Every candidate is fitted on the (balanced, scaled) training data and scored
# on the validation split. The per-model steps are identical, so they live in
# one helper that is applied to a list of (label, estimator) pairs.

METRIC_COLUMNS = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score']


def _encode_churn(labels):
    """Return `labels` as a Series of 1 (label == 'Yes') / 0 (any other label)."""
    return pd.Series(labels).apply(lambda x: 1 if x == 'Yes' else 0)


def _score_on_validation(label, estimator):
    """Fit `estimator` on the training data and score it on the validation data.

    Reads X_train, y_train, X_valid and y_valid from the notebook namespace.

    Returns a tuple (row, predictions): `row` is a list ordered like
    METRIC_COLUMNS, `predictions` are the 0/1 encoded validation predictions.
    """
    estimator.fit(X_train, y_train)
    predictions = _encode_churn(estimator.predict(X_valid))
    row = [
        label,
        accuracy_score(y_valid, predictions),
        precision_score(y_valid, predictions),
        recall_score(y_valid, predictions),
        f1_score(y_valid, predictions),
        fbeta_score(y_valid, predictions, beta=2.0),
    ]
    return row, predictions


# (label shown in the results table, unfitted estimator)
validation_models = [
    # Logistic Regression
    ('Logistic Regression', LogisticRegression(random_state=0)),
    # Support Vector Machine (linear classifier)
    ('SVM (Linear)', SVC(kernel='linear', random_state=0)),
    # K-Nearest Neighbours
    ('K-Nearest Neighbours', KNeighborsClassifier(n_neighbors=22, metric='minkowski', p=2)),
    # Kernel SVM
    ('Kernel SVM', SVC(kernel='rbf', random_state=0)),
    # Naive Bayes
    ('Naive Byes', GaussianNB()),
    # Decision Tree
    ('Decision Tree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    # Random Forest
    ('Random Forest', RandomForestClassifier(n_estimators=72, criterion='entropy', random_state=0)),
]

# `classifier` and `y_pred` end up holding the last model (Random Forest) and
# its encoded predictions. The per-model rows are collected in a list so the
# table is built once, and the cross-validation table `model_results` is no
# longer overwritten by this cell.
metric_rows = []
for model_label, classifier in validation_models:
    metric_row, y_pred = _score_on_validation(model_label, classifier)
    metric_rows.append(metric_row)

results = pd.DataFrame(metric_rows, columns=METRIC_COLUMNS)

# Show the comparison; the next cell reassigns `results`, so without this the
# table would never be displayed.
results

In [None]:
# Train & evaluate the chosen model

# Logistic Regression with the L2 penalty, fitted on the training dataset
# (fit() returns the estimator itself, so the calls can be chained).
classifier = LogisticRegression(random_state=0, penalty='l2').fit(X_train, y_train)

# Predict the validation labels and encode them: 'Yes' -> 1, anything else -> 0
y_pred = (pd.Series(classifier.predict(X_valid)) == 'Yes').astype('int64')

# Evaluate the model on the validation set
acc, prec, rec, f1 = (
    metric(y_valid, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
f2 = fbeta_score(y_valid, y_pred, beta=2.0)

# One-row summary table for the chosen model
results = pd.DataFrame({
    'Model': ['Logistic Regression'],
    'Accuracy': [acc],
    'Precision': [prec],
    'Recall': [rec],
    'F1 Score': [f1],
    'F2 Score': [f2],
})

print(results)

In [None]:
# Re-check with 10-fold cross-validation on the training data;
# the spread is reported as two standard deviations of the fold scores.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
cv_mean = accuracies.mean()
cv_spread = accuracies.std() * 2
print("Logistic Regression Classifier Accuracy: %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

In [None]:
# Hyper parameter Tuning


# First Iteration:

# Select Regularization Method
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = [0.001, 0.01, 0.1, 1, 1.5, 10, 100]

# Combine Parameters.
# `classifier` is a LogisticRegression with the default solver, which does not
# support the L1 penalty: in a flat C x penalty grid every 'l1' candidate
# fails to fit and is scored NaN, so L1 is never actually compared. The grid
# is therefore split in two, with the L1 half switched to the 'liblinear'
# solver (which supports L1); the L2 half keeps the estimator's own solver.
parameters = [
    {'C': C, 'penalty': ['l1'], 'solver': ['liblinear']},
    {'C': C, 'penalty': ['l2']},
]

# Exhaustive search with 10-fold cross-validation, ranked by balanced accuracy
lr_classifier = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = "balanced_accuracy",
                           cv = 10,
                           n_jobs = -1)

lr_classifier = lr_classifier.fit(X_train, y_train)

# Best mean cross-validated balanced accuracy and the parameters reaching it
lr_best_accuracy = lr_classifier.best_score_
lr_best_parameters = lr_classifier.best_params_
lr_best_accuracy, lr_best_parameters



In [None]:
# Second iteration: narrower search over C with the L2 penalty only

# Regularization hyperparameter space
C = [ 0.12, 0.155, 0.16, 0.18, 0.19]

# Regularization method
penalty = ['l2']

# Parameter grid handed to the search
parameters = {'C': C, 'penalty': penalty}

# 10-fold grid search ranked by balanced accuracy; fit() returns the fitted
# search object, so construction and fitting are chained.
lr_classifier = GridSearchCV(
    classifier,
    parameters,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Best mean cross-validated score and the parameter combination reaching it
lr_best_accuracy, lr_best_parameters = lr_classifier.best_score_, lr_classifier.best_params_
lr_best_accuracy, lr_best_parameters


In [None]:
# Final Hyper parameter tuning and selection
#
# Refit Logistic Regression with the parameters selected by the most recent
# grid search (`lr_best_parameters`). Hard-coding penalty='l2', C=1 here would
# rebuild the untuned model and make the tuning cells above irrelevant.
lr_classifier = LogisticRegression(random_state=0, **lr_best_parameters)
lr_classifier.fit(X_train, y_train)


# Predict the validation set results and encode them: 'Yes' -> 1, else 0
y_pred = lr_classifier.predict(X_valid)
y_pred = pd.Series(y_pred).apply(lambda x: 1 if x == 'Yes' else 0)

# Probability score: second column of predict_proba, i.e. the class stored at
# lr_classifier.classes_[1]
# NOTE(review): presumably that class is 'Yes' - confirm against classes_.
y_pred_probs = lr_classifier.predict_proba(X_valid)
y_pred_probs = y_pred_probs[:, 1]

# Evaluate results
acc = accuracy_score(y_valid, y_pred)
prec = precision_score(y_valid, y_pred)
rec = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
f2 = fbeta_score(y_valid, y_pred, beta=2.0)

results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1, f2]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
results = results.sort_values(["Precision", "Recall", "F2 Score"], ascending = False)


print(results)

In [None]:
# Compare the predictions against the validation labels.
# Revalidate the final results with a confusion matrix
# (y_valid supplies the true labels, y_pred the predicted ones).
cm = confusion_matrix(y_true=y_valid, y_pred=y_pred)
print(cm)