In [105]:
# Importing essential libraries
import pandas as pd
import numpy as np
# Importing Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Importing ML Algorithms 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

Loading both datasets

In [113]:
# Read the training and testing CSV files from the current working directory.
# NOTE(review): test_df is loaded here but not used in the cells visible in this section.
train_df = pd.read_csv('customer_churn_dataset-training-master.csv')
test_df = pd.read_csv('customer_churn_dataset-testing-master.csv')

In [114]:
# Preview the first rows of the training data to check column names and value types.
train_df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [115]:
# Remove the identifier column and 'Last Interaction' so they are not used as features.
# The result is rebound instead of mutating in place, and errors='ignore' makes the
# cell safe to re-run: a second execution is a no-op rather than a KeyError on
# columns that were already dropped.
train_df = train_df.drop(columns=['CustomerID', 'Last Interaction'], errors='ignore')


In [116]:
# One-hot encode the string-valued columns. drop_first=True omits the first level
# of each column, so a column with k categories yields k-1 indicator columns.
categorical_cols = ['Gender', 'Subscription Type', 'Contract Length']
train_df = pd.get_dummies(
    data=train_df,
    columns=categorical_cols,
    drop_first=True,
)


In [118]:
# List every column's dtype to confirm the categorical columns were replaced by
# indicator columns (they are reported as bool in the output below).
print(train_df.dtypes)


Age                           float64
Tenure                        float64
Usage Frequency               float64
Support Calls                 float64
Payment Delay                 float64
Total Spend                   float64
Churn                         float64
Gender_Male                      bool
Subscription Type_Premium        bool
Subscription Type_Standard       bool
Contract Length_Monthly          bool
Contract Length_Quarterly        bool
dtype: object


Convert bool columns to int

In [119]:
# Collect the names of every boolean-typed column (the one-hot indicators).
bool_cols = train_df.select_dtypes(include='bool').columns

# Cast exactly those columns to int (True -> 1, False -> 0); all other columns
# keep their dtype.
train_df = train_df.astype(dict.fromkeys(bool_cols, int))


Split Features and Target Again (Clean)


In [122]:
# Drop rows where the target is missing; the classifiers below cannot be fit on NaN labels.
train_df = train_df.dropna(subset=['Churn'])

# Build the feature matrix (every column except the target) and the target vector.
# This is done once, in a single cell, so there is only one definition of X and y.
X = train_df.drop(columns='Churn')
y = train_df['Churn']

# Sanity checks: the target must not leak into the features, and no label may be missing.
assert 'Churn' not in X.columns
assert y.notna().all()


In [124]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. stratify=y keeps the churn class
# proportions the same in both splits; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Scale the data

In [125]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so the test data never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Training the model

In [126]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline trained on the standardized features.
# max_iter is raised to 1000 iterations; random_state is fixed for reproducibility.
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
)
model.fit(X_train_scaled, y_train)


Evaluating the model's performance

In [127]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict labels for the held-out split; the model was fit on scaled features,
# so it must be given the scaled test matrix.
y_pred = model.predict(X_test_scaled)

# Compute each metric once, then report them.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)

print("Accuracy:", lr_accuracy)
print("Confusion Matrix:\n", lr_confusion)
print("Classification Report:\n", lr_report)


Accuracy: 0.8896185647691313
Confusion Matrix:
 [[34362  3805]
 [ 5927 44073]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.85      0.90      0.88     38167
         1.0       0.92      0.88      0.90     50000

    accuracy                           0.89     88167
   macro avg       0.89      0.89      0.89     88167
weighted avg       0.89      0.89      0.89     88167



Let's train other models and compare their performance to decide which one to use.

Training with Random Forest Classifier

In [129]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, fit on the unscaled training
# features (X_train, not X_train_scaled).
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

# Predictions on the unscaled held-out features, matching how the forest was trained.
y_rf_pred = rf.predict(X_test)


Evaluating its performance

In [130]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the random forest on the held-out split.
rf_report = classification_report(y_true=y_test, y_pred=y_rf_pred)
print(rf_report)

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     38167
         1.0       1.00      0.98      0.99     50000

    accuracy                           0.99     88167
   macro avg       0.99      0.99      0.99     88167
weighted avg       0.99      0.99      0.99     88167



Let's save the model for further use

In [131]:
import joblib

# Persist the best-performing estimator. The random forest (`rf`) scored 0.99
# accuracy on the held-out split versus 0.89 for the logistic regression (`model`),
# so `rf` is the one written to 'best_churn_model.pkl'.
# `rf` was fit on the unscaled features, so it can be loaded and used without the
# StandardScaler; the logistic regression would also have required saving `scaler`.
joblib.dump(rf, 'best_churn_model.pkl')


['best_churn_model.pkl']