# Customer Churn Prediction

## Dataset

Telecom Customer Churn Dataset

## Objective 

Predict whether a customer is likely to churn (cancel their subscription) based on features such as usage patterns, demographics, and customer service interactions.
    
    

## Techniques

- Binary classification
- Feature engineering
- Model evaluation
    - Accuracy
    - Precision
    - Recall
    - F1-score
    - ROC-AUC
- Churn analysis
- Customer segmentation

In [1]:
import pandas as pd
import numpy as np
# Load the telecom churn CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/TelecomCustomerChurn.csv')
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
# List every column name in the loaded frame (features plus the 'Churn' target).
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [3]:
# Target vector: encode the churn label as 1 ('Yes') / 0 ('No').
y = df['Churn'].map({'Yes': 1, 'No': 0})
# Series.map turns any label missing from the dict into NaN without warning;
# fail loudly here rather than training on a corrupted target.
assert y.notna().all(), "unexpected label(s) in the 'Churn' column"

# Feature matrix: everything except the target.
X = df.drop(columns=['Churn'])
# errors='coerce' converts entries that cannot be parsed as numbers into NaN,
# so they can be imputed later instead of raising here.
X['TotalCharges'] = pd.to_numeric(X['TotalCharges'], errors='coerce')
# Recode the 0/1 flag as 'No'/'Yes' strings so this column has the same form
# as the other Yes/No columns.
X['SeniorCitizen'] = X['SeniorCitizen'].map({1: 'Yes', 0: 'No'})

X.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,7590-VHVEG,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,3668-QPYBK,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,7795-CFOCW,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,9237-HQITU,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [4]:
# Remove the customer identifier column from the feature matrix.
# errors='ignore' keeps this cell idempotent: X is rebound in place, so without
# it a second run of the cell raises KeyError because the column is already gone.
X = X.drop(columns=['customerID'], errors='ignore')
print(X.head())
# Show the distinct values left in SeniorCitizen after the recoding step.
print(np.unique(X['SeniorCitizen']))


   gender SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female            No     Yes         No       1           No   
1    Male            No      No         No      34          Yes   
2    Male            No      No         No       2          Yes   
3    Male            No      No         No      45           No   
4  Female            No      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1              Yes          No        

In [5]:
# Count missing values per feature column.
print(X.isna().sum())

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Split the feature columns by dtype: numeric columns are mean-imputed and
# standardised, object (string) columns are mode-imputed and one-hot encoded.
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns

categorical_features = X.select_dtypes(include=['object']).columns

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# sparse_threshold=0 forces a dense ndarray result. With the default (0.3) the
# transformer may return a SciPy sparse matrix when the one-hot block is sparse
# enough, and the DataFrame construction below would then fail.
preprocessor = ColumnTransformer(
    [
        ('numerical', numerical_pipeline, numerical_features),
        ('categorical', categorical_pipeline, categorical_features),
    ],
    sparse_threshold=0,
)

# Transform X using the preprocessor.
# NOTE(review): the imputer/scaler/encoder statistics are fitted on ALL rows,
# and the train/test split happens in a later cell, so test rows influence the
# preprocessing (data leakage). Consider splitting first and fitting on the
# training rows only.
X_preprocessed = preprocessor.fit_transform(X)

# Column names produced by the one-hot encoder for the categorical features.
categorical_features_encoded = (
    preprocessor.named_transformers_['categorical']
    .named_steps['encoder']
    .get_feature_names_out(input_features=categorical_features)
)

# ColumnTransformer concatenates its outputs in declaration order, so the
# numeric names come first, followed by the encoded categorical names.
all_cols = numerical_features.tolist() + categorical_features_encoded.tolist()

# Rebuild a labelled DataFrame; reuse X's index so rows stay aligned with y.
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_cols, index=X.index)


X_preprocessed_df.head()


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.994971,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.173876,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.36266,-0.960399,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.1954,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.941193,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. Stratifying on y keeps the churn /
# no-churn ratio the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed_df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

# Candidate models, keyed by a display label. The stochastic estimators are all
# seeded with the same value used for the train/test split and XGBoost so that
# a Restart & Run All reproduces the same scores.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Decision Tree': DecisionTreeClassifier(criterion='entropy', random_state=1),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=1),
    # probability=True enables predict_proba; its internal calibration is
    # randomised, hence the seed.
    'SVC': SVC(probability=True, random_state=1),
    'Gaussian Naive Bayes': GaussianNB(),
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases;
    # confirm against the installed version before removing it.
    'XGBoost': xgb.XGBClassifier(n_estimators=1000, learning_rate=0.01, max_depth=4, random_state=1, use_label_encoder=False),
}

# Hyper-parameter grid for each model; keys must match those of `classifiers`.
param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l2']},
    'Decision Tree': {'max_depth': [3, 4, 5, 6]},
    'KNN': {'n_neighbors': [3, 5, 7, 8, 9, 10, 11], 'p': [1, 2]},
    'Random Forest': {'n_estimators': [100, 500, 1000]},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    'Gaussian Naive Bayes': {'priors': [None, [0.5, 0.5]], 'var_smoothing': [1e-9, 1e-8, 1e-7]},
    'XGBoost': {'n_estimators': [100, 500, 1000], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [1, 2, 3, 4, 5]},
}


In [11]:
from sklearn.model_selection import GridSearchCV
# Tune each candidate model with a 5-fold cross-validated grid search scored on
# accuracy, and remember the best estimator found for every model label.
best_estimators = {}
for clf_label in classifiers:
    clf = classifiers[clf_label]
    print(f"Grid search for {clf_label}...")
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grids[clf_label],
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_estimators[clf_label] = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}\n")

Grid search for Logistic Regression...
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation accuracy: 0.80

Grid search for Decision Tree...
Best parameters: {'max_depth': 4}
Best cross-validation accuracy: 0.79

Grid search for KNN...




Best parameters: {'n_neighbors': 10, 'p': 2}
Best cross-validation accuracy: 0.79

Grid search for Random Forest...
Best parameters: {'n_estimators': 500}
Best cross-validation accuracy: 0.79

Grid search for SVC...
Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation accuracy: 0.80

Grid search for Gaussian Naive Bayes...
Best parameters: {'priors': None, 'var_smoothing': 1e-09}
Best cross-validation accuracy: 0.66

Grid search for XGBoost...
Best parameters: {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 1000}
Best cross-validation accuracy: 0.81



In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble: voting='soft' combines the members' predicted class
# probabilities rather than counting hard majority votes, which is why SVC is
# built with probability=True.
# NOTE(review): this cell rebinds `classifiers` (dict -> list) and `param_grids`
# (per-model dict -> flat dict). The earlier per-model grid-search cell calls
# classifiers.items(), so it fails if re-run after this cell until the
# dict-defining cell is executed again. Consider distinct names here.
classifiers = [
    ('lr', LogisticRegression(max_iter=10000)),
    # Seeded so the ensemble's scores are reproducible between runs.
    ('dt', DecisionTreeClassifier(criterion='entropy', random_state=1)),
    ('knn', KNeighborsClassifier()),
    ('svc', SVC(probability=True, random_state=1)),
    ('gnb', GaussianNB()),
]
mv_clf = VotingClassifier(estimators=classifiers, voting='soft')
# Parameters of the member estimators are addressed as '<member name>__<param>'.
param_grids = {
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'lr__penalty': ['l2'],
    'dt__max_depth': [3, 4, 5, 6],
    'knn__n_neighbors': [3, 5, 7],
    'knn__p': [1, 2],
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [0.1, 1, 10],
    'gnb__priors': [None, [0.5, 0.5]],
    'gnb__var_smoothing': [1e-9, 1e-8, 1e-7],
}

# Perform grid search
# The full Cartesian grid has 9,072 candidates (x5 folds = 45,360 ensemble
# fits). n_jobs=-1 spreads the fits across all CPU cores; the search results
# are unaffected. Shrinking the grid is still advisable.
grid_search = GridSearchCV(mv_clf, param_grids, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_