Develop a model to predict customer churn for a subscription-based service or business. Use historical customer data, including features like usage behaviour and customer demographics, and try algorithms like Logistic Regression, Random Forests, or Gradient Boosting to predict churn.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [None]:
# Load the customer-churn CSV into a DataFrame
DATA_PATH = '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows
dataset.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Size of the dataset as a (number of rows, number of columns) tuple
dataset.shape

(7043, 21)

In [None]:
# Attributes of the dataset: the labels of all columns
dataset.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [None]:
# Dataset info: column names, non-null counts and dtypes.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
dataset.describe(include=[np.number])

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [None]:
# Descriptive statistics (count, unique, top, freq) for the object-dtype features
dataset.describe(include=['object'])

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn
count,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043.0,7043
unique,7043,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4,6531.0,2
top,7590-VHVEG,Male,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,,No
freq,1,3555,3641,4933,6361,3390,3096,3498,3088,3095,3473,2810,2785,3875,4171,2365,11.0,5174


In [None]:
# Frequency of each value in the 'gender' column
dataset.loc[:, 'gender'].value_counts()

gender
Male      3555
Female    3488
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'Partner' column
dataset.loc[:, 'Partner'].value_counts()

Partner
No     3641
Yes    3402
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'Dependents' column
dataset.loc[:, 'Dependents'].value_counts()

Dependents
No     4933
Yes    2110
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'PhoneService' column
dataset.loc[:, 'PhoneService'].value_counts()

PhoneService
Yes    6361
No      682
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'MultipleLines' column
dataset.loc[:, 'MultipleLines'].value_counts()

MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'InternetService' column
dataset.loc[:, 'InternetService'].value_counts()

InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'OnlineSecurity' column
dataset.loc[:, 'OnlineSecurity'].value_counts()

OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'OnlineBackup' column
dataset.loc[:, 'OnlineBackup'].value_counts()

OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'DeviceProtection' column
dataset.loc[:, 'DeviceProtection'].value_counts()

DeviceProtection
No                     3095
Yes                    2422
No internet service    1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'TechSupport' column
dataset.loc[:, 'TechSupport'].value_counts()

TechSupport
No                     3473
Yes                    2044
No internet service    1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'StreamingTV' column
dataset.loc[:, 'StreamingTV'].value_counts()

StreamingTV
No                     2810
Yes                    2707
No internet service    1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'StreamingTV' column
# NOTE(review): this cell repeats the preceding 'StreamingTV' cell; it looks
# redundant and can probably be deleted - confirm.
dataset.loc[:, 'StreamingTV'].value_counts()

StreamingTV
No                     2810
Yes                    2707
No internet service    1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'StreamingMovies' column
dataset.loc[:, 'StreamingMovies'].value_counts()

StreamingMovies
No                     2785
Yes                    2732
No internet service    1526
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'PaperlessBilling' column
dataset.loc[:, 'PaperlessBilling'].value_counts()

PaperlessBilling
Yes    4171
No     2872
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'PaymentMethod' column
dataset.loc[:, 'PaymentMethod'].value_counts()

PaymentMethod
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64

In [None]:
# Frequency of each value in the 'Contract' column
dataset.loc[:, 'Contract'].value_counts()

Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

In [None]:
# Frequency of each value in the target column 'Churn'
dataset.loc[:, 'Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [None]:
# Assign input and output variables.
# The target is selected by name rather than by position, so the assignment
# stays correct even if the column order of the CSV changes.
TARGET_COLUMN = 'Churn'
input_features = dataset.drop(columns=TARGET_COLUMN)
output_variable = dataset[TARGET_COLUMN]

In [None]:
# Split the dataset into train (60%) and test (40%) parts.
# stratify keeps the proportion of 'Yes'/'No' churn labels (approximately) the
# same in both parts; the recorded value counts show the target is imbalanced
# (5174 'No' vs 1869 'Yes'). random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    input_features,
    output_variable,
    test_size=0.4,
    stratify=output_variable,
    random_state=5,
)

In [None]:
# Prepare the non-binary text columns of X_train / X_test:
#   * 'TotalCharges' is converted to a real number,
#   * every multi-category column is replaced by integer codes.

# Categorical columns that have more than two distinct values
columns_to_encode = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                     'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                     'Contract', 'PaymentMethod']

# Work on explicit copies so the column assignments below cannot trigger
# pandas' SettingWithCopyWarning if the splits are still linked to `dataset`.
X_train = X_train.copy()
X_test = X_test.copy()

# 'TotalCharges' is an amount that was read as text (it shows up among the
# object columns with thousands of unique values and a blank most-frequent
# entry). Label-encoding it would turn the amounts into arbitrary category
# codes, so it is parsed as a number instead; unparseable entries become NaN.
# NOTE(review): the blanks are filled with 0.0 on the assumption that they mean
# "nothing billed yet" - confirm against the data source.
for split in (X_train, X_test):
    split['TotalCharges'] = pd.to_numeric(split['TotalCharges'], errors='coerce').fillna(0.0)

# Apply a LabelEncoder to each categorical column. The encoder is fitted on the
# values of both splits so that transform() cannot fail on a category that
# happens to be missing from the training part.
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([X_train[column], X_test[column]]))
    X_train[column] = label_encoder.transform(X_train[column])
    X_test[column] = label_encoder.transform(X_test[column])

In [None]:
# Label-encode the two-valued columns: the mapping is learned from the training
# split and then reused unchanged for the test split.
binary_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
label_encoder = LabelEncoder()
for col_name in binary_columns:
    label_encoder.fit(X_train[col_name])
    X_train[col_name] = label_encoder.transform(X_train[col_name])
    X_test[col_name] = label_encoder.transform(X_test[col_name])

In [None]:
# Inspect the result of the encoding cells. They modify X_train / X_test (not
# `dataset`), so the training split is the frame to look at; head() shows a
# small sample instead of dumping the whole frame as plain text.
X_train.head()

      customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0     7590-VHVEG       0              0        1           0       1   
1     5575-GNVDE       1              0        0           0      34   
2     3668-QPYBK       1              0        0           0       2   
3     7795-CFOCW       1              0        0           0      45   
4     9237-HQITU       0              0        0           0       2   
...          ...     ...            ...      ...         ...     ...   
7038  6840-RESVB       1              0        1           1      24   
7039  2234-XADUH       0              0        1           1      72   
7040  4801-JZAZL       0              0        1           1      11   
7041  8361-LTMKD       1              1        1           0       4   
7042  3186-AJIEK       1              0        0           0      66   

      PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0                0  No phone service             DSL       

In [None]:
# Remove the 'customerID' column from both splits before scaling
# (errors='ignore' turns this into a no-op when the column is already gone)
X_train = X_train.drop(columns=['customerID'], errors='ignore')
X_test = X_test.drop(columns=['customerID'], errors='ignore')

In [None]:
# Feature scaling: the scaler is fitted on the training split only and the
# fitted transformation is then applied to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Logistic Regression Algorithm

In [None]:
# Suggesting best parameters for Logistic Regression.
# The grid is split by solver because not every solver supports every penalty:
#   * liblinear and saga accept both 'l1' and 'l2',
#   * newton-cg, lbfgs and sag accept only 'l2',
#   * 'elasticnet' is only available with saga and additionally needs l1_ratio.
# The unpenalised option is left out because its spelling differs between
# scikit-learn releases ('none' in older ones, None in newer ones).
c_values = [0.01, 0.1, 1, 10]
grid_params = [
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': c_values},
]

# A fresh estimator is created here: the `logistic_regression` variable is only
# defined in a later cell, so referring to it would raise NameError on a clean
# run. max_iter is raised above the default to give sag/saga room to converge.
grid_search_lr = GridSearchCV(LogisticRegression(max_iter=1000), grid_params, cv=5)
grid_search_lr.fit(X_train, y_train)

print(f"Best parameters for Logistic Regression: {grid_search_lr.best_params_}")


In [None]:
# Train Logistic Regression with the selected hyper-parameters and evaluate it
# on the held-out test split.
logistic_regression = LogisticRegression(C=0.01, penalty='l2', solver='newton-cg')
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = round(accuracy_score(y_test, y_pred_lr) * 100, 2)
print(f"Accuracy of Logistic Regression: {accuracy_lr}%")

# Accuracy alone says little when one class dominates the target, so the
# per-class precision / recall / F1 scores are reported as well.
print(classification_report(y_test, y_pred_lr))

Accuracy of Logistic Regression: 79.84%


Random Forest Algorithm

In [None]:
# Suggesting best parameters for Random Forest
grid_params = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
}

# A fresh estimator is created here: the `random_forest` variable is only
# defined in a later cell, so referring to it would raise NameError on a clean
# run. The fixed random_state makes the search result reproducible.
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=5), grid_params, cv=5)
grid_search_rf.fit(X_train, y_train)

print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")

Best parameters for Random Forest: {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}


In [None]:
# Train Random Forest with the selected hyper-parameters and evaluate it on the
# held-out test split. random_state fixes the forest's internal randomness so
# the reported score is the same on every run.
random_forest = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=10,
                                       random_state=5)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = round(accuracy_score(y_test, y_pred_rf) * 100, 2)
print(f"Accuracy of Random Forest: {accuracy_rf}%")

# Accuracy alone says little when one class dominates the target, so the
# per-class precision / recall / F1 scores are reported as well.
print(classification_report(y_test, y_pred_rf))

Accuracy of Random Forest: 79.95%


Gradient Boosting

In [None]:
# Suggesting best parameters for Gradient Boosting
# (GradientBoostingClassifier is already imported in the notebook's import cell)
grid_params = {
    'learning_rate': [0.1, 0.2, 0.3, 0.4],
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 15],
}

# The fixed random_state makes the search result reproducible.
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=5), grid_params, cv=5)
grid_search_gb.fit(X_train, y_train)

print(f"Best parameters for Gradient Boosting: {grid_search_gb.best_params_}")

In [None]:
# Train Gradient Boosting with the selected hyper-parameters and evaluate it on
# the held-out test split. random_state fixes the model's internal randomness
# so the reported score is the same on every run.
gradient_boost = GradientBoostingClassifier(learning_rate=0.2, n_estimators=100, max_depth=10,
                                            random_state=5)
gradient_boost.fit(X_train, y_train)
y_pred_gb = gradient_boost.predict(X_test)
accuracy_gb = round(accuracy_score(y_test, y_pred_gb) * 100, 2)
print(f"Accuracy of Gradient Boosting: {accuracy_gb}%")

# Accuracy alone says little when one class dominates the target, so the
# per-class precision / recall / F1 scores are reported as well.
print(classification_report(y_test, y_pred_gb))

Accuracy of Gradient Boosting: 77.68%


Accuracy Score of All Algorithms

In [None]:
# Collect the test accuracy of every model in one comparison table
model_names = ["Logistic Regression", "Random Forest", "Gradient Boosting"]
model_accuracies = [accuracy_lr, accuracy_rf, accuracy_gb]
results = dict(Model=model_names, Accuracy=model_accuracies)

# Tabular form of the results
df_results = pd.DataFrame(results)

# Show the complete table as plain text
print(df_results.to_string())

                 Model  Accuracy
0  Logistic Regression     79.84
1        Random Forest     79.95
2    Gradient Boosting     77.68
