##
Customer Churn analysis and prediction

In [1]:
#'Customer attrition, also known as customer churn, customer turnover, or customer defection, is the loss of clients or customers.
#A high churn means that more customers no longer want to purchase goods and services from the business.

##The primary objective of building a customer churn predictive model is to retain customers at the highest risk of churn by proactively engaging with them.

In [2]:
directory = './'

In [3]:
import pandas as pd
import os
import numpy as np

In [4]:
files = os.listdir(directory)

In [5]:
files

['.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.matplotlib',
 '3D Objects',
 'AppData',
 'Application Data',
 'Banks.csv',
 'Bank_Churn.ipynb',
 'Contacts',
 'Cookies',
 'csv.ipynb',
 'DATA-Copy1.csv',
 'Data_Analysis_Ecomm.ipynb',
 'Desktop',
 'Documents',
 'Downloads',
 'Ecom_Sales.csv',
 'example.csv',
 'excel_in_Py.ipynb',
 'excel_in_python.ipynb',
 'Favorites',
 'IntelGraphicsProfiles',
 'Links',
 'Local Settings',
 'mapIT.ipynb',
 'Microsoft',
 'ML_1.ipynb',
 'Music',
 'mu_code',
 'My Documents',
 'NetHood',
 'NTUSER.DAT',
 'ntuser.dat.LOG1',
 'ntuser.dat.LOG2',
 'NTUSER.DAT{d5cbfabe-f841-11ec-a36f-c84bd614b087}.TM.blf',
 'NTUSER.DAT{d5cbfabe-f841-11ec-a36f-c84bd614b087}.TMContainer00000000000000000001.regtrans-ms',
 'NTUSER.DAT{d5cbfabe-f841-11ec-a36f-c84bd614b087}.TMContainer00000000000000000002.regtrans-ms',
 'ntuser.ini',
 'OneDrive',
 'OneDrive - Reward360 Global Services Pvt Ltd',
 'panda.ipynb',
 'Pictures',
 'PrintHood',
 'python',
 'Recent',
 'Reward360 Global Servic

In [6]:
TL='TL.csv'

In [7]:
df=pd.read_csv(TL)

**Step 1: Data Understanding and Exploration**

In [8]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
# Display basic statistics of the dataset
basic_stats = df.describe(include='all')
basic_stats


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7043,7043,7043.0,7043,7043,7043.0,7043,7043,7043,7043,...,7043,7043,7043,7043,7043,7043,7043,7043.0,7043.0,7043
unique,7043,2,,2,2,,2,3,3,3,...,3,3,3,3,3,2,4,,6531.0,2
top,3186-AJIEK,Male,,No,No,,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,,20.2,No
freq,1,3555,,3641,4933,,6361,3390,3096,3498,...,3095,3473,2810,2785,3875,4171,2365,,11.0,5174
mean,,,0.162147,,,32.371149,,,,,...,,,,,,,,64.761692,,
std,,,0.368612,,,24.559481,,,,,...,,,,,,,,30.090047,,
min,,,0.0,,,0.0,,,,,...,,,,,,,,18.25,,
25%,,,0.0,,,9.0,,,,,...,,,,,,,,35.5,,
50%,,,0.0,,,29.0,,,,,...,,,,,,,,70.35,,
75%,,,0.0,,,55.0,,,,,...,,,,,,,,89.85,,


In [10]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Display columns with missing values, if any
missing_values[missing_values > 0]


Series([], dtype: int64)

In [11]:
data_types=df.dtypes

In [12]:
data_types

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

**Step 2: Data Preprocessing**

In [13]:
## First I will handle categorical variables. We will use one-hot encodding method (label encoding) to give values to categorical variables in our dta set. 

In [14]:
df['TotalCharges'].unique()

array(['29.85', '1889.5', '108.15', ..., '346.45', '306.6', '6844.5'],
      dtype=object)

In [15]:
empty_strings_count = (df['TotalCharges'] == ' ').sum()
print("Number of empty strings:", empty_strings_count)

Number of empty strings: 11


In [16]:
# Replace empty strings with NaN using .loc
df.loc[df['TotalCharges'] == ' ', 'TotalCharges'] = np.nan

# Convert TotalCharges to float64
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [17]:
## As we have now given correct data types to all the attributes (column), now let us encode the categorical attritbutes/variables for further analysis. 

In [18]:
# Define categorical columns to encode

In [19]:
cat_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
               'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
               'PaperlessBilling', 'PaymentMethod']

In [20]:
dfen = pd.get_dummies(df, columns=cat_columns, drop_first=True)

In [21]:
dfen.head(5)

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,No,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
1,5575-GNVDE,0,34,56.95,1889.5,No,True,False,False,True,...,False,False,False,False,True,False,False,False,False,True
2,3668-QPYBK,0,2,53.85,108.15,Yes,True,False,False,True,...,False,False,False,False,False,False,True,False,False,True
3,7795-CFOCW,0,45,42.3,1840.75,No,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,9237-HQITU,0,2,70.7,151.65,Yes,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False


In [22]:
pd.set_option('display.max_columns', None)

In [23]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [24]:
num_columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [25]:
scaler = StandardScaler()

In [26]:
dfen[num_columns] = scaler.fit_transform(dfen[num_columns])

In [27]:
##Creating new feature (attribute) for average monthly spend

In [28]:
dfen['AverageMonthlySpend'] = dfen['TotalCharges'] / dfen['tenure']

In [29]:
dfen['Churn'] = dfen['Churn'].replace({'Yes': 1, 'No': 0})

# Check the unique values in 'Churn' after encoding
print(dfen['Churn'].unique())

[0 1]


  dfen['Churn'] = dfen['Churn'].replace({'Yes': 1, 'No': 0})


In [30]:
print(dfen.dtypes)

customerID                                object
SeniorCitizen                            float64
tenure                                   float64
MonthlyCharges                           float64
TotalCharges                             float64
Churn                                      int64
gender_Male                                 bool
Partner_Yes                                 bool
Dependents_Yes                              bool
PhoneService_Yes                            bool
MultipleLines_No phone service              bool
MultipleLines_Yes                           bool
InternetService_Fiber optic                 bool
InternetService_No                          bool
OnlineSecurity_No internet service          bool
OnlineSecurity_Yes                          bool
OnlineBackup_No internet service            bool
OnlineBackup_Yes                            bool
DeviceProtection_No internet service        bool
DeviceProtection_Yes                        bool
TechSupport_No inter

In [31]:
# Select numeric columns
numeric_columns = dfen.select_dtypes(include=['int64', 'float64']).columns

# Calculate correlation coefficients with Churn for numeric columns
correlations = dfen[numeric_columns].corr()['Churn'].abs().sort_values(ascending=False)

# Display correlation coefficients
print("Correlation Coefficients with Churn for Numeric Columns:")
print(correlations)

Correlation Coefficients with Churn for Numeric Columns:
Churn                  1.000000
tenure                 0.352229
TotalCharges           0.199484
MonthlyCharges         0.193356
SeniorCitizen          0.150889
AverageMonthlySpend    0.003813
Name: Churn, dtype: float64


In [32]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pandas as pd

In [33]:
X = dfen.drop(columns=['Churn'])
y = dfen['Churn']

In [34]:
dfen = dfen.drop(columns=['customerID'])

In [35]:
print(dfen.dtypes)

SeniorCitizen                            float64
tenure                                   float64
MonthlyCharges                           float64
TotalCharges                             float64
Churn                                      int64
gender_Male                                 bool
Partner_Yes                                 bool
Dependents_Yes                              bool
PhoneService_Yes                            bool
MultipleLines_No phone service              bool
MultipleLines_Yes                           bool
InternetService_Fiber optic                 bool
InternetService_No                          bool
OnlineSecurity_No internet service          bool
OnlineSecurity_Yes                          bool
OnlineBackup_No internet service            bool
OnlineBackup_Yes                            bool
DeviceProtection_No internet service        bool
DeviceProtection_Yes                        bool
TechSupport_No internet service             bool
TechSupport_Yes     

In [37]:
for column in dfen.columns:
    dfen[column] = dfen[column].apply(pd.to_numeric, errors='coerce')

In [38]:
print(dfen.isnull().sum())

SeniorCitizen                             0
tenure                                    0
MonthlyCharges                            0
TotalCharges                             11
Churn                                     0
gender_Male                               0
Partner_Yes                               0
Dependents_Yes                            0
PhoneService_Yes                          0
MultipleLines_No phone service            0
MultipleLines_Yes                         0
InternetService_Fiber optic               0
InternetService_No                        0
OnlineSecurity_No internet service        0
OnlineSecurity_Yes                        0
OnlineBackup_No internet service          0
OnlineBackup_Yes                          0
DeviceProtection_No internet service      0
DeviceProtection_Yes                      0
TechSupport_No internet service           0
TechSupport_Yes                           0
StreamingTV_No internet service           0
StreamingTV_Yes                 

In [39]:
# Impute missing values with the mean of the column
dfen['TotalCharges'].fillna(dfen['TotalCharges'].mean(), inplace=True)
dfen['AverageMonthlySpend'].fillna(dfen['AverageMonthlySpend'].mean(), inplace=True)

# Verify there are no more missing values
print(dfen.isnull().sum())

SeniorCitizen                            0
tenure                                   0
MonthlyCharges                           0
TotalCharges                             0
Churn                                    0
gender_Male                              0
Partner_Yes                              0
Dependents_Yes                           0
PhoneService_Yes                         0
MultipleLines_No phone service           0
MultipleLines_Yes                        0
InternetService_Fiber optic              0
InternetService_No                       0
OnlineSecurity_No internet service       0
OnlineSecurity_Yes                       0
OnlineBackup_No internet service         0
OnlineBackup_Yes                         0
DeviceProtection_No internet service     0
DeviceProtection_Yes                     0
TechSupport_No internet service          0
TechSupport_Yes                          0
StreamingTV_No internet service          0
StreamingTV_Yes                          0
StreamingMo

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfen['TotalCharges'].fillna(dfen['TotalCharges'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfen['AverageMonthlySpend'].fillna(dfen['AverageMonthlySpend'].mean(), inplace=True)


In [40]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets
X = dfen.drop(columns=['Churn'])
y = dfen['Churn']

# Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing labels shape: {y_test.shape}")

Training data shape: (5634, 31)
Testing data shape: (1409, 31)
Training labels shape: (5634,)
Testing labels shape: (1409,)


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train the Logistic Regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.8226
Precision: 0.6892
Recall: 0.6005
F1 Score: 0.6418


In [42]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f"Random Forest - Accuracy: {accuracy_rf:.4f}")
print(f"Random Forest - Precision: {precision_rf:.4f}")
print(f"Random Forest - Recall: {recall_rf:.4f}")
print(f"Random Forest - F1 Score: {f1_rf:.4f}")


Random Forest - Accuracy: 0.7970
Random Forest - Precision: 0.6667
Random Forest - Recall: 0.4665
Random Forest - F1 Score: 0.5489


In [43]:
from sklearn.ensemble import GradientBoostingClassifier

# Train the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gb_model.predict(X_test)

# Evaluate the model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb)
recall_gb = recall_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)

print(f"Gradient Boosting - Accuracy: {accuracy_gb:.4f}")
print(f"Gradient Boosting - Precision: {precision_gb:.4f}")
print(f"Gradient Boosting - Recall: {recall_gb:.4f}")
print(f"Gradient Boosting - F1 Score: {f1_gb:.4f}")


Gradient Boosting - Accuracy: 0.8091
Gradient Boosting - Precision: 0.6722
Gradient Boosting - Recall: 0.5442
Gradient Boosting - F1 Score: 0.6015


In [44]:
predictions = logreg.predict(dfen.drop(columns=['Churn']))

# Add the predictions as a new column in your original DataFrame
dfen['Predicted_Churn'] = predictions

# Display the customers predicted to churn
churn_predictions = dfen[dfen['Predicted_Churn'] == 1]
print(churn_predictions)

      SeniorCitizen    tenure  MonthlyCharges  TotalCharges  Churn  \
0         -0.439916 -1.277445       -1.160323     -0.994194      0   
4         -0.439916 -1.236724        0.197365     -0.940457      1   
5         -0.439916 -0.992402        1.159546     -0.645369      1   
8         -0.439916 -0.177995        1.330711      0.336516      1   
19        -0.439916 -0.463037        0.840481     -0.185475      0   
...             ...       ...             ...           ...    ...   
7025      -0.439916 -0.585198        1.006660     -0.266433      0   
7029       2.273159 -1.073843       -0.676740     -0.891309      0   
7032       2.273159 -1.277445        0.365207     -0.973944      1   
7035      -0.439916 -0.544478        0.463253     -0.347744      0   
7041       2.273159 -1.155283        0.320338     -0.872095      1   

      gender_Male  Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0           False         True           False             False   
4           False      