#### Predicting Customer Churn in the Telecom Industry: A Data Science Approach with Python

##### 1. Import the necessary libraries

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

##### 2. Load the dataset

In [20]:
# Read the Telco churn CSV from the current working directory
df = pd.read_csv(filepath_or_buffer='./telco-customer-churn.csv')

##### 3. Data Exploration

In [21]:
# Preview the first five rows to sanity-check the load
print(df.head(n=5))

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [22]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


##### 4. Data pre-processing

In [24]:
# --- Missing values ---
# TotalCharges is absent from the describe() summary, i.e. it was not loaded
# as a numeric column. Coerce it: entries that cannot be parsed become NaN
# and are removed by dropna() below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# .copy() yields an independent frame, so the column assignments further down
# do not raise SettingWithCopyWarning on the filtered result.
df = df.dropna().copy()

# --- Encode categorical variables ---
categorical_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                    'PaperlessBilling', 'PaymentMethod']

# One fitted encoder per column, kept so every integer code can be mapped back
# to its label via feature_encoders[col].inverse_transform(...).
# NOTE(review): integer codes impose an arbitrary order on nominal columns
# (e.g. PaymentMethod); consider one-hot encoding for distance-based models.
feature_encoders = {}
for col in categorical_cols:
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# --- Normalize numerical features ---
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row, before the train/test split,
# so test-set statistics leak into the training features. Fitting on the
# training split only would remove the leak (and shift the reported metrics).
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# --- Splitting data into features (X) and target (y) ---
X = df.drop(['customerID', 'Churn'], axis=1)
# Dedicated encoder for the target: after this cell `le` is fitted on Churn
# only, so le.classes_ / le.inverse_transform decode model predictions.
# Presumably Churn holds 'No'/'Yes', which encode to 0/1 -- verify with le.classes_.
le = LabelEncoder()
y = le.fit_transform(df['Churn'])

##### 5. Split the Data into Training and Testing Sets

In [25]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): no stratify= is passed, so the churn ratio may differ slightly
# between the training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

##### Model Selection: Exploring Different Approaches

In [26]:
# 1. K-Nearest Neighbors (KNN)
# Fit a 5-neighbour classifier on the training split (fit() returns the
# estimator itself), then label the held-out rows
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [27]:
# 2. Naive Bayes
# Gaussian Naive Bayes with default settings: fit on the training split,
# then predict the held-out rows
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

In [28]:
# 3. Random Forest
# 100 trees; random_state pins the forest's randomness so reruns give the same model
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

##### Model Evaluation: Measuring Performance

##### 1. Evaluation Function

In [29]:
# Shared scoring helper used for every model below
def evaluate_model(y_test, y_pred):
    """Score one set of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels of the held-out rows.
    y_pred : array-like
        Labels predicted by a model for the same rows.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, in that order, each computed
        with the metric function's default settings.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, y_pred) for scorer in scorers)

##### 2. Evaluate the models

In [30]:
# Score the three models in a fixed order: KNN, Naive Bayes, Random Forest.
# Each *_results value is the (accuracy, precision, recall, f1) tuple
# returned by evaluate_model.
knn_results, nb_results, rf_results = (
    evaluate_model(y_test, predictions)
    for predictions in (y_pred_knn, y_pred_nb, y_pred_rf)
)

# Compile results: zip() transposes the per-model tuples into one list per
# metric, ordered like the 'Model' column.
results = {
    'Model': ['KNN', 'Naive Bayes', 'Random Forest'],
    **dict(zip(
        ('Accuracy', 'Precision', 'Recall', 'F1-Score'),
        map(list, zip(knn_results, nb_results, rf_results)),
    )),
}

results_df = pd.DataFrame(data=results)
print(results_df)

           Model  Accuracy  Precision    Recall  F1-Score
0            KNN  0.758351   0.547753  0.521390  0.534247
1    Naive Bayes  0.737740   0.504638  0.727273  0.595838
2  Random Forest  0.788202   0.634752  0.478610  0.545732


##### Model Performance Summary:
- K-Nearest Neighbors (KNN): KNN provides moderate performance (accuracy 0.758), but its precision (0.548) and recall (0.521) are both only slightly above one half, so it is not the best model on any metric for this dataset.

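- Naive Bayes: This model excels in recall (0.727), meaning it captures most of the churn cases, and it achieves the highest F1-score (0.596). This comes at the cost of the lowest precision (0.505): roughly half of the customers it flags do not actually churn, leading to more false positives.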

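- Random Forest: Random Forest stands out with the highest accuracy (0.788) and precision (0.635), making its churn predictions the most reliable of the three models in this case study. However, it also has the lowest recall (0.479), so it misses roughly half of the customers who actually churn.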

##### Conclusion: Random Forest is the Champion

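After evaluating the models, Random Forest emerges as the best choice for predicting customer churn when accuracy and precision matter most. It offers the highest accuracy and precision, making it a robust tool for flagging customers at risk of churning with relatively few false alarms. Note, however, that its recall (0.479) is the lowest of the three models, so about half of the actual churners go undetected; if missing a churner is costlier than contacting a loyal customer, Naive Bayes (recall 0.727, highest F1-score) may be the better fit. Telecom companies can leverage these models to craft targeted retention strategies, thereby reducing churn and increasing customer loyalty.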