In [1]:
import kagglehub
blastchar_telco_customer_churn_path = kagglehub.dataset_download('blastchar/telco-customer-churn')

print('Data source import complete.')

Data source import complete.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

# **Customer Churn Analysis**
**Goal:** \

Predict customer churn and identify key factors influencing it using the Telco Customer Churn dataset.

**Focus Areas:**
- Build and evaluate machine learning models (Logistic Regression, Random Forest)
- Identify top predictors contributing to customer churn
- Integrate preprocessing and modeling into a reusable pipeline
- Deploy the model for real-time churn prediction

**This notebook covers:**
- Data cleaning and feature encoding
- Train-test split and model training
- Model evaluation (accuracy, ROC-AUC, classification report)
- Feature importance and insights
- Exporting the pipeline for deployment


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Loading and Inspecting the data**

In [4]:
df = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [5]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
df.shape

(7043, 21)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


# **Prepare Dataset for Modeling**

In [9]:
# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(subset=['TotalCharges'], inplace=True)

# Convert churn column to 1/0
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Drop ID column
df.drop('customerID', axis=1, inplace=True)

# Encode categorical columns
df_encoded = pd.get_dummies(df, drop_first=True)


# **Split Dataset**

In [10]:
# Split features and target
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **Training the Models**

In [12]:
# Train logistic regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)
log_report = classification_report(y_test, y_pred_log, output_dict=True)

In [13]:
# Train random forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)
rf_probs = rf_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, rf_probs)

In [14]:
# Feature importance from random forest
top_features = pd.Series(rf_model.feature_importances_, index=X.columns).nlargest(10)

# ROC Curve data
fpr, tpr, _ = roc_curve(y_test, rf_probs)

# **Evaluation**

**Logistic Regression Evaluation**

In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log))
print("Recall:", recall_score(y_test, y_pred_log))
print("F1 Score:", f1_score(y_test, y_pred_log))
print("AUC:", roc_auc_score(y_test, y_pred_log))
print()
print("Classification Report:\n", classification_report(y_test, y_pred_log))
print()

Accuracy: 0.7874911158493249
Precision: 0.6205787781350482
Recall: 0.516042780748663
F1 Score: 0.5635036496350365
AUC: 0.7009061919232185

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.62      0.52      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407




**Random Forest Evaluation**

In [16]:
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("AUC:", roc_auc_score(y_test, y_pred_rf))
print()
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print()

Accuracy: 0.7853589196872779
Precision: 0.6267605633802817
Recall: 0.47593582887700536
F1 Score: 0.541033434650456
AUC: 0.6866610412536043

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.63      0.48      0.54       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.77      0.79      0.78      1407




In [17]:
# Return summary stats
{
  "Logistic_Accuracy": round(log_report["accuracy"], 3),
  "RandomForest_Accuracy": round(rf_report["accuracy"], 3),
  "RandomForest_ROC_AUC": round(roc_auc, 3),
  "Top_Features": top_features.to_dict()
}

{'Logistic_Accuracy': 0.787,
 'RandomForest_Accuracy': 0.785,
 'RandomForest_ROC_AUC': np.float64(0.816),
 'Top_Features': {'TotalCharges': 0.19340911714335177,
  'MonthlyCharges': 0.16975767540826495,
  'tenure': 0.1675723103733715,
  'InternetService_Fiber optic': 0.03999937273506037,
  'PaymentMethod_Electronic check': 0.035016248084857156,
  'OnlineSecurity_Yes': 0.028904747347820576,
  'Contract_Two year': 0.02861778697194593,
  'gender_Male': 0.026971342552233213,
  'TechSupport_Yes': 0.025828753202295322,
  'PaperlessBilling_Yes': 0.025044133041636198}}

In [18]:
summary_stats = {
    "Metric": [
        "Logistic Regression Accuracy",
        "Random Forest Accuracy",
        "Random Forest ROC AUC"
    ],
    "Value": [
        round(log_report["accuracy"], 3),
        round(rf_report["accuracy"], 3),
        round(roc_auc, 3)
    ]
}

summary_df = pd.DataFrame(summary_stats)

# Format top features into a separate DataFrame
top_features_df = top_features.reset_index()
top_features_df.columns = ['Feature', 'Importance']

In [19]:
summary_df

Unnamed: 0,Metric,Value
0,Logistic Regression Accuracy,0.787
1,Random Forest Accuracy,0.785
2,Random Forest ROC AUC,0.816


In [20]:
top_features_df

Unnamed: 0,Feature,Importance
0,TotalCharges,0.193409
1,MonthlyCharges,0.169758
2,tenure,0.167572
3,InternetService_Fiber optic,0.039999
4,PaymentMethod_Electronic check,0.035016
5,OnlineSecurity_Yes,0.028905
6,Contract_Two year,0.028618
7,gender_Male,0.026971
8,TechSupport_Yes,0.025829
9,PaperlessBilling_Yes,0.025044


In [21]:
if log_report["accuracy"] > rf_report["accuracy"]:
    best_model = "Logistic Regression"
    best_mod = log_model
    best_accuracy = log_report["accuracy"]
else:
    best_model = "Random Forest"
    best_accuracy = rf_report["accuracy"]
    best_mod = rf_model

print(f"✅ Best Model: {best_model} with Accuracy: {best_accuracy:.3f}")

✅ Best Model: Logistic Regression with Accuracy: 0.787


In [23]:
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(best_mod, file)