In [1]:
import pandas as pd
import numpy as np

# Read the raw Telco churn CSV; the relative path is resolved against the
# kernel's working directory.
raw_csv_path = '../data/telco_churn.csv'
df = pd.read_csv(raw_csv_path)

# Preview the first rows to confirm the file parsed as expected.
df.head(n=5)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
# Build the preprocessing frame without the 'customerID' column.
# DataFrame.drop returns a new object, so the raw `df` is left untouched.
df1 = df.drop(columns=['customerID'])


In [3]:
# Preview the preprocessing frame after 'customerID' was removed.
df1.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Parse 'TotalCharges' as numbers; entries that cannot be parsed become NaN
# (errors='coerce'). Rows holding such NaNs are then removed and the index is
# rebuilt as a gap-free 0..n-1 range.
total_charges = pd.to_numeric(df1['TotalCharges'], errors='coerce')
df1 = (
    df1.assign(TotalCharges=total_charges)
       .dropna(subset=['TotalCharges'])
       .reset_index(drop=True)
)


In [5]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# Series.map replaces every value that is missing from the mapping with NaN, so
# an unexpected label -- or simply running this cell a second time, when the
# column already holds 0/1 -- would silently wipe out the target. Guard both.
churn_codes = {'Yes': 1, 'No': 0}
if df1['Churn'].isin(list(churn_codes)).all():
    df1['Churn'] = df1['Churn'].map(churn_codes)
elif not df1['Churn'].isin(list(churn_codes.values())).all():
    # Neither fully raw ('Yes'/'No') nor fully encoded (0/1): refuse to guess.
    unexpected = df1.loc[~df1['Churn'].isin(list(churn_codes)), 'Churn'].unique().tolist()
    raise ValueError(f"Unexpected values in 'Churn': {unexpected}")
# Otherwise the column is already encoded as 0/1 and is left unchanged.


In [6]:
# Ratio of the monthly charge to the cumulative charge, stored as a new feature column.
df1['charge_ratio'] = df1['MonthlyCharges'].div(df1['TotalCharges'])


In [7]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each feature, so a feature with k levels yields k-1 indicator columns.
df1 = pd.get_dummies(data=df1, drop_first=True)


In [8]:
# Report the (rows, columns) of the encoded frame, then preview it.
n_rows, n_cols = df1.shape
print("Final shape:", (n_rows, n_cols))
df1.head(n=5)


Final shape: (7032, 32)


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,charge_ratio,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1.0,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,0.03014,True,False,False,True,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,0.49792,True,False,False,True,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,0.02298,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,0.466205,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False


In [9]:
# Print the column names, non-null counts and dtypes of the encoded frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 32 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7032 non-null   int64  
 1   tenure                                 7032 non-null   int64  
 2   MonthlyCharges                         7032 non-null   float64
 3   TotalCharges                           7032 non-null   float64
 4   Churn                                  7032 non-null   int64  
 5   charge_ratio                           7032 non-null   float64
 6   gender_Male                            7032 non-null   bool   
 7   Partner_Yes                            7032 non-null   bool   
 8   Dependents_Yes                         7032 non-null   bool   
 9   PhoneService_Yes                       7032 non-null   bool   
 10  MultipleLines_No phone service         7032 non-null   bool   
 11  Mult

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [11]:
# Separate the target column from the predictors.
target_name = 'Churn'
y = df1[target_name]
X = df1.drop(columns=[target_name])


In [12]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both parts and random_state=42 makes the split repeatable.
split_kwargs = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [15]:
# Random forest with balanced class weights; the fixed seed makes the fit repeatable.
rf_params = {'random_state': 42, 'class_weight': 'balanced'}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)



In [16]:
# Score the random forest on the held-out rows.
y_pred = model.predict(X_test)

# Metrics
rf_accuracy = accuracy_score(y_test, y_pred)
rf_confusion = confusion_matrix(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)

print("Accuracy:", rf_accuracy)
print("Confusion Matrix:\n", rf_confusion)
print("Classification Report:\n", rf_report)


Accuracy: 0.7889125799573561
Confusion Matrix:
 [[925 108]
 [189 185]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.63      0.49      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Class-weighted logistic regression fitted on the same training split.
log_params = {'max_iter': 1000, 'class_weight': 'balanced', 'solver': 'liblinear'}
log_model = LogisticRegression(**log_params)
log_model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred_log = log_model.predict(X_test)

# Evaluate
log_accuracy = accuracy_score(y_test, y_pred_log)
log_confusion = confusion_matrix(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log)

print("Accuracy:", log_accuracy)
print("Confusion Matrix:\n", log_confusion)
print("Classification Report:\n", log_report)


Accuracy: 0.7313432835820896
Confusion Matrix:
 [[725 308]
 [ 70 304]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.70      0.79      1033
           1       0.50      0.81      0.62       374

    accuracy                           0.73      1407
   macro avg       0.70      0.76      0.70      1407
weighted avg       0.80      0.73      0.75      1407



### 🔄 Model Comparison: Random Forest (Balanced) vs Logistic Regression (Balanced)

To improve recall on churners, two class-weighted models were trained on the processed dataset and evaluated on the same stratified 20% hold-out set (1,407 customers). Here's a detailed comparison:

---

#### 🧪 Evaluation Metrics

| Metric         | Random Forest (Balanced) | Logistic Regression (Balanced) |
|----------------|---------------------------|----------------------------------|
| **Accuracy**        | 78.9%                        | 73.1%                           |
| **Churn Precision (Class 1)** | 0.63                         | 0.50                            |
| **Churn Recall (Class 1)**    | 0.49                         | **0.81** ✅                     |
| **Churn F1-score (Class 1)**  | 0.55                         | **0.62** ✅                     |

---

#### 🧠 Interpretation

- **Random Forest**:
  - High accuracy due to predicting non-churners well (majority class)
  - Low recall on churners → **misses over half the customers who leave**
  - Suitable for balanced business strategies, but not ideal if retention is the priority

- **Logistic Regression**:
  - Sacrifices overall accuracy and precision (0.50, so about half of the customers it flags do not actually churn) but achieves **much higher recall** for churners
  - Ideal when the cost of missing a churner is high (e.g., customer retention efforts)
  - Easier to interpret and deploy in production

---

### ✅ Final Decision

**Logistic Regression (with class weighting)** was selected as the final model because:
- It captures **81% of actual churners**, which is critical for retention use-cases
- It handles **class imbalance** explicitly via `class_weight='balanced'` (churners are only about 27% of the test set: 374 of 1,407)
- It is lightweight and interpretable for real-world applications

---

### 🧩 Next Step

The final model and the list of training feature columns are saved using `pickle` and integrated into a Streamlit app for user-friendly predictions via CSV upload.


In [22]:
import pickle
from pathlib import Path

# Save the trained logistic regression model.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when '../models' does not exist (e.g. on a fresh checkout).
Path('../models').mkdir(parents=True, exist_ok=True)
with open('../models/logreg_model.pkl', 'wb') as f:
    pickle.dump(log_model, f)


In [23]:
# Persist the names of the training feature columns, in their training order.
feature_cols = list(X.columns)

with open('../models/feature_columns.pkl', 'wb') as columns_file:
    pickle.dump(feature_cols, columns_file)
