In [1]:
# Install required libraries for data processing, modeling, and pipeline creation
!pip install pandas numpy scikit-learn joblib



In [2]:
import urllib.request
from pathlib import Path

import pandas as pd

# Public location of the Telco Customer Churn dataset and its local cache file
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
csv_path = Path("Telco-Customer-Churn.csv")

# Download only when no local copy exists, so re-running the notebook does not
# need network access. The file is fetched under a temporary name and renamed
# afterwards: an interrupted download therefore never leaves a truncated CSV
# behind that a later run would mistake for the complete dataset.
if not csv_path.exists():
    partial_path = csv_path.with_name(csv_path.name + ".part")
    urllib.request.urlretrieve(url, partial_path)
    partial_path.replace(csv_path)

# Load and clean in a single expression so the cell is idempotent
# (re-running it always rebuilds `df` from the file on disk).
df = (
    pd.read_csv(csv_path)
    # Collapse 'No internet service' / 'No phone service' into plain 'No' for consistency
    .replace(['No internet service', 'No phone service'], 'No')
    # Convert TotalCharges to numeric; values that cannot be parsed become NaN
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
)

# Display first few rows to verify
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

  MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0            No             DSL             No  ...               No   
1            No             DSL            Yes  ...              Yes   
2            No             DSL            Yes  ...               No   
3            No             DSL            Yes  ...              Yes   
4            No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract

In [3]:
from sklearn.model_selection import train_test_split

# Build the binary target and the feature matrix.
# Churn is the label ('Yes' -> 1, 'No' -> 0); customerID is dropped together
# with it so that neither ends up among the features.
y = df['Churn'].map({'Yes': 1, 'No': 0})
X = df.drop(columns=['customerID', 'Churn'])

# Hold out 20% of the rows as a test set; stratifying on y keeps the
# churn / no-churn proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shapes of both parts as a sanity check
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

Training set shape: (5634, 19), (5634,)
Test set shape: (1409, 19), (1409,)


In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Partition the feature columns: three numeric columns, everything else categorical
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = [name for name in X.columns if name not in numerical_cols]

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets the encoder accept categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric branch: fill gaps with the column median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Show which columns feed which branch (the preprocessor is not fitted in this cell)
print("Preprocessing columns:")
print(f"Numerical: {numerical_cols}")
print(f"Categorical: {categorical_cols}")

Preprocessing columns:
Numerical: ['tenure', 'MonthlyCharges', 'TotalCharges']
Categorical: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, keyed by a short model name
models = dict(
    logistic_regression=LogisticRegression(max_iter=1000, random_state=42),
    random_forest=RandomForestClassifier(random_state=42),
)

# Search space per model for GridSearchCV. The 'model__' prefix addresses the
# pipeline step named 'model' that wraps each classifier.
param_grid = dict(
    logistic_regression={
        'model__C': [0.01, 0.1, 1, 10],  # candidate values for C
        'model__penalty': ['l2'],        # L2 penalty only
    },
    random_forest={
        'model__n_estimators': [100, 200],    # number of trees
        'model__max_depth': [10, 20, None],   # tree depth limit (None = no limit)
        'model__min_samples_split': [2, 5],   # minimum samples required to split a node
    },
)

# Summarize what will be trained and tuned
print("Models to train:", list(models))
print("Hyperparameters for tuning:", param_grid)

Models to train: ['logistic_regression', 'random_forest']
Hyperparameters for tuning: {'logistic_regression': {'model__C': [0.01, 0.1, 1, 10], 'model__penalty': ['l2']}, 'random_forest': {'model__n_estimators': [100, 200], 'model__max_depth': [10, 20, None], 'model__min_samples_split': [2, 5]}}


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# Evaluation summary per model, keyed by model name
results = {}

# Tune every candidate with cross-validation, then score the winner on the test set
for model_name, model in models.items():
    # Chain the shared preprocessing with the current classifier
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

    # Grid search over this model's hyperparameters: 5-fold CV, ranked by F1,
    # running on all available CPU cores
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid[model_name],
        scoring='f1',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Predict the held-out rows with the best pipeline found by the search
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Test-set metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['No Churn', 'Churn'])

    # Record everything needed for the final comparison
    results[model_name] = dict(
        best_params=grid_search.best_params_,
        accuracy=accuracy,
        f1_score=f1,
        classification_report=report,
    )

    # Per-model report
    summary = results[model_name]
    print(f"\nResults for {model_name}:")
    print(f"Best Parameters: {summary['best_params']}")
    print(f"Accuracy: {summary['accuracy']:.4f}")
    print(f"F1-Score: {summary['f1_score']:.4f}")
    print(f"Classification Report:\n{summary['classification_report']}")


Results for logistic_regression:
Best Parameters: {'model__C': 10, 'model__penalty': 'l2'}
Accuracy: 0.8048
F1-Score: 0.6032
Classification Report:
              precision    recall  f1-score   support

    No Churn       0.85      0.89      0.87      1035
       Churn       0.66      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409


Results for random_forest:
Best Parameters: {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Accuracy: 0.7984
F1-Score: 0.5786
Classification Report:
              precision    recall  f1-score   support

    No Churn       0.84      0.90      0.87      1035
       Churn       0.65      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [7]:
import json

import joblib
from sklearn.base import clone

# Persist one fitted pipeline per model.
# After the tuning loop only the most recent `grid_search` survives, so reading
# `grid_search.best_estimator_` here would store the last-trained model under
# every file name (the logistic-regression file would contain the random forest).
# Instead, rebuild each model's pipeline, apply that model's own best
# hyperparameters recorded in `results`, and refit on the full training set.
# This is the same refit step GridSearchCV performs to produce best_estimator_.
for model_name, model in models.items():
    best_model = clone(Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]))
    best_model.set_params(**results[model_name]['best_params'])
    best_model.fit(X_train, y_train)

    joblib.dump(best_model, f'telco_churn_{model_name}_pipeline.joblib')
    print(f"Saved pipeline: telco_churn_{model_name}_pipeline.joblib")

# Save evaluation results to JSON
with open('telco_churn_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4)
print("Results saved to: telco_churn_results.json")

Saved pipeline: telco_churn_logistic_regression_pipeline.joblib
Saved pipeline: telco_churn_random_forest_pipeline.joblib
Results saved to: telco_churn_results.json


In [8]:
# Read the persisted Random Forest pipeline back from disk
loaded_pipeline = joblib.load('telco_churn_random_forest_pipeline.joblib')

# Run it on the first five rows of the test set as a smoke test
sample_data = X_test.head(5)
predictions = loaded_pipeline.predict(sample_data)

# Translate each numeric prediction (1 / 0) into a readable label
print("\nSample predictions from loaded Random Forest pipeline:")
for sample_number, pred in enumerate(predictions, start=1):
    label = 'Churn' if pred == 1 else 'No Churn'
    print(f"Sample {sample_number}: {label}")


Sample predictions from loaded Random Forest pipeline:
Sample 1: No Churn
Sample 2: Churn
Sample 3: No Churn
Sample 4: No Churn
Sample 5: No Churn
