# Telco Customer Churn ML Pipeline

This notebook implements a complete machine learning pipeline for predicting customer churn in a telecommunications company. The pipeline includes data loading, preprocessing, model training, and evaluation.

## 1. Import Required Libraries

Import essential Python libraries for data manipulation, visualization, and machine learning model development.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import joblib
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 2. Load Dataset

Load the Telco Customer Churn dataset from the raw data directory into a pandas DataFrame.

In [None]:
import os

# Set working directory to the notebook directory
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print(f"Current working directory: {os.getcwd()}")

/content/drive/MyDrive/datasets


In [None]:
import pandas as pd
import os
import subprocess

# First, let's find where the data file is
print("Current working directory:", os.getcwd())
print("\nDirectory contents:")
for root, dirs, files in os.walk('..'):
    level = root.replace('..', '').count(os.sep)
    if level > 2:  # Limit depth
        continue
    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files[:5]:  # Show first 5 files
        print(f'{subindent}{file}')

# Try using find command to locate CSV
try:
    result = subprocess.run(['find', '..', '-name', '*.csv', '-type', 'f'], 
                           capture_output=True, text=True, timeout=5)
    csv_files = [f for f in result.stdout.strip().split('\n') if f]
    print(f"\nCSV files found: {csv_files}")
    
    if csv_files:
        df = pd.read_csv(csv_files[0])
        print(f"Loaded dataset from: {csv_files[0]}")
        print(f"Shape: {df.shape}")
    else:
        print("Creating sample data for testing...")
        # Create minimal test data if file not found
        df = pd.DataFrame({
            'tenure': [1, 2, 3],
            'MonthlyCharges': [29.85, 56.95, 53.85],
            'TotalCharges': [29.85, 1397.475, 108.15],
            'Churn': ['No', 'Yes', 'No']
        })
        print(f"Using sample data. Shape: {df.shape}")
except Exception as e:
    print(f"Error finding file: {e}")
    # Create minimal test data
    df = pd.DataFrame({
        'tenure': [1, 2, 3],
        'MonthlyCharges': [29.85, 56.95, 53.85],
        'TotalCharges': [29.85, 1397.475, 108.15],
        'Churn': ['No', 'Yes', 'No']
    })
    print(f"Using sample data. Shape: {df.shape}")

df.head()

Available data files: []


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Deep-Learning-Projects\\telco-customer-churn-ml-pipeline\\data\\raw\\telco_churn_dataset.csv'

## 3. Dataset Overview

Display fundamental information about the dataset structure, dimensions, and basic statistics.

In [5]:
# Display dataset shape
print("=" * 80)
print("DATASET SHAPE")
print("=" * 80)
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print()

DATASET SHAPE
Number of rows: 7043
Number of columns: 21



In [6]:
# Display first few rows
print("=" * 80)
print("FIRST 5 ROWS")
print("=" * 80)
print(df.head())

FIRST 5 ROWS
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        

In [7]:
# Display dataset information
print("\n" + "=" * 80)
print("DATASET INFORMATION")
print("=" * 80)
df.info()


DATASET INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  704

In [8]:
# Display statistical summary
print("\n" + "=" * 80)
print("STATISTICAL SUMMARY")
print("=" * 80)
print(df.describe())


STATISTICAL SUMMARY
       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


## Handling Missing Values

The Telco dataset contains hidden missing values represented as blank strings (" ")
instead of proper NaN values. Machine learning algorithms and imputers cannot detect
blank strings as missing data.

Therefore, we first convert all blank entries into NaN so that the preprocessing
pipeline (SimpleImputer) can handle them correctly.


In [9]:
import numpy as np

# check missing values
print("Missing values before cleaning:")
print(df.isnull().sum())

# replace blank strings with NaN
df = df.replace(" ", np.nan)

print("\nMissing values after replacing blanks:")
print(df.isnull().sum())


Missing values before cleaning:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Missing values after replacing blanks:
customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges    

## Converting TotalCharges to Numeric

The 'TotalCharges' column is incorrectly stored as an object (string) datatype due to
the presence of blank values. Machine learning models require numeric input.

We convert this column to a numeric datatype. Any non-convertible values will be
automatically converted into NaN, which will later be handled by the imputer.


In [10]:
# convert to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# verify datatype
print(df.dtypes['TotalCharges'])

# check new missing values created
print("\nMissing values in TotalCharges:")
print(df['TotalCharges'].isnull().sum())


float64

Missing values in TotalCharges:
11


## Feature‚ÄìTarget Separation

To train a machine learning model, we separate the dataset into:

X (features): all input variables describing a customer  
y (target): the variable we want to predict (Churn)

The model will learn the relationship between X and y.


In [11]:
# features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

print("Feature shape:", X.shape)
print("Target shape:", y.shape)

Feature shape: (7043, 20)
Target shape: (7043,)


## 8. Split Dataset into Training and Testing Sets

Split the data into training (80%) and testing (20%) sets using train_test_split with a fixed random state for reproducibility.

In [12]:
# Split dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y  # Maintain class distribution
)

print(f"Training set size: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Testing set size: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")
print(f"\nTraining features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"\nClass distribution in training set:")
print(y_train.value_counts())
print(f"\nClass distribution in testing set:")
print(y_test.value_counts())

Training set size: 5634 samples (80.0%)
Testing set size: 1409 samples (20.0%)

Training features shape: (5634, 20)
Testing features shape: (1409, 20)

Class distribution in training set:
Churn
No     4139
Yes    1495
Name: count, dtype: int64

Class distribution in testing set:
Churn
No     1035
Yes     374
Name: count, dtype: int64


## 9. Automatically Detect Categorical and Numerical Feature Columns

Separate features into categorical and numerical columns based on their data types for appropriate preprocessing.

In [13]:
# Automatically detect categorical and numerical columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features ({len(numerical_features)}):")
print(numerical_features)
print(f"\nCategorical features ({len(categorical_features)}):")
print(categorical_features)

# Display data types
print(f"\nData types in training set:")
print(X_train.dtypes)
print(f"\nShape of training features: {X_train.shape}")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")

Numerical features (4):
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

Categorical features (16):
['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

Data types in training set:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
dtype: object

Shape of training features: 

## 10. Create Numerical Preprocessing Pipeline

Create a preprocessing pipeline for numerical features using SimpleImputer (for missing values) and StandardScaler (for normalization).

In [16]:
# Import preprocessing tools
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Create numerical preprocessing pipeline
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values with median
    ('scaler', StandardScaler())  # Standardize features (mean=0, std=1)
])

print("Numerical Preprocessing Pipeline created:")
print(numerical_pipeline)
print(f"\nThis pipeline will be applied to {len(numerical_features)} numerical features:")
print(numerical_features)
print("\nPipeline steps:")
print("  1. SimpleImputer: Handles missing values using median strategy")
print("  2. StandardScaler: Normalizes features to have mean=0 and standard deviation=1")

Numerical Preprocessing Pipeline created:
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

This pipeline will be applied to 4 numerical features:
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

Pipeline steps:
  1. SimpleImputer: Handles missing values using median strategy
  2. StandardScaler: Normalizes features to have mean=0 and standard deviation=1


## 11. Create Categorical Preprocessing Pipeline

Create a preprocessing pipeline for categorical features using SimpleImputer (to handle missing values) and OneHotEncoder (to convert categories into numerical format).

In [17]:
# Create categorical preprocessing pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Convert to binary features
])

print("Categorical Preprocessing Pipeline created:")
print(categorical_pipeline)
print(f"\nThis pipeline will be applied to {len(categorical_features)} categorical features:")
print(categorical_features)
print("\nPipeline steps:")
print("  1. SimpleImputer: Handles missing values using most_frequent strategy")
print("  2. OneHotEncoder: Converts categorical variables into binary (one-hot) encoded features")

Categorical Preprocessing Pipeline created:
Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot',
                 OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

This pipeline will be applied to 16 categorical features:
['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

Pipeline steps:
  1. SimpleImputer: Handles missing values using most_frequent strategy
  2. OneHotEncoder: Converts categorical variables into binary (one-hot) encoded features


## 12. Combine Both Pipelines Using ColumnTransformer

Combine the numerical and categorical preprocessing pipelines into a single ColumnTransformer that applies the appropriate preprocessing to each feature type.

In [18]:
# Combine both preprocessing pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough'  # Keep other columns as-is if any
)

print("ColumnTransformer created combining both pipelines:")
print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
print("\nTransformers:")
print("  1. 'num': Applies numerical_pipeline to numerical features")
print("  2. 'cat': Applies categorical_pipeline to categorical features")

ColumnTransformer created combining both pipelines:

Numerical features (4): ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical features (16): ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

Transformers:
  1. 'num': Applies numerical_pipeline to numerical features
  2. 'cat': Applies categorical_pipeline to categorical features


## 13. Build Full Pipeline with Preprocessing + Logistic Regression

Create a complete pipeline that combines the preprocessor with a Logistic Regression model. This pipeline will handle all data transformations and model training end-to-end.

In [19]:
# Build full pipeline with preprocessing + Logistic Regression
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, verbose=0))
])

print("Full Logistic Regression Pipeline created:")
print("\nPipeline structure:")
print(f"  Step 1: Preprocessor (ColumnTransformer)")
print(f"    - Numerical features ‚Üí Impute (median) ‚Üí Scale (StandardScaler)")
print(f"    - Categorical features ‚Üí Impute (most_frequent) ‚Üí OneHotEncoder")
print(f"  Step 2: Classifier (LogisticRegression)")
print(f"    - Algorithm: Logistic Regression")
print(f"    - max_iter: 1000")
print(f"    - random_state: 42 (for reproducibility)")
print(f"\nThis pipeline will:")
print(f"  1. Automatically preprocess features based on their types")
print(f"  2. Train a Logistic Regression model on the preprocessed data")

Full Logistic Regression Pipeline created:

Pipeline structure:
  Step 1: Preprocessor (ColumnTransformer)
    - Numerical features ‚Üí Impute (median) ‚Üí Scale (StandardScaler)
    - Categorical features ‚Üí Impute (most_frequent) ‚Üí OneHotEncoder
  Step 2: Classifier (LogisticRegression)
    - Algorithm: Logistic Regression
    - max_iter: 1000
    - random_state: 42 (for reproducibility)

This pipeline will:
  1. Automatically preprocess features based on their types
  2. Train a Logistic Regression model on the preprocessed data


In [None]:
# Print best parameters and best CV score
print("="*80)
print("BEST HYPERPARAMETERS FOR LOGISTIC REGRESSION")
print("="*80)
print(f"\nBest Parameters:")
for param, value in lr_grid_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest Cross-Validation Score (accuracy): {lr_grid_search.best_score_:.4f}")
print(f"\nBest Estimator:")
print(lr_grid_search.best_estimator_)

## 17. Print Best Parameters and Best CV Score

Display the best hyperparameters found and the corresponding cross-validation score.

In [None]:
# Train GridSearchCV on training data
print("Training GridSearchCV... (This may take a few minutes)")
print("="*80)

lr_grid_search.fit(X_train, y_train)

print("="*80)
print("GridSearchCV training completed!")

Training GridSearchCV... (This may take a few minutes)
Fitting 5 folds for each of 54 candidates, totalling 270 fits


## 16. Train GridSearch on Training Data

Fit the GridSearchCV object on the training data to find the best hyperparameters.

In [21]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Create GridSearchCV object
lr_grid_search = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Use accuracy as the scoring metric
    n_jobs=-1,  # Use all available processors
    verbose=1
)

print("GridSearchCV configured:")
print(f"  Estimator: Logistic Regression Pipeline")
print(f"  Param Grid: 54 combinations")
print(f"  Cross-Validation: 5-fold")
print(f"  Scoring Metric: accuracy")
print(f"  N_jobs: -1 (use all processors)")
print(f"\nGridSearchCV will test all hyperparameter combinations using cross-validation")

GridSearchCV configured:
  Estimator: Logistic Regression Pipeline
  Param Grid: 54 combinations
  Cross-Validation: 5-fold
  Scoring Metric: accuracy
  N_jobs: -1 (use all processors)

GridSearchCV will test all hyperparameter combinations using cross-validation


## 15. Run GridSearchCV with Cross-Validation

Create and configure GridSearchCV to find the best hyperparameters using 5-fold cross-validation.

In [20]:
# Define hyperparameter grid for Logistic Regression
lr_param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'classifier__solver': ['lbfgs', 'liblinear', 'saga'],  # Optimization algorithm
    'classifier__max_iter': [500, 1000, 2000]  # Max iterations for convergence
}

print("Logistic Regression Hyperparameter Grid defined:")
print(f"\nC (Inverse Regularization Strength): {lr_param_grid['classifier__C']}")
print(f"Solver (Optimization Algorithm): {lr_param_grid['classifier__solver']}")
print(f"Max Iterations: {lr_param_grid['classifier__max_iter']}")
print(f"\nTotal combinations to test: {len(lr_param_grid['classifier__C']) * len(lr_param_grid['classifier__solver']) * len(lr_param_grid['classifier__max_iter'])} = {6 * 3 * 3}")

Logistic Regression Hyperparameter Grid defined:

C (Inverse Regularization Strength): [0.001, 0.01, 0.1, 1, 10, 100]
Solver (Optimization Algorithm): ['lbfgs', 'liblinear', 'saga']
Max Iterations: [500, 1000, 2000]

Total combinations to test: 54 = 54


## 14. Define Hyperparameter Grid for Logistic Regression

Define a grid of hyperparameters to test for optimizing the Logistic Regression model.

In [4]:
# Check if GridSearch objects have been created and trained
if 'grid_search_rf' not in locals() or 'grid_search_lr' not in locals():
    print("‚è≥ WARNING: GridSearchCV models are still training or not yet executed.")
    print("Please wait for the GridSearchCV cells (21 and 26) to complete first.")
    print("\nExpected training time: 30-60 minutes depending on your system.")
    print("\nYou can check the status of running cells in the notebook output.")
else:
    try:
        # Evaluate Random Forest on test set
        y_pred_rf = grid_search_rf.predict(X_test)
        y_pred_proba_rf = grid_search_rf.predict_proba(X_test)[:, 1]

        accuracy_rf = accuracy_score(y_test, y_pred_rf)
        precision_rf = precision_score(y_test, y_pred_rf, pos_label='Yes')
        recall_rf = recall_score(y_test, y_pred_rf, pos_label='Yes')
        f1_rf = f1_score(y_test, y_pred_rf, pos_label='Yes')
        roc_auc_rf = roc_auc_score(y_test.map({'No': 0, 'Yes': 1}), y_pred_proba_rf)

        # Create comparison dataframe
        comparison_data = {
            'Model': ['Logistic Regression', 'Random Forest'],
            'Accuracy': [accuracy_lr, accuracy_rf],
            'Precision': [precision_lr, precision_rf],
            'Recall': [recall_lr, recall_rf],
            'F1-Score': [f1_lr, f1_rf],
            'ROC-AUC': [roc_auc_lr, roc_auc_rf]
        }

        comparison_df = pd.DataFrame(comparison_data)

        print("\n" + "=" * 100)
        print("MODEL COMPARISON - LOGISTIC REGRESSION vs RANDOM FOREST")
        print("=" * 100)
        print(comparison_df.to_string(index=False))

        # Select better model
        best_model_name = 'Random Forest' if accuracy_rf > accuracy_lr else 'Logistic Regression'
        best_model = grid_search_rf if accuracy_rf > accuracy_lr else grid_search_lr

        print(f"\n\nüèÜ BEST MODEL: {best_model_name}")
        print(f"   Accuracy: {max(accuracy_rf, accuracy_lr):.4f}")
        
    except NameError as e:
        print(f"‚ùå Error: Missing required variables: {str(e)}")
        print("\nMake sure you have run ALL previous cells in order:")
        print("  1. GridSearchCV for Logistic Regression (cell 26)")
        print("  2. Train GridSearchCV on training data (cell 27)")
        print("  3. Make predictions on test set (cell 29)")
        print("  4. Evaluate Logistic Regression (cell 31)")
        print("  5. Random Forest pipeline, hyperparameters, and GridSearchCV cells")
    except Exception as e:
        print(f"‚ùå Error during evaluation: {str(e)}")
        print("\nPlease ensure all dependent cells have completed successfully.")

Please wait for the GridSearchCV cells (21 and 26) to complete first.

Expected training time: 30-60 minutes depending on your system.

You can check the status of running cells in the notebook output.


In [5]:
print("=" * 80)
print("STREAMLIT DEPLOYMENT GUIDE")
print("=" * 80)
print("""
To deploy this model as a web application, follow these steps:

1. Install Streamlit:
   pip install streamlit

2. Create a file 'streamlit_app.py' with the following structure:

   import streamlit as st
   import joblib
   import pandas as pd
   import numpy as np

   # Load the saved pipeline
   pipeline = joblib.load('churn_pipeline.joblib')

   st.title('Telco Customer Churn Predictor')

   # Create input fields for customer data
   gender = st.selectbox('Gender', ['Male', 'Female'])
   tenure = st.number_input('Tenure (months)', 0, 72, value=12)
   monthly_charges = st.number_input('Monthly Charges', 0.0, 200.0, value=89.45)
   # ... add more input fields for all features ...

   # Make prediction
   if st.button('Predict Churn'):
       customer_data = pd.DataFrame({...})  # Organize all inputs
       prediction = pipeline.predict(customer_data)[0]
       probability = pipeline.predict_proba(customer_data)[0]
       
       if prediction == 'Yes':
           st.error(f'‚ö†Ô∏è Customer is likely to CHURN')
       else:
           st.success(f'‚úÖ Customer is likely to STAY')

3. Run the app:
   streamlit run streamlit_app.py

4. Access the app in your browser at: http://localhost:8501
""")

STREAMLIT DEPLOYMENT GUIDE

To deploy this model as a web application, follow these steps:

1. Install Streamlit:
   pip install streamlit

2. Create a file 'streamlit_app.py' with the following structure:

   import streamlit as st
   import joblib
   import pandas as pd
   import numpy as np

   # Load the saved pipeline
   pipeline = joblib.load('churn_pipeline.joblib')

   st.title('Telco Customer Churn Predictor')

   # Create input fields for customer data
   gender = st.selectbox('Gender', ['Male', 'Female'])
   tenure = st.number_input('Tenure (months)', 0, 72, value=12)
   monthly_charges = st.number_input('Monthly Charges', 0.0, 200.0, value=89.45)
   # ... add more input fields for all features ...

   # Make prediction
   if st.button('Predict Churn'):
       customer_data = pd.DataFrame({...})  # Organize all inputs
       prediction = pipeline.predict(customer_data)[0]
       probability = pipeline.predict_proba(customer_data)[0]
       
       if prediction == 'Yes':
   

## 26. (BONUS) Streamlit Deployment

To deploy this model as a web app using Streamlit, create a separate `streamlit_app.py` file with the following code structure. This allows real-time predictions through an interactive UI.

In [7]:
# Check if pipeline has been loaded
if 'loaded_pipeline' not in locals():
    print("‚è≥ WARNING: Pipeline not yet loaded")
    print("\nPlease make sure you have executed these cells in order:")
    print("  1. Cell 40: Save the Best Pipeline using joblib")
    print("  2. Cell 42: Load the Saved Pipeline")
    print("\nYou can skip the GridSearchCV cells and use the provided sample pipeline instead.")
    print("\nTo use a sample trained model for testing predictions:")
    print("  - Use: from src.predict import create_sample_customer, predict_customer")
    print("  - Then test with sample data")
else:
    try:
        # Create a sample new customer (raw unprocessed data)
        new_customer = pd.DataFrame({
            'customerID': ['CUST12345'],
            'gender': ['Male'],
            'SeniorCitizen': [0],
            'Partner': ['No'],
            'Dependents': ['No'],
            'tenure': [12],
            'PhoneService': ['Yes'],
            'MultipleLines': ['No'],
            'InternetService': ['Fiber optic'],
            'OnlineSecurity': ['No'],
            'OnlineBackup': ['No'],
            'DeviceProtection': ['Yes'],
            'TechSupport': ['No'],
            'StreamingTV': ['Yes'],
            'StreamingMovies': ['No'],
            'Contract': ['Month-to-month'],
            'PaperlessBilling': ['Yes'],
            'PaymentMethod': ['Electronic check'],
            'MonthlyCharges': [89.45],
            'TotalCharges': [1185.50]
        })

        print("=" * 80)
        print("NEW CUSTOMER SAMPLE")
        print("=" * 80)
        print("\nCustomer Details (Raw Input):")
        print(new_customer.to_string(index=False))

        # Make prediction using the loaded pipeline
        churn_prediction = loaded_pipeline.predict(new_customer)[0]
        churn_probability = loaded_pipeline.predict_proba(new_customer)[0]

        print(f"\n\n{'=' * 80}")
        print("PREDICTION RESULT")
        print(f"{'=' * 80}")
        print(f"\nChurn Prediction: {churn_prediction}")
        print(f"  - Probability of NOT churning (No): {churn_probability[list(loaded_pipeline.classes_).index('No')]:.4f}")
        print(f"  - Probability of CHURNING (Yes): {churn_probability[list(loaded_pipeline.classes_).index('Yes')]:.4f}")

        # Human readable output
        if churn_prediction == 'Yes':
            print(f"\n‚ö†Ô∏è  WARNING: This customer is likely to CHURN")
            print(f"    Recommended action: Provide retention offer")
        else:
            print(f"\n‚úÖ GOOD NEWS: This customer is likely to STAY")
            print(f"    Recommended action: Continue standard service")
            
    except Exception as e:
        print(f"‚ùå Error during prediction: {str(e)}")
        print("\nPlease ensure:")
        print("  1. The pipeline file exists and is valid")
        print("  2. All customer columns match the training data")
        print("  3. Previous cells have executed successfully")


Please make sure you have executed these cells in order:
  1. Cell 40: Save the Best Pipeline using joblib
  2. Cell 42: Load the Saved Pipeline

You can skip the GridSearchCV cells and use the provided sample pipeline instead.

To use a sample trained model for testing predictions:
  - Use: from src.predict import create_sample_customer, predict_customer
  - Then test with sample data


## 25. Create Sample Customer Input and Make Predictions

Create a new customer sample in raw (unprocessed) format and use the loaded pipeline to predict churn probability.

In [10]:
# Check if model_path has been defined
if 'model_path' not in locals():
    print("‚è≥ WARNING: Model path not yet defined")
    print("\nPlease run Cell 36 (Save the Best Pipeline) first.")
    print("\nDependency chain:")
    print("  1. GridSearchCV cells (21 & 26) must complete ‚úì/‚è≥")
    print("  2. Cell 40 - Model Comparison")
    print("  3. Cell 47 - Save the Best Pipeline")
    print("  4. Cell 45 - Load the Saved Pipeline (current)")
    print("\nExpected wait time: 30-60 minutes for GridSearchCV to complete.")
    print("\nOnce Cell 47 saves the model, this cell will load it successfully.")
else:
    try:
        # Load the saved pipeline
        loaded_pipeline = joblib.load(model_path)

        print("=" * 80)
        print("PIPELINE LOADED")
        print("=" * 80)
        print(f"\nPipeline loaded successfully from: {model_path}")
        print(f"\nLoaded pipeline structure:")
        print(loaded_pipeline)
        print(f"\nThis loaded pipeline is ready for making predictions on new customers!")
        
    except FileNotFoundError as e:
        print(f"‚ùå Error: Model file not found at {model_path}")
        print("\nMake sure Cell 47 (Save the Best Pipeline) has been executed.")
    except Exception as e:
        print(f"‚ùå Error loading pipeline: {str(e)}")
        print("\nPlease ensure the model file is valid and all dependencies are complete.")


Please run Cell 36 (Save the Best Pipeline) first.

Dependency chain:
  1. GridSearchCV cells (21 & 26) must complete ‚úì/‚è≥
  2. Cell 40 - Model Comparison
  3. Cell 47 - Save the Best Pipeline
  4. Cell 45 - Load the Saved Pipeline (current)

Expected wait time: 30-60 minutes for GridSearchCV to complete.

Once Cell 47 saves the model, this cell will load it successfully.


## 24. Load the Saved Pipeline

Load the saved pipeline from the joblib file to demonstrate how it would be used in production.

In [12]:
# Check if best_model exists
if 'best_model' not in locals() or 'best_model_name' not in locals():
    print("‚è≥ WARNING: Best model not yet available")
    print("\nPlease run Cell 40 (Model Comparison) first.")
    print("\nDependency chain:")
    print("  1. GridSearchCV cells (21 & 26) must complete ‚úì/‚è≥")
    print("  2. Cell 30 - Make predictions on test set")
    print("  3. Cell 32 - Evaluate Logistic Regression")
    print("  4. Cell 36 - Random Forest pipeline and hyperparameters")
    print("  5. Cell 40 - Model Comparison (creates best_model)")
    print("  6. Cell 47 - Save the Best Pipeline (current)")
    print("\nExpected wait time: 30-60 minutes for GridSearchCV to complete.")
    print("\nOnce Cell 40 identifies the best model, this cell will save it successfully.")
else:
    try:
        # Define the save path
        model_path = './models/churn_pipeline.joblib'
        
        # Create models directory if it doesn't exist
        import os
        os.makedirs('./models', exist_ok=True)
        
        # Save the best model
        joblib.dump(best_model, model_path)

        print("=" * 80)
        print("MODEL SAVED")
        print("=" * 80)
        print(f"\nBest Model: {best_model_name}")
        print(f"Model saved to: {model_path}")
        print(f"File size: {os.path.getsize(model_path) / 1024:.2f} KB")
        print(f"\nThis pipeline contains:")
        print(f"  - ColumnTransformer (preprocessor)")
        print(f"  - Classifier ({best_model_name})")
        print(f"  - All trained parameters and weights")
        
    except NameError as e:
        print(f"‚ùå Error: Missing required variables: {str(e)}")
        print("\nMake sure Cell 40 (Model Comparison) has been executed successfully.")
    except Exception as e:
        print(f"‚ùå Error saving model: {str(e)}")
        print("\nPlease ensure: ")
        print("  1. The best_model and best_model_name variables are defined")
        print("  2. You have write permissions to the current directory")
        print("  3. The models directory can be created")


Please run Cell 40 (Model Comparison) first.

Dependency chain:
  1. GridSearchCV cells (21 & 26) must complete ‚úì/‚è≥
  2. Cell 30 - Make predictions on test set
  3. Cell 32 - Evaluate Logistic Regression
  4. Cell 36 - Random Forest pipeline and hyperparameters
  5. Cell 40 - Model Comparison (creates best_model)
  6. Cell 47 - Save the Best Pipeline (current)

Expected wait time: 30-60 minutes for GridSearchCV to complete.

Once Cell 40 identifies the best model, this cell will save it successfully.


## 23. Save the Best Pipeline using joblib

Save the best trained pipeline to a joblib file for later use in production.

## 22. Compare Logistic Regression vs Random Forest

Compare performance of both models and select the better one.

In [None]:
from sklearn.model_selection import GridSearchCV

# Check if prerequisites exist
if 'rf_pipeline' not in locals() or 'param_grid_rf' not in locals():
    print("‚è≥ WARNING: Random Forest Pipeline not yet created")
    print("\nPlease run Cell 54 (Create Random Forest Pipeline) first.")
    print("\nDependency chain:")
    print("  1. Cell 23 - Detect feature types")
    print("  2. Cell 25 - Create numerical pipeline")
    print("  3. Cell 27 - Create categorical pipeline")
    print("  4. Cell 29 - Combine with ColumnTransformer")
    print("  5. Cell 54 - Create Random Forest Pipeline")
    print("  6. Cell 52 - Define Random Forest Hyperparameter Grid")
    print("  7. Cell 50 - Run GridSearchCV for Random Forest (current)")
else:
    try:
        if 'X_train' not in locals() or 'y_train' not in locals():
            print("‚ùå Error: Training data not available")
            print("Please run data loading and preprocessing cells first.")
        else:
            # Run GridSearchCV for Random Forest
            grid_search_rf = GridSearchCV(
                rf_pipeline,
                param_grid_rf,
                cv=5,
                scoring='f1_weighted',
                n_jobs=-1,
                verbose=1
            )

            print("Starting Random Forest GridSearchCV...")
            print(f"CV folds: 5")
            print(f"Scoring metric: f1_weighted")
            print(f"Total combinations to test: {len(param_grid_rf['classifier__n_estimators']) * len(param_grid_rf['classifier__max_depth']) * len(param_grid_rf['classifier__min_samples_split']) * len(param_grid_rf['classifier__min_samples_leaf'])}")
            print(f"This may take 30-60 minutes depending on your system...")
            print()
            
            grid_search_rf.fit(X_train, y_train)
            print("\nRandom Forest GridSearchCV completed!")

            # Print best parameters and score
            print(f"\nBest Random Forest Parameters:")
            for param, value in grid_search_rf.best_params_.items():
                print(f"  {param}: {value}")
            print(f"\nBest CV Score (F1-Weighted): {grid_search_rf.best_score_:.4f}")
            
    except Exception as e:
        print(f"‚ùå Error during Random Forest GridSearchCV: {str(e)}")
        print("\nPlease ensure:")
        print("  1. rf_pipeline is defined")
        print("  2. param_grid_rf is defined")
        print("  3. X_train and y_train are available")
        print("  4. All preprocessing has been completed successfully")

NameError: name 'rf_pipeline' is not defined

## 21. Run GridSearchCV for Random Forest

Execute GridSearchCV to find optimal hyperparameters for Random Forest.

In [1]:
# Random Forest hyperparameter grid
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

print("Random Forest Hyperparameter Grid defined:")
print(f"\nn_estimators: {param_grid_rf['classifier__n_estimators']}")
print(f"max_depth: {param_grid_rf['classifier__max_depth']}")
print(f"min_samples_split: {param_grid_rf['classifier__min_samples_split']}")
print(f"min_samples_leaf: {param_grid_rf['classifier__min_samples_leaf']}")
print(f"\nTotal combinations: {len(param_grid_rf['classifier__n_estimators']) * len(param_grid_rf['classifier__max_depth']) * len(param_grid_rf['classifier__min_samples_split']) * len(param_grid_rf['classifier__min_samples_leaf'])}")

Random Forest Hyperparameter Grid defined:

n_estimators: [50, 100, 200]
max_depth: [10, 20, 30, None]
min_samples_split: [2, 5, 10]
min_samples_leaf: [1, 2, 4]

Total combinations: 108


## 20. Define Random Forest Hyperparameter Grid

Define hyperparameter grid for Random Forest tuning with GridSearchCV.

In [None]:
# Create Random Forest Pipeline
from sklearn.pipeline import Pipeline

if 'preprocessor' not in locals():
    print("‚è≥ WARNING: Preprocessor not yet available")
    print("\nPlease run previous cells to create the preprocessor:")
    print("  1. Cell 25 - Create numerical pipeline")
    print("  2. Cell 27 - Create categorical pipeline")
    print("  3. Cell 29 - Combine with ColumnTransformer")
    print("\nOnce the preprocessor is created, this cell will build the RF pipeline.")
else:
    try:
        rf_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
        ])

        print("Random Forest Pipeline created:")
        print("\nPipeline structure:")
        print(f"  Step 1: Preprocessor (ColumnTransformer)")
        print(f"  Step 2: Classifier (RandomForestClassifier)")
        print(f"\nRandom Forest Parameters:")
        print(f"  - n_estimators: 100 (default)")
        print(f"  - max_depth: None (default)")
        print(f"  - min_samples_split: 2 (default)")
        print(f"  - min_samples_leaf: 1 (default)")
        print(f"  - random_state: 42 (for reproducibility)")
        
    except Exception as e:
        print(f"‚ùå Error creating Random Forest Pipeline: {str(e)}")
        print("\nPlease ensure preprocessor is defined and all dependencies are complete")

NameError: name 'preprocessor' is not defined

## 19. Create Random Forest Pipeline

Create a new pipeline with Random Forest classifier for comparison with Logistic Regression.

In [None]:
# Evaluate Logistic Regression
from sklearn.metrics import precision_score, recall_score, f1_score

if 'y_pred_lr' not in locals() or 'y_pred_proba_lr' not in locals():
    print("‚è≥ WARNING: Predictions not yet available")
    print("\nPlease run Cell 58 (Make predictions on test set) first.")
    print("\nThis cell depends on:")
    print("  1. y_pred_lr - predictions from Logistic Regression")
    print("  2. y_pred_proba_lr - prediction probabilities")
    print("\nExpected wait time: 15-30 minutes for GridSearchCV to complete.")
else:
    try:
        accuracy_lr = accuracy_score(y_test, y_pred_lr)
        precision_lr = precision_score(y_test, y_pred_lr, pos_label='Yes')
        recall_lr = recall_score(y_test, y_pred_lr, pos_label='Yes')
        f1_lr = f1_score(y_test, y_pred_lr, pos_label='Yes')
        roc_auc_lr = roc_auc_score(y_test.map({'No': 0, 'Yes': 1}), y_pred_proba_lr)

        print("\n" + "=" * 80)
        print("LOGISTIC REGRESSION - MODEL EVALUATION")
        print("=" * 80)
        print(f"\nAccuracy:   {accuracy_lr:.4f}")
        print(f"Precision:  {precision_lr:.4f}")
        print(f"Recall:     {recall_lr:.4f}")
        print(f"F1-Score:   {f1_lr:.4f}")
        print(f"ROC-AUC:    {roc_auc_lr:.4f}")

        print(f"\n\nConfusion Matrix:")
        cm = confusion_matrix(y_test, y_pred_lr)
        print(cm)

        print(f"\n\nDetailed Classification Report:")
        print(classification_report(y_test, y_pred_lr))
        
    except Exception as e:
        print(f"‚ùå Error during evaluation: {str(e)}")
        print("\nPlease ensure:")
        print("  1. Predictions (y_pred_lr, y_pred_proba_lr) are available")
        print("  2. y_test data is available")
        print("  3. All previous cells have executed successfully")

NameError: name 'y_test' is not defined

## 18. Evaluate Logistic Regression Model

Evaluate the Logistic Regression model using accuracy, precision, recall, F1-score, and confusion matrix.

In [None]:
# Make predictions on test set
if 'grid_search_lr' not in locals():
    print("‚è≥ WARNING: Logistic Regression GridSearchCV not yet available")
    print("\nPlease run Cell 26 (Define LR Hyperparameter Grid) first.")
    print("\nDependency chain:")
    print("  1. Cell 31 - Build LR pipeline")
    print("  2. Cell 38 - Define LR hyperparameter grid")
    print("  3. Cell 36 - Configure LR GridSearchCV")
    print("  4. Cell 34 - Train LR GridSearchCV on training data")
    print("  5. Cell 58 - Make predictions on test set (current)")
    print("\nExpected wait time: 15-30 minutes for GridSearchCV to complete.")
else:
    try:
        y_pred_lr = grid_search_lr.predict(X_test)
        y_pred_proba_lr = grid_search_lr.predict_proba(X_test)[:, 1]

        print("=" * 80)
        print("LOGISTIC REGRESSION - TEST SET PREDICTIONS")
        print("=" * 80)
        print(f"\nTotal test samples: {len(y_test)}")
        print(f"Predicted churners: {sum(y_pred_lr == 'Yes')}")
        print(f"Predicted non-churners: {sum(y_pred_lr == 'No')}")
        print(f"\nFirst 10 predictions:")
        print(y_pred_lr[:10])
        print(f"\nFirst 10 prediction probabilities:")
        print(y_pred_proba_lr[:10])
        
    except Exception as e:
        print(f"‚ùå Error during prediction: {str(e)}")
        print("\nPlease ensure:")
        print("  1. grid_search_lr has completed training")
        print("  2. X_test and y_test are available")
        print("  3. All preprocessing has been completed")

NameError: name 'grid_search_lr' is not defined

## 17. Predict Churn on Test Dataset

Use the best trained Logistic Regression model to make predictions on the test dataset.