In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

## Load Dataset
# Location of the Telco customer-churn CSV, relative to the notebook's working
# directory. Update the path to match your local environment.
DATA_PATH = "./data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)

In [2]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
## Preprocessing: Drop rows with missing or empty fields
# TotalCharges needs an explicit float conversion because some rows hold a
# blank string instead of a number. Coerce the column first (blank or otherwise
# non-numeric entries become NaN), then drop every incomplete row in one pass.
# Building a new frame with .assign() avoids writing into a filtered slice of
# df, which is what triggers pandas' SettingWithCopyWarning.
df = (
    df
    .assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))
    .dropna()
)

In [4]:
## Create Derived Feature
#  Flag customers with fiber optic + monthly charges above median
# NOTE(review): the median is taken over the whole frame, i.e. before the
# train/test split further down, so the flag is computed with test-row
# information — confirm this is acceptable.
is_fiber = df['InternetService'] == 'Fiber optic'
above_median_charge = df['MonthlyCharges'] > df['MonthlyCharges'].median()
df['HighValueFiber'] = (is_fiber & above_median_charge).astype(int)

## Encode Target and Features
# 'Yes'/'No' -> 1/0. The mapping also passes through values that are already
# 0/1, so re-running this cell does not turn an encoded column into NaN.
# Any other value still maps to NaN, exactly as before.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [5]:
# Model inputs: numeric columns only, for simplicity
feature_cols = ['MonthlyCharges', 'tenure', 'TotalCharges', 'HighValueFiber']
X, y = df[feature_cols], df['Churn']

## Train/Test Split
# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Scale Features
# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
## Train Random Forest Model
# 100 trees with a fixed seed; fit() returns the estimator itself, so the
# construction and the training can be chained.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train_scaled, y_train)

## Predict
# Class predictions for the held-out (scaled) test rows.
y_pred = model.predict(X_test_scaled)

In [7]:
## Evaluate
# Compute both summaries first, then print each one under its heading.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:", cm, sep="\n")
print("\nClassification Report:", report, sep="\n")

Confusion Matrix:
[[896 137]
 [205 169]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      1033
           1       0.55      0.45      0.50       374

    accuracy                           0.76      1407
   macro avg       0.68      0.66      0.67      1407
weighted avg       0.74      0.76      0.75      1407



In [10]:
import importlib
import mlops

# Reload BEFORE binding the names below: a `from mlops import ...` executed
# ahead of the reload would keep referring to the objects of the pre-reload
# module.
importlib.reload(mlops)
from mlops import ModelRegistry, FeatureStore, ModelMonitoring

# Feature store over the preprocessed frame. The feature list is the same
# `feature_cols` the model was trained on, so the two cannot drift apart.
feature_store = FeatureStore(
    table_name="telco_churn_features",
    df=df,
    target_col="Churn",
    feature_cols=list(feature_cols),
    entity_cols=["customerID"],
    db_path="local/offline_datastore.duckdb",
    fs_path="feature_store.yaml",
)

# NOTE: "Cutomer" is misspelled, but these strings identify the experiment and
# the registered model that already exist in the registry, so they are kept
# verbatim.
model_registry = ModelRegistry(
    experiment_name="TelcoCutomerChurn",
    model_name="TelcoCutomerChurnModel",
    model=model,
    params={"n_estimators": 100, "random_state": 42},
)

# Monitoring is built from the feature store and the model registry.
model_monitoring = ModelMonitoring(
    feature_store,
    model_registry,
)

In [11]:
from mlops.data_validate import DataValidator


# Schema-level validation is currently disabled:
# DataValidator(df).validate()

# --- Data gate: stop before evaluating the model if the data checks fail ---
quality_metrics = feature_store.quality_metrics()
data_drift = feature_store.data_drift()

if not feature_store.is_data_quality_acceptable(quality_metrics, data_drift):
    raise RuntimeError("Data Quality not acceptable !!!")

# --- Model metrics and drift measurements ---
model_metrics = model_registry.evaluate(X_test_scaled, y_test)
prediction_drift = model_monitoring.prediction_drift()
concept_drift = model_monitoring.concept_drift()

# Order matters: the tuple is unpacked positionally into log_to_mlflow below.
metrics = (
    model_metrics,
    quality_metrics,
    data_drift,
    prediction_drift,
    concept_drift,
)

# Model-quality gate is currently disabled:
# if not model_registry.is_model_ready_to_register(*metrics):
#     raise Exception("Model quality not acceptable !!!")

# NOTE(review): the five example rows passed here are the raw (unscaled)
# feature columns, while the model was fitted on StandardScaler output —
# confirm which representation log_to_mlflow expects for its first argument.
model_registry.log_to_mlflow(df[feature_cols].head(5), "Churn", feature_cols, *metrics)

Data quality metrics: {'missing_values': 0, 'duplicate_rows': 0, 'feature_count': 4, 'sample_count': 7032}
Quality of the data is acceptable


Registered model 'TelcoCutomerChurnModel' already exists. Creating a new version of this model...
2025/06/16 09:48:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: TelcoCutomerChurnModel, version 2
Created version '2' of model 'TelcoCutomerChurnModel'.


Logged to MLflow
🏃 View run victorious-stoat-607 at: http://localhost:5001/#/experiments/1/runs/f82aeef363814701b216a919ed9bddf1
🧪 View experiment at: http://localhost:5001/#/experiments/1
