In [1]:
!pip install  mlflow 

import os
import sqlite3

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Collecting mlflow
  Downloading mlflow-3.3.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.1 (from mlflow)
  Downloading mlflow_skinny-3.3.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.1 (from mlflow)
  Downloading mlflow_tracing-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting waitress<4 (from mlflow)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.3.1->mlflow)
  Downloading databricks_sdk-0.64.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.3.1->mlflow)
  Downloading opentelemetry_sdk-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting sqlparse<1,>=0.4.0 (from mlflow-skinny==3.3.1->mlflow)
  Downloading sqlparse-0.5.3-py3-no

In [23]:
# Location of the SQLite feature store. Set the FEATURE_STORE_DB environment
# variable to point at a different file; when it is unset, the original
# hard-coded location is used so existing runs behave exactly as before.
DB_PATH = os.environ.get(
    "FEATURE_STORE_DB",
    "D:/BITS_SEM2/DMML_Assignment/Task6_FeatureStore/feature_store.db",
)
# Table inside the database that is read into the training DataFrame.
TABLE_NAME = "feature_store"
# Name of the label column used as the prediction target.
TARGET_COL = "Churn"

In [24]:
# Read the entire feature table into a DataFrame. The try/finally guarantees
# the SQLite handle is released even when the query fails (for example, the
# table is missing because DB_PATH pointed at a new, empty database file).
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn)
finally:
    conn.close()

# Quick sanity summary: table size and label balance.
print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns")
print("Churn class distribution:")
display(df[TARGET_COL].value_counts())

Loaded 5 rows and 10 columns
Churn class distribution:


Churn
1    3
0    2
Name: count, dtype: int64

In [26]:
# Preprocess the loaded table, split it, fit two classifiers, report metrics,
# and persist the fitted artifacts to the current working directory.

# Split parameters kept in one place so they are easy to tune.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# customerID is presumably a row identifier rather than a feature; drop it
# only when present so re-running this cell does not fail.
drop_cols = ["customerID"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Separate features and target
y = df[TARGET_COL]
X = df.drop(TARGET_COL, axis=1)

# One-hot encode categorical features (if any)
X = pd.get_dummies(X, drop_first=True)

# Check if target has at least 2 classes
if y.nunique() < 2:
    raise ValueError(f"Target column '{TARGET_COL}' must have at least 2 classes. Got: {y.unique()}")

# Split into train/test sets, stratified on the target so both sides keep the
# class ratio. Stratification raises ValueError on very small tables (e.g. a
# test set with fewer rows than classes); in that case fall back to a plain
# random split instead of aborting the notebook.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
except ValueError as err:
    print(f"Stratified split not possible ({err}); falling back to a random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

# Metrics computed on a handful of rows are not meaningful; show the sizes.
print(f"Train rows: {len(y_train)} | Test rows: {len(y_test)}")

# Scale features: fit on the training rows only, then apply to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The models below are trained on scaled inputs, so the fitted scaler must be
# saved alongside them for the .pkl models to be usable on raw features later.
joblib.dump(scaler, "scaler.pkl")

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "RandomForest": RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=RANDOM_STATE),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 reports 0 instead of warning when a metric's
    # denominator is empty (e.g. no positive predictions).
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"{name} --> Acc:{acc:.3f} | Prec:{prec:.3f} | Recall:{rec:.3f} | F1:{f1:.3f}")

    # Save model as .pkl
    joblib.dump(model, f"{name}.pkl")

LogisticRegression --> Acc:1.000 | Prec:1.000 | Recall:1.000 | F1:1.000
RandomForest --> Acc:1.000 | Prec:1.000 | Recall:1.000 | F1:1.000
