In [24]:
!pip install  mlflow 

import sqlite3
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder




In [25]:
DB_PATH = "D:/BITS_SEM2/DMML_Assignment/Task6_FeatureStore/feature_store.db"   
TABLE_NAME = "feature_store"
TARGET_COL = "Churn"

In [26]:
conn = sqlite3.connect(DB_PATH)

df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn)
conn.close()

print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns")
print("Churn class distribution:")
display(df[TARGET_COL].value_counts())

Loaded 8456 rows and 10 columns
Churn class distribution:


Churn
No                                     5174
Yes                                    1869
b'\x00\x00\x00\x00\x00\x00\x00\x00'    1034
b'\x01\x00\x00\x00\x00\x00\x00\x00'     374
1                                         3
0                                         2
Name: count, dtype: int64

In [None]:

# --- Drop unnecessary columns ---
drop_cols = ["customerID"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# --- Clean Target Column ---
y = df[TARGET_COL].astype(str).str.strip().str.lower()

# Map different representations to 0/1
y = y.replace({
    "yes": 1, "no": 0,
    "1": 1, "0": 0,
    "true": 1, "false": 0,
    b"1": 1, b"0": 0,
    "\x00": 0,
    "\x00\x00\x00\x00\x00\x00\x00\x00": 0
})

# Convert to numeric, set invalids as NaN
y = pd.to_numeric(y, errors="coerce")

# Drop rows with invalid target
mask = y.notna()
dropped_rows = len(y) - mask.sum()
if dropped_rows > 0:
    print(f"Dropped {dropped_rows} rows due to invalid target values.")

X = df.drop(TARGET_COL, axis=1).loc[mask]
y = y.loc[mask].astype(int)

# --- One-hot encode categorical features ---
X = pd.get_dummies(X, drop_first=True)

# --- Sanity checks ---
print("Final Columns in X:", X.columns.tolist()[:10], "...")  # print only first 10
print("Target unique values:", y.unique())
print("Class distribution:\n", y.value_counts())
print("Any duplicate rows?:", df.duplicated().sum())

# --- Ensure target has at least 2 classes ---
if y.nunique() < 2:
    raise ValueError(f"Target column '{TARGET_COL}' must have at least 2 classes. Got: {y.unique()}")

# --- Scale features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Models ---
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
}

# --- Train, evaluate, save ---
for name, model in models.items():
    model.fit(X_scaled, y)
    y_pred = model.predict(X_scaled)

    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, zero_division=0)
    rec = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)

    print(f"{name} --> Acc:{acc:.3f} | Prec:{prec:.3f} | Recall:{rec:.3f} | F1:{f1:.3f}")

    # Save model and scaler
    joblib.dump(model, f"{name}.pkl")
    joblib.dump(scaler, f"{name}_scaler.pkl")


Dropped 1408 rows due to invalid target values.
Final Columns in X: ['TotalCharges', 'tenure_-1.2774445836787656', 'tenure_-1.2367242199587352', 'tenure_-1.1960038562387048', 'tenure_-1.1552834925186746', 'tenure_-1.1145631287986442', 'tenure_-1.0738427650786138', 'tenure_-1.0331224013585834', 'tenure_-0.9924020376385532', 'tenure_-0.9516816739185228'] ...
Target unique values: [0 1]
Class distribution:
 Churn
0    5176
1    1872
Name: count, dtype: int64
Any duplicate rows?: 193
LogisticRegression --> Acc:0.854 | Prec:0.759 | Recall:0.659 | F1:0.705
RandomForest --> Acc:0.988 | Prec:0.975 | Recall:0.978 | F1:0.977
