In [5]:
# Cell 0 — reload df and do the minimal cleaning we already decided earlier
import pandas as pd
import numpy as np

# 1) Load your CSV (use the same path you used before)
df = pd.read_csv(r'C:\Users\allur\OneDrive\Desktop\project\customer_churn_sample_3000.csv')

# 2) Convert TotalCharges to numeric and fill any NaNs from conversion
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 3) Drop non-predictive ID if present
if 'customerID' in df.columns:
    df.drop('customerID', axis=1, inplace=True)

# 4) Trim whitespace in text columns
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].astype(str).str.strip()

# 5) Ensure target exists and is 0/1 (handle case-insensitive 'churn')
churn_candidates = [c for c in df.columns if c.lower() == 'churn']
assert churn_candidates, f"Couldn't find a 'Churn' column. Available columns: {list(df.columns)}"
if churn_candidates[0] != 'Churn':
    df.rename(columns={churn_candidates[0]:'Churn'}, inplace=True)

if df['Churn'].dtype == 'object':
    df['Churn'] = df['Churn'].map({'Yes':1, 'No':0}).astype(int)

print("df ready. shape:", df.shape)
print("Columns (first 10):", list(df.columns)[:10])


df ready. shape: (3000, 20)
Columns (first 10): ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup']


In [6]:
# Cell A — memory check
CANDIDATES = ['best_model','xgb','rf','lr','model']
present = {n: type(globals()[n]).__name__ for n in CANDIDATES if n in globals()}
print("Models present:", present)
print("'X_train' exists:", 'X_train' in globals())


Models present: {'lr': 'LogisticRegression', 'model': 'LogisticRegression'}
'X_train' exists: True


In [7]:
# Cell B — quick baseline to restore model + X_train
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 1) sanity: df must exist
assert 'df' in globals(), "DataFrame df not found. Re-run your cleaning cell(s)."

# 2) ensure target is numeric 0/1
assert 'Churn' in df.columns, "Expected a 'Churn' column."
if df['Churn'].dtype == 'object':
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0}).astype(int)

# 3) one-hot encode remaining object columns (exclude target)
obj_cols = df.select_dtypes(include='object').columns.tolist()
if 'Churn' in obj_cols:
    obj_cols.remove('Churn')
df_enc = pd.get_dummies(df, columns=obj_cols, drop_first=True)

# 4) split
X = df_enc.drop(columns=['Churn'])
y = df_enc['Churn']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 5) optional scaling for common numeric columns (if present)
num_cols = [c for c in ['tenure','MonthlyCharges','TotalCharges'] if c in X_train.columns]
scaler = None
if num_cols:
    scaler = StandardScaler()
    X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test.loc[:,  num_cols] = scaler.transform(X_test[num_cols])

# 6) train a quick baseline model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# expose variables your save-cell expects
model = lr  # your save-cell will find this
print("Baseline LogisticRegression trained; model + X_train ready ")
print("X_train shape:", X_train.shape)


Baseline LogisticRegression trained; model + X_train ready 
X_train shape: (2400, 30)


 -0.4884222 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
 -0.10860421 -1.48544442 -0.15608146  1.03084976  0.65103177  0.84094077
  1.3631905   1.5530995  -1.43796717  0.84094077  0.69850902  1.17328151
 -0.86824019 -1.34301267  1.41066775 -0.77328569  0.03382754 -1.20058093
  1.74300849 -1.24805818 -0.91571743 -0.72580844 -1.29553542  0.55607728
  0.65103177 -1.34301267 -0.63085394  0.93589527 -0.72580844  1.458145
  0.36616828 -1.01067193 -1.15310368 -0.63085394 -1.05814918  0.12878204
  0.65103177 -0.2035587  -1.10562643  0.65103177 -1.05814918 -0.53589945
 -0.77328569 -0.2985132   0.22373653  1.74300849 -0.82076294 -0.77328569
  1.69553125  1.26823601 -0.2035587  -0.4884222   0.79346352  1.12580426
 -0.01364971  0.41364553  1.648054   -0.01364971  0.84094077  0.12878204
 -0.3934677   0.69850902  1.458145    1.74300849 -0.2035587  -1.10562643
 -0.10860421 -0.67833119 -1.

In [8]:
# === Sprint 5 · Cell 1: Save artifacts (model, scaler, feature schema) ===
from pathlib import Path
import joblib, json, time

# 1) pick your trained model variable automatically
CANDIDATES = ['best_model', 'xgb', 'rf', 'lr', 'model']
model = None
for name in CANDIDATES:
    if name in globals():
        model = globals()[name]
        print(f"Using trained model variable: {name}")
        break
assert model is not None, f"No trained model variable found. Expected one of: {CANDIDATES}"

# 2) get feature columns in the exact order used for training
assert 'X_train' in globals(), "X_train not found. Keep X_train in memory (with the final encoded columns) before saving artifacts."
feature_columns = list(X_train.columns)

# 3) optional scaler if you used one
scaler_obj = globals().get('scaler', None)

# 4) make models/ and build stamped filenames
Path("models").mkdir(exist_ok=True)
stamp = time.strftime("%Y%m%d_%H%M%S")
model_path  = f"models/model_{stamp}.pkl"
scaler_path = f"models/scaler_{stamp}.pkl" if scaler_obj is not None else None
schema_path = f"models/columns_{stamp}.json"

# 5) save artifacts
joblib.dump(model, model_path)
if scaler_obj is not None:
    joblib.dump(scaler_obj, scaler_path)

schema = {
    "feature_columns": feature_columns,
    "model_path": model_path,
    "scaler_path": scaler_path,
    "created_at": stamp
}
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump(schema, f, indent=2)

print("Saved:")
print(" -", model_path)
if scaler_path: print(" -", scaler_path)
print(" -", schema_path)


Using trained model variable: lr
Saved:
 - models/model_20250907_182632.pkl
 - models/scaler_20250907_182632.pkl
 - models/columns_20250907_182632.json
