<a href="https://colab.research.google.com/github/pradeepsai7/OIBSIP/blob/main/Project_2_Proposal_Level_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import joblib
import shap
import os

# --- Load the raw data, derive customer tenure, and drop duplicate rows ----
df = pd.read_csv("ifood_df.csv")
if 'Response' not in df.columns:
    # Nothing downstream can work without the target column.
    raise SystemExit("Target column 'Response' not found")
if 'DtCustomer' in df.columns:
    # Unparseable dates become NaT instead of raising.
    df['DtCustomer'] = pd.to_datetime(df['DtCustomer'], dayfirst=True, errors='coerce')
    if df['DtCustomer'].notnull().any():
        # Tenure is measured relative to the latest DtCustomer value in the file.
        max_date = df['DtCustomer'].max()
        df['CustomerTenureDays'] = (max_date - df['DtCustomer']).dt.days
        # Assign the filled column back: calling fillna(inplace=True) on the
        # `df[col]` selection acts on a temporary object and leaves `df`
        # unchanged when pandas Copy-on-Write is active.
        df['CustomerTenureDays'] = df['CustomerTenureDays'].fillna(df['CustomerTenureDays'].median())
df.drop_duplicates(inplace=True)
# --- Coerce column types, record dtype groups, and drop identifier columns -
count_cols = [
    'Kidhome', 'Teenhome', 'NumDealsPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebPurchases', 'NumWebVisitsMonth',
]
# Count columns: anything non-numeric or missing becomes 0, stored as integers.
for count_name in (name for name in count_cols if name in df.columns):
    parsed_counts = pd.to_numeric(df[count_name], errors='coerce')
    df[count_name] = parsed_counts.fillna(0).astype(int)

# Spend columns are every column whose name starts with 'Mnt'; missing -> 0.0.
spend_cols = [name for name in df.columns if name.startswith('Mnt')]
for spend_name in spend_cols:
    parsed_spend = pd.to_numeric(df[spend_name], errors='coerce')
    df[spend_name] = parsed_spend.fillna(0.0)

# Income is only coerced to numeric here; its missing values are left as NaN.
if 'Income' in df.columns:
    df['Income'] = pd.to_numeric(df['Income'], errors='coerce')

# Snapshot of the numeric (target excluded) and object-typed column names,
# taken before the identifier/date columns are removed.
numeric_cols = [
    name
    for name in df.select_dtypes(include=['int64', 'float64']).columns
    if name != 'Response'
]
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove the raw date and any identifier columns that are present.
drop_cols = [
    name
    for name in ['DtCustomer', 'ID', 'CustomerID', 'someIDcolumn']
    if name in df.columns
]
df = df.drop(columns=drop_cols, errors='ignore')
# Keys that are not columns of df are ignored by fillna.
df = df.fillna({'Education': 'Unknown', 'Marital': 'Unknown'})
# --- Derived features -------------------------------------------------------
# Total spend across all 'Mnt*' columns that are still present.
df['TotalSpent'] = df[[c for c in spend_cols if c in df.columns]].sum(axis=1)
purchase_cols = [c for c in ['NumDealsPurchases','NumCatalogPurchases','NumStorePurchases','NumWebPurchases'] if c in df.columns]
df['TotalPurchases'] = df[purchase_cols].sum(axis=1) if purchase_cols else 0
# Divide by 1 instead of 0 so rows without purchases get their raw spend
# rather than inf/NaN.
df['AvgSpendPerPurchase'] = df['TotalSpent'] / df['TotalPurchases'].replace(0,1)
# Number of earlier campaigns accepted; only built when all five flags exist.
accepted_cols = ['AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5']
if all(x in df.columns for x in accepted_cols):
    df['AcceptedPrior'] = df[accepted_cols].sum(axis=1)
# Sum whichever child-count columns exist. With neither present the sum is an
# all-zero Series, so HasChildren becomes 0 for every row; the previous
# expression reduced to a plain bool in that case and failed on `.astype`.
child_cols = [c for c in ['Kidhome','Teenhome'] if c in df.columns]
df['HasChildren'] = (df[child_cols].sum(axis=1) > 0).astype(int)
df['LogTotalSpent'] = np.log1p(df['TotalSpent'])
if 'Income' in df.columns:
    # Negative incomes are clipped to 0 so log1p stays defined.
    df['LogIncome'] = np.log1p(df['Income'].clip(lower=0))
# --- Build the model matrix: impute, one-hot encode, concatenate -----------
# NOTE(review): the imputers and the encoder below are fitted on every row,
# i.e. before the train/test split that follows, so held-out rows influence
# the medians and the category set.
features = [c for c in df.columns if c != 'Response']
X = df[features].copy()
y = df['Response'].astype(int).copy()
# Select by the generic 'number' kind instead of the exact int64/float64
# dtypes: `astype(int)` can yield int32 on some platforms, which the exact
# list would silently leave out of the feature set.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
num_names = numeric_features
cat_names = categorical_features
numeric_transformer = SimpleImputer(strategy='median')
categorical_transformer = SimpleImputer(strategy='constant', fill_value='Unknown')
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Numeric part: median-impute. Building the frame from the returned array
# gives it a fresh 0..n-1 index.
X_num = pd.DataFrame(numeric_transformer.fit_transform(X[num_names]), columns=num_names)
if cat_names:
    # Categorical part: fill missing values with 'Unknown' first, so they do
    # not end up as a literal 'nan' category once values are cast to str.
    X_cat_filled = pd.DataFrame(
        categorical_transformer.fit_transform(X[cat_names]), columns=cat_names
    ).astype(str)
    ohe.fit(X_cat_filled)
    ohe_cols = list(ohe.get_feature_names_out(cat_names))
    X_cat = pd.DataFrame(ohe.transform(X_cat_filled), columns=ohe_cols)
else:
    # No categorical columns: `ohe` stays unfitted and contributes nothing.
    ohe_cols = []
    X_cat = pd.DataFrame(index=X_num.index)
# Both parts share the same positional index, so they align row by row.
X_proc = pd.concat([X_num, X_cat], axis=1)
# --- Hold-out split, scaling, and hyper-parameter search -------------------
X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, stratify=y, random_state=42)
# Fit the scaler on the real training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
# SMOTE is a pipeline step rather than a one-off call before the search:
# the imblearn pipeline resamples only while fitting, so each CV training
# fold is oversampled on its own and every validation fold contains real
# rows only. Oversampling the whole training set up front lets synthetic
# neighbours of validation rows leak into the training folds and inflates
# the CV score used to pick hyper-parameters.
sm = SMOTE(random_state=42)
rf = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1)
pipeline = ImbPipeline([('smote', sm), ('model', rf)])
param_dist = {
    'model__n_estimators':[100,200,400],
    'model__max_depth':[None,8,16,24],
    'model__min_samples_split':[2,5,10],
    'model__min_samples_leaf':[1,2,4],
    'model__max_features':['sqrt','log2',0.2,0.5]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rs = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, scoring='roc_auc', cv=cv, n_jobs=-1, random_state=42, verbose=1)
rs.fit(X_train_scaled, y_train)
# Keep the resampled training set available under its original names for the
# reporting code further down; it is produced with the same sampler settings
# as the step inside the pipeline.
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)
# --- Score the tuned model on the held-out split ---------------------------
best = rs.best_estimator_
y_pred = best.predict(X_test_scaled)
# Use the positive-class probability when the estimator exposes one,
# otherwise fall back to the raw decision scores.
y_proba = (
    best.predict_proba(X_test_scaled)[:, 1]
    if hasattr(best, "predict_proba")
    else best.decision_function(X_test_scaled)
)
cm = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
report = classification_report(y_test, y_pred, output_dict=False)
# --- Save the evaluation plots under eda/ ----------------------------------
os.makedirs("eda", exist_ok=True)

# Confusion matrix as an annotated heat map.
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title("Confusion Matrix")
cm_fig.savefig("eda/confusion_matrix.png")
plt.close(cm_fig)

# ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
roc_ax.plot([0, 1], [0, 1], '--')
roc_ax.set_xlabel("FPR")
roc_ax.set_ylabel("TPR")
roc_ax.set_title("ROC Curve")
roc_ax.legend(loc="lower right")
roc_fig.savefig("eda/roc_curve.png")
plt.close(roc_fig)

# Precision-recall curve; its area is computed with recall on the x-axis.
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall, precision, label=f"PR AUC = {pr_auc:.4f}")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve")
pr_ax.legend(loc="lower left")
pr_fig.savefig("eda/pr_curve.png")
plt.close(pr_fig)
# --- Best-effort model explanations: feature importances and SHAP ----------
# Both steps are optional extras, so a failure is reported and skipped rather
# than aborting the run. print() is used because warnings are globally
# silenced at the top of this script.
fi = None
model_obj = None
try:
    model_obj = best.named_steps['model']
    importances = model_obj.feature_importances_
    feat_names = X_train_res.columns
    fi = pd.Series(importances, index=feat_names).sort_values(ascending=False)
    fi.head(30).to_csv("eda/feature_importances.csv")
except Exception as exc:
    print(f"Skipping feature importances: {exc!r}")
if model_obj is None:
    # Without the fitted model there is nothing for SHAP to explain.
    print("Skipping SHAP summary: fitted model step is unavailable")
else:
    try:
        explainer = shap.TreeExplainer(model_obj)
        shap_values = explainer.shap_values(X_test_scaled)
        # For a classifier the values come back per class: a list of arrays in
        # older SHAP releases, one (rows, features, classes) array in newer
        # ones. Keep the positive class (index 1) so summary_plot receives the
        # 2-D (rows, features) matrix it expects.
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        elif getattr(shap_values, "ndim", 2) == 3:
            shap_values = shap_values[:, :, 1]
        shap.summary_plot(shap_values, X_test_scaled, show=False)
        plt.tight_layout()
        plt.savefig("eda/shap_summary.png")
    except Exception as exc:
        print(f"Skipping SHAP summary: {exc!r}")
    finally:
        # Close the figure on failure too, so it does not stay open.
        plt.close()
# --- Persist the fitted objects and write the plain-text report ------------
# Everything needed to rebuild the feature matrix and score new rows is
# bundled into one pickle.
artifact = {
    'pipeline': best,
    'scaler': scaler,
    'ohe': ohe,
    'num_features': num_names,
    'cat_features': cat_names,
    'feature_columns': X_proc.columns.tolist(),
}
joblib.dump(artifact, "best_pipeline.pkl")

# Assemble the report text first, then write it in a single call.
report_parts = [
    "Classification Report\n",
    report + "\n\n",
    f"ROC AUC: {roc_auc}\n",
]
if fi is not None:
    # Feature importances are only included when they were computed.
    report_parts.append("\nTop feature importances:\n")
    report_parts.append(fi.head(30).to_string())
with open("report.txt", "w") as f:
    f.write("".join(report_parts))

print("ROC AUC:", roc_auc)
print("PR AUC:", pr_auc)
print(report)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
ROC AUC: 0.8832643656540957
PR AUC: 0.5502326534908203
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       343
           1       0.60      0.48      0.54        62

    accuracy                           0.87       405
   macro avg       0.75      0.71      0.73       405
weighted avg       0.86      0.87      0.87       405

