# In [None]:
# --- Imports ---
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# --- Load processed data ---
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv('../data/processed/retail_customers_cleaned.csv')

# --- Feature Engineering Example ---
# Spend normalised by tenure; the +1 keeps the denominator non-zero when
# Tenure is 0.
df['TotalSpendPerTenure'] = df['TotalSpend'] / (df['Tenure'] + 1)

# --- Train-Test Split ---
# Every column except the target is used as a feature.
# NOTE(review): the estimators below need numeric inputs -- confirm the
# cleaned CSV carries no identifier or string columns.
X = df.drop('Churn', axis=1)
y = df['Churn']
# 80/20 hold-out split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Model Training ---
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

rf.fit(X_train, y_train)
gb.fit(X_train, y_train)

# --- Evaluate ---
rf_preds = rf.predict(X_test)
gb_preds = gb.predict(X_test)

rf_acc = accuracy_score(y_test, rf_preds)
gb_acc = accuracy_score(y_test, gb_preds)

print(f"RandomForest Accuracy: {rf_acc:.3f}")
print(f"GradientBoosting Accuracy: {gb_acc:.3f}")

# --- Choose Best Model ---
# Strict comparison: on a tie the GradientBoosting model is selected.
# NOTE(review): the winner is picked using test-set accuracy, so the accuracy
# printed above is an optimistic estimate for the selected model; consider a
# separate validation split or cross-validation for model selection.
best_model = rf if rf_acc > gb_acc else gb
print(f"Selected Model: {type(best_model).__name__}")

# --- Save the model ---
model_path = '../model/churn_model.pkl'
# joblib.dump does not create missing parent directories; make sure the target
# directory exists so the trained model is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)
