In [4]:
import pandas as pd

df_ml = pd.read_excel("../data/processed/encoded/marketing_campaign_ml_ready.xlsx")
df_ml.head()


Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,Family_Status_Single_2,Family_Status_Single_3,Family_Status_Together_0,Family_Status_Together_1,Family_Status_Together_2,Family_Status_Together_3,Family_Status_Widow_0,Family_Status_Widow_1,Family_Status_Widow_2,Family_Status_YOLO_1
0,5524,0.621359,0.084832,0.0,0.0,0.585859,0.425318,0.442211,0.316522,0.664093,...,0,0,0,0,0,0,0,0,0,0
1,2174,0.592233,0.067095,0.5,0.5,0.383838,0.007368,0.005025,0.003478,0.007722,...,1,0,0,0,0,0,0,0,0,0
2,4141,0.699029,0.105097,0.0,0.0,0.262626,0.285332,0.246231,0.073623,0.428571,...,0,0,1,0,0,0,0,0,0,0
3,6182,0.883495,0.037471,0.5,0.0,0.262626,0.007368,0.020101,0.011594,0.03861,...,0,0,0,1,0,0,0,0,0,0
4,5324,0.854369,0.085065,0.5,0.0,0.949495,0.115874,0.21608,0.068406,0.177606,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_ml.columns

Index(['ID', 'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency',
       'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
       'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases',
       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
       'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
       'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Z_CostContact',
       'Z_Revenue', 'Response', 'Age', 'Total_Kids', 'Total_Spent',
       'Total_Purchases', 'Avg_Spent_per_Purchase', 'Customer_Tenure_Days',
       'Total_Campaign_Accepted', 'High_Spender', 'MntWines_Ratio',
       'MntFruits_Ratio', 'MntMeatProducts_Ratio', 'MntFishProducts_Ratio',
       'MntSweetProducts_Ratio', 'MntGoldProds_Ratio', 'Education_Basic',
       'Education_Graduation', 'Education_Master', 'Education_PhD',
       'Marital_Status_Alone', 'Marital_Status_Divorced',
       'Marital_Status_Married', 'Marital_Status_Single',
       'Marital_Status_Together', '

In [6]:
# Step 2: Separate features (X) and target (y)

# Target variable (change if you want to predict something else)
y = df_ml["Response"]

# Drop ID and target from features
X = df_ml.drop(columns=["ID", "Response"])


In [7]:
from sklearn.model_selection import train_test_split

# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      
    random_state=42,   
    stratify=y          
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (1792, 71)
Testing set size: (448, 71)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 4: Train a baseline model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predictions
y_pred = log_reg.predict(X_test)

# Evaluation
print("✅ Baseline Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Baseline Model Accuracy: 0.9620535714285714

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       381
           1       1.00      0.75      0.85        67

    accuracy                           0.96       448
   macro avg       0.98      0.87      0.92       448
weighted avg       0.96      0.96      0.96       448


Confusion Matrix:
 [[381   0]
 [ 17  50]]


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 5: Train a Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=200,        
    max_depth=None,         
    min_samples_split=2,     
    min_samples_leaf=1,      
    random_state=42,
    n_jobs=-1                
)

rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluation
print("✅ Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


✅ Random Forest Accuracy: 0.9375

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       381
           1       0.87      0.69      0.77        67

    accuracy                           0.94       448
   macro avg       0.91      0.83      0.87       448
weighted avg       0.94      0.94      0.93       448


Confusion Matrix:
 [[374   7]
 [ 21  46]]


In [10]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

# Step 4: Apply SMOTE on the training set only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 5: Set up parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],   
    'max_depth': [None, 10, 20, 30],  
    'min_samples_split': [2, 5, 10],   
    'min_samples_leaf': [1, 2, 4],     
    'class_weight': ['balanced']       
}

# Step 6: Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Step 7: Grid Search with cross-validation
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',       
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_resampled, y_train_resampled)

# Step 8: Best model
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Step 9: Predictions
y_pred = best_rf.predict(X_test)

# Step 10: Evaluation
print("✅ Tuned Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))




Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
✅ Tuned Random Forest Accuracy: 0.953125

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       381
           1       0.83      0.87      0.85        67

    accuracy                           0.95       448
   macro avg       0.90      0.92      0.91       448
weighted avg       0.95      0.95      0.95       448


Confusion Matrix:
 [[369  12]
 [  9  58]]


In [11]:
from sklearn.ensemble import RandomForestClassifier


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 6),
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', 'balanced_subsample']
}

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,          
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", random_search.best_params_)
best_rf = random_search.best_estimator_


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'class_weight': 'balanced_subsample', 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 223}


In [13]:
# Use the best estimator from RandomizedSearchCV
best_rf = random_search.best_estimator_

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9732142857142857

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98       381
           1       0.86      0.99      0.92        67

    accuracy                           0.97       448
   macro avg       0.93      0.98      0.95       448
weighted avg       0.98      0.97      0.97       448


Confusion Matrix:
 [[370  11]
 [  1  66]]


In [14]:
import joblib
joblib.dump(best_rf, 'best_model.joblib')
print("Model saved as 'best_model.joblib'")
features = X.columns.tolist() 
joblib.dump(features, 'model_features.joblib')
print("Feature list saved as 'model_features.joblib'")
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) 
joblib.dump(scaler, 'scaler.joblib')
print("Scaler saved as 'scaler.joblib'")

Model saved as 'best_model.joblib'
Feature list saved as 'model_features.joblib'
Scaler saved as 'scaler.joblib'
