In [4]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv('Bank_Churn_recommendations.csv')

# Drop unnecessary columns
df = df.drop(['CustomerId', 'Surname'], axis=1, errors='ignore')

# Check for missing values
df = df.dropna()

# Separate features and target
X = df.drop('UpsellRecommendation', axis=1)
y = df['UpsellRecommendation']

# Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object' or col in ['Geography', 'Gender']]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing pipeline
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance using SMOTE on both sets
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
X_test_sm, y_test_sm = smote.fit_resample(X_test, y_test)

In [5]:
# Define base models
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss')
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

xgb_grid = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
rf_grid = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Create ensemble model
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_grid),
        ('rf', rf_grid)
    ],
    voting='soft'
)

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', ensemble_model)
])

# Train the model
pipeline.fit(X_train_sm, y_train_sm)

# Make predictions
y_pred = pipeline.predict(X_test_sm)

# Evaluate the model
accuracy = accuracy_score(y_test_sm, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_sm, y_pred))

print("\nClassification Report:")
print(classification_report(y_test_sm, y_pred))

Accuracy: 0.9667

Confusion Matrix:
[[716   0   0   0   0   0   0]
 [  0 685  26   0   0   5   0]
 [  0  45 610   0  50  11   0]
 [  0   0   0 716   0   0   0]
 [  0   0  29   0 687   0   0]
 [  0   1   0   0   0 715   0]
 [  0   0   0   0   0   0 716]]

Classification Report:
                                                                                          precision    recall  f1-score   support

               Cross-sell opportunity: Suggest savings or credit products with low fees.       1.00      1.00      1.00       716
Engagement incentive: Personalized offers or loyalty points for early tenure engagement.       0.94      0.96      0.95       716
                             General offer: Reward program or tailored financial review.       0.92      0.85      0.88       716
   Long-tenured customer: Recommend premium financial products or exclusive memberships.       1.00      1.00      1.00       716
  Mid-term tenure: Suggest insurance, fixed deposits, or personal loans

In [6]:
# Save the model and preprocessor
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(pipeline, 'model/upsell_ensemble_model.pkl')  
joblib.dump(preprocessor, 'model/preprocessor.pkl')

print("Model and preprocessor saved successfully!")

Model and preprocessor saved successfully!
