## Model Building and Comparison

### Overview
In this file, we focused on building machine learning models for predicting bank customer churn. The steps included:

1. **Pipeline Construction**
   - Constructed a scikit-learn pipeline to streamline the model building process.

2. **Model Building**
   - Implemented Logistic Regression, Random Forest, and Gradient Boosting models.

3. **Model Comparison**
   - Compared the performance of these models using appropriate evaluation metrics.

4. **Hyperparameter Tuning**
   - Applied hyperparameter tuning techniques to optimize the performance of the best model.



In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score

In [58]:
df = pd.read_csv("data/Churn_Modelling.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [59]:
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

## Custom Preprocessor

In [60]:
def feature_engineering(x):
    x_copy = x.copy()
    
    x_copy['AgeGroup'] = x_copy['Age'] // 10 * 10
    x_copy['Balance_per_Product'] = x_copy['Balance'] / x_copy['NumOfProducts']
    x_copy['CreditScore_Balance_Interaction'] = x_copy['CreditScore'] * x_copy['Balance']
    x_copy['Age_Tenure_Interaction'] = x_copy['Age'] * x_copy['Tenure']
    x_copy['Balance_to_Salary_Ratio'] = x_copy['Balance'] / x_copy['EstimatedSalary']
    x_copy['Geo_Gender'] = x_copy['Geography'] + "_" + x_copy['Gender']
    x_copy['Average_Product_Holding_Duration'] = x_copy['Tenure'] / x_copy['NumOfProducts']
    x_copy['Products_Per_Tenure'] = x_copy['NumOfProducts'] / (x_copy['Tenure'] + 0.0001)

    x_copy.drop(['HasCrCard', 'Tenure', 'Gender'], axis=1, inplace=True)
    return x_copy

FeatureEngineering = FunctionTransformer(feature_engineering)

## ColumnTransformer

In [61]:
# Creating a column transformer for preprocessing

numerical_columns = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 'Balance_per_Product', 'CreditScore_Balance_Interaction',
                     'Age_Tenure_Interaction', 'Balance_to_Salary_Ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(), ['Geography', 'Geo_Gender'])
    ], 
    remainder='passthrough'
)

## Pipeline

In [62]:
pipeline = Pipeline([
    ('featur_eng', FeatureEngineering),
    ('preprocessor', preprocessor)
])

In [63]:
X = df.drop(columns='Exited')
y = df['Exited']

In [64]:
X_transformed = pipeline.fit_transform(X, y)

## Balancing Data: SMOTE

In [65]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_transformed, y)

In [66]:
X_resampled.shape

(15926, 15)

In [48]:
models = {
    "log_reg": LogisticRegression(max_iter=10000),
    "random_forest": RandomForestClassifier(),
    "gradient_boosting": GradientBoostingClassifier()
}

### Training LR, RF, GB

In [49]:
res = []
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    
    res.append([model_name, scores.mean(), scores.std(), test_accuracy])

In [50]:
model_summary = pd.DataFrame(res, columns=['model', 'accuracy', 'accuracy_std', 'test_accuracy'])
model_summary

Unnamed: 0,model,accuracy,accuracy_std,test_accuracy
0,log_reg,0.711381,0.005636,0.70339
1,random_forest,0.90471,0.002068,0.909918
2,gradient_boosting,0.874019,0.006562,0.877276


## Hyperparameter Tuning

In [86]:
rf_cv_params = {'n_estimators' : [int(x) for x in np.linspace(50, 150, 10)],
                'max_depth':[2, 7, 12, 15, 18, 20, None],
                'min_samples_leaf': [1, 2, 4],
                'min_samples_split': [2, 5]}

scores = {'accuracy': 'accuracy' , 'precision':'precision', 'recall' : 'recall', 'f1':'f1'}

In [87]:
# Instantiate random forest model.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
rf = RandomForestClassifier()

kfold = KFold(n_splits=3, shuffle=True, random_state=42)
rf_val = GridSearchCV(rf, rf_cv_params, cv=kfold, scoring = scores ,  refit='f1', n_jobs = -1, verbose=3)

# Fit the model
rf_val.fit(X_train , y_train)

Fitting 3 folds for each of 420 candidates, totalling 1260 fits


In [89]:
#getting the best paramaters to be used with Random forest
rf_val.best_params_

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 150}

In [92]:
#getting the best paramaters to be used with Random forest
rf_val.best_estimator_

In [91]:
y_pred = rf_val.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred)

0.9077212806026366

***Note: 90.77% accuracy on balanced data***

In [94]:
model_pipeline = Pipeline([
    ('pipeline', pipeline),
    ('model', rf_val.best_estimator_)
])

In [95]:
model_pipeline

In [98]:
df = pd.read_csv("data/Churn_Modelling.csv")
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

X = df.drop(columns='Exited')
y = df['Exited']

y_pred = model_pipeline.predict(X)
accuracy_score(y, y_pred)

0.9745

## Exporting

In [99]:
import pickle

In [100]:
# Save the model_pipeline to a file
with open('data/model_pipeline.pkl', 'wb') as file:
    pickle.dump(model_pipeline, file)