Importing required packages and libraries

In [2]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

### TODO Recording:

- Please note whatever version of scikit-learn you have here
- This is the version of scikit-learn used to create the model
- When we deploy to a container we need to install the same version of scikit-learn (specified in the requirements.txt file)

In [13]:
import sklearn

# Display the scikit-learn version used to build the model so the same
# version can be pinned in requirements.txt for the deployment container.
sklearn.__version__

'1.3.2'

Load the dataset. Source: https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers?select=churn.csv

In [3]:
# Read the churn data from the CSV file in the working directory.
churn_df = pd.read_csv(filepath_or_buffer='churn.csv')

# Preview the first five rows.
churn_df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,Yes,Yes,101348.88,Yes
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,No,Yes,112542.58,No
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,Yes,No,113931.57,Yes
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,No,No,93826.63,No
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,Yes,Yes,79084.1,No


In [4]:
# Summarize the columns: dtype and non-null count for each, plus memory usage.
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  object 
 11  IsActiveMember   10000 non-null  object 
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  object 
dtypes: float64(2), int64(6), object(6)
memory usage: 1.1+ MB


In [5]:
# Target vector: the 'Exited' column.
y = churn_df['Exited']

# Feature matrix: everything except the target and the RowNumber / CustomerId /
# Surname columns.
X = churn_df.drop(['Exited', 'RowNumber', 'CustomerId', 'Surname'], axis='columns')

In [6]:
# Name of the target column; used when building the feature lists to keep the
# target out of them.
label = 'Exited'

Here we are defining our numeric and categorical features

In [7]:
# Split the feature columns by dtype so each group gets its own preprocessing.
#
# The target is excluded with `!=` rather than `in`: `label` is a string, so
# `column not in label` would be a substring test and would silently drop any
# column whose name happens to be a substring of the label.
categorical_features = [
    column
    for column in X.select_dtypes(include=['object']).columns
    if column != label
]

numeric_features = [
    column
    for column in X.select_dtypes(include=['float64', 'int64']).columns
    if column != label
]

print(f"Categorical features: {categorical_features}")
print(f"Numeric features: {numeric_features}")

Categorical features: ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
Numeric features: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']


Data is split and dimensions are checked

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Show the dimensions of both partitions.
(X_train.shape, X_test.shape)

((8000, 10), (2000, 10))

The model pipeline is defined together with its data preprocessing steps. Imputers for missing values are included in case the dataset contains null values.

In [9]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. Binary columns collapse to a single indicator
# (drop='if_binary'), and categories unseen during fit are ignored rather than
# raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='if_binary')),
])

# Route each group of columns to its transformer.
transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]

preprocessor = ColumnTransformer(transformers)

# Preprocessing and classifier are bundled in one pipeline so the exact same
# transformations are applied at fit and at predict time.
# The forest is seeded (same seed as the train/test split) so that re-running
# the notebook yields the same fitted model and the same metrics.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=123)),
])

Model is trained

In [10]:
# Fit the preprocessing steps and the classifier on the training partition.
model_pipeline.fit(X=X_train, y=y_train)

In [11]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Predict on the held-out partition.
predictions = model_pipeline.predict(X_test)

# The labels are the strings 'Yes'/'No', so the binary metrics need to be told
# which one is the positive class.
positive_class = {'pos_label': 'Yes'}

test_accuracy_score = accuracy_score(y_test, predictions)
test_precision_score = precision_score(y_test, predictions, **positive_class)
test_recall_score = recall_score(y_test, predictions, **positive_class)
test_f1_score = f1_score(y_test, predictions, **positive_class)

# Per-class breakdown, printed after the headline numbers.
report = classification_report(y_test, predictions)

print(f'Accuracy: {test_accuracy_score:.3f}')
print(f'Precision: {test_precision_score:.3f}')
print(f'Recall: {test_recall_score:.3f}')
print(f'F1 Score: {test_f1_score:.3f}')

print(report)

Accuracy: 0.862
Precision: 0.780
Recall: 0.464
F1 Score: 0.582
              precision    recall  f1-score   support

          No       0.87      0.97      0.92      1586
         Yes       0.78      0.46      0.58       414

    accuracy                           0.86      2000
   macro avg       0.83      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000



Model is pickled 

In [12]:
import pickle

model_filename = 'churn_pred_model.pkl'

# Serialize the fitted pipeline (preprocessing + classifier) to disk.
# Loading it back requires the same scikit-learn version that produced it.
with open(model_filename, mode='wb') as output_stream:
    pickle.Pickler(output_stream).dump(model_pipeline)