In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')

## Load Analytical Base Table

In [6]:
# Load the processed table, keeping only columns whose name does not start with 'Unnamed'.
df = pd.read_csv(
    '../data/processed_data.csv',
    usecols=lambda name: not name.startswith('Unnamed'),
)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,614,Spain,Female,40,3,113348.5,1,1,1,77789.01,0
1,758,France,Female,34,1,154139.45,1,1,1,60728.89,0
2,541,Germany,Female,39,9,100116.67,1,1,1,199808.1,1
3,481,France,NotSpecified,37,8,152303.66,2,1,1,175082.2,0
4,711,France,Female,37,8,113899.92,1,0,0,80215.2,0


#### Separate the dataframe into features (x) and target (y)

In [17]:
# Target vector is the 'Exited' column; the feature matrix is every remaining column.
y = df['Exited']
x = df.drop(columns='Exited')
print(x.shape, y.shape)

(10000, 10) (10000,)


In [12]:
# Names of the numeric feature columns, in their original column order.
numeric_part = x.select_dtypes(include='number')
num_col = list(numeric_part.columns)
num_col

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [14]:
# Names of the object-typed (categorical) feature columns, in their original column order.
object_part = x.select_dtypes(include='object')
cat_col = list(object_part.columns)
cat_col

['Geography', 'Gender']

In [19]:
def perc_target(a):
    """Tabulate how often each distinct label occurs in ``a``.

    Parameters
    ----------
    a : sized iterable of hashable labels
        For example a pandas Series, a 1-D NumPy array or a plain list.

    Returns
    -------
    pandas.DataFrame
        One row per distinct label with the columns ``'Exited'`` (the label),
        ``'Count'`` (number of occurrences, int64) and ``'%'`` (share of all
        observations in percent, rounded to two decimals), sorted by
        ``'Count'`` in descending order.
    """
    counter=Counter(a)
    # Build the frame column by column so the labels keep their own dtype;
    # stacking labels and counts into one NumPy array coerces both to a
    # single common dtype.
    summary=pd.DataFrame({'Exited':list(counter.keys()),
                          'Count':list(counter.values())})
    summary['Count']=summary['Count'].astype('int64')
    # Divide by the number of observations as a scalar instead of the shape
    # tuple, so inputs without a ``.shape`` attribute (e.g. lists) work too.
    summary['%']=(summary['Count']/len(a)*100).round(2)
    return summary.sort_values('Count',ascending=False)

In [21]:
# Class distribution (counts and percentages) of the full target column.
perc_target(y)

Unnamed: 0,Exited,Count,%
0,0,7963,79.63
1,1,2037,20.37


### Create a Train Test Split

In [24]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no `stratify` argument is passed, so the class ratio in the train and test
# parts is not forced to match the roughly 80/20 ratio of the full target — consider stratify=y.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=10)
print(len(x_train),len(x_test),len(y_train),len(y_test))

7000 3000 7000 3000


In [26]:
# Column dtypes and non-null counts of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 5663 to 1289
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      7000 non-null   int64  
 1   Geography        7000 non-null   object 
 2   Gender           7000 non-null   object 
 3   Age              7000 non-null   int64  
 4   Tenure           7000 non-null   int64  
 5   Balance          7000 non-null   float64
 6   NumOfProducts    7000 non-null   int64  
 7   HasCrCard        7000 non-null   int64  
 8   IsActiveMember   7000 non-null   int64  
 9   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB


## Pre-processing Pipeline
#### Scale numerical data and encode categorical data
#### Construct a pre-processing pipeline from the given transformers: MinMaxScaler and OneHotEncoder
#### Create lists of indexes from the list of column names
#### Columns are passed to the column transformer as numeric positions rather than as name strings

In [34]:
# Positional index within x of every numeric column.
num_feature = [x.columns.get_loc(name) for name in num_col]

num_feature

[0, 3, 4, 5, 6, 7, 8, 9]

In [36]:
# Positional index within x of every categorical column.
cat_feature = [x.columns.get_loc(name) for name in cat_col]

cat_feature

[1, 2]

In [40]:
# Min-max scale the numeric columns and one-hot encode the categorical ones (dense output).
scaler = MinMaxScaler()
encoder = OneHotEncoder(sparse_output=False)
preprocessor = make_column_transformer(
    (scaler, num_feature),
    (encoder, cat_feature),
)
preprocessor

## Build Model Pipeline without SMOTE
##### To see the impact of SMOTE on our results, we will first build our pipeline without SMOTE

In [43]:
from sklearn.svm import SVC

# Baseline pipeline: preprocessing followed directly by the SVM classifier, with no resampling step.
classifier = SVC(random_state=10)
model = make_pipeline(preprocessor, classifier)
model

In [45]:
from sklearn.model_selection import GridSearchCV

# Search space for the pipeline's 'svc' step: 4 kernels x 5 C values x 4 gamma values.
# NOTE(review): scikit-learn documents gamma as a coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels only, so the 'linear' candidates presumably refit the same model once
# per gamma value — confirm, and consider a list of grids to avoid the redundant fits.
param_grid = {'svc__kernel' : ['linear', 'rbf', 'poly', 'sigmoid'],
              'svc__C': [0.0005,0.001, 0.01, 0.1, 0.5],
              'svc__gamma': [5, 1, 0.1, 0.01]}
# 5-fold cross-validated search run with 4 parallel jobs and verbose progress output.
# NOTE(review): no `scoring` is given, so candidates are presumably ranked by the
# estimator's default score (accuracy for classifiers) — confirm that this is the
# intended metric for a target that is roughly 80/20 imbalanced.
grid = GridSearchCV(model, param_grid, verbose=3, cv= 5, n_jobs=4)

In [47]:
# Run the cross-validated search on the training split only.
grid.fit(x_train,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [49]:
# Parameter combination selected by the search.
print(grid.best_params_)

{'svc__C': 0.5, 'svc__gamma': 5, 'svc__kernel': 'poly'}


In [51]:
# Cross-validated score of the selected parameter combination.
print(grid.best_score_)

0.8522857142857143


In [53]:
# Score the search's selected model on the training split and on the held-out test split.
print(f'Training Data Score : {grid.score(x_train,y_train)}')
print(f'Testing Data Score : {grid.score(x_test,y_test)}')

Training Data Score : 0.8802857142857143
Testing Data Score : 0.8443333333333334


In [55]:
# Predicted class labels for the test split; reused by the confusion matrix and report below.
prediction=grid.predict(x_test)
prediction

array([1, 0, 0, ..., 0, 0, 1], dtype=int64)

In [57]:
# Confusion matrix of true test labels against the baseline model's predictions.
confusion_matrix(y_test,prediction)

array([[2243,  111],
       [ 356,  290]], dtype=int64)

In [59]:
# Per-class precision, recall and F1 of the baseline model on the test split.
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.86      0.95      0.91      2354
           1       0.72      0.45      0.55       646

    accuracy                           0.84      3000
   macro avg       0.79      0.70      0.73      3000
weighted avg       0.83      0.84      0.83      3000



## Build Model Pipeline with SMOTE
##### Let's now check the impact of SMOTE on our results
##### We are going to use the Pipeline from the imblearn package in place of scikit-learn Pipeline.
##### It automatically re-samples when fit() is called on the pipeline, and does not re-sample the test data (when transform() or predict() is called).

In [69]:
# Same preprocessing and classifier as the baseline, with SMOTE oversampling inserted between them.
oversampler = SMOTE(random_state=10)
model_smote = make_pipeline(preprocessor, oversampler, SVC(random_state=10))
model_smote

In [73]:
# Same search settings as the baseline search, applied to the SMOTE pipeline.
grid_smote = GridSearchCV(model_smote, param_grid, verbose=3, cv=5, n_jobs=4)

In [75]:
# Run the cross-validated search for the SMOTE pipeline on the training split only.
grid_smote.fit(x_train,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [77]:
# Parameter combination selected by the SMOTE search.
print(grid_smote.best_params_)

{'svc__C': 0.1, 'svc__gamma': 5, 'svc__kernel': 'sigmoid'}


In [79]:
# Cross-validated score of the selected parameter combination for the SMOTE pipeline.
print(grid_smote.best_score_)

0.7849999999999999


In [81]:
# Score the SMOTE search's selected model on the training split and on the held-out test split.
print(f'Training Data Score : {grid_smote.score(x_train,y_train)}')
print(f'Testing Data Score : {grid_smote.score(x_test,y_test)}')

Training Data Score : 0.7852857142857143
Testing Data Score : 0.7726666666666666


In [85]:
# Predicted class labels for the test split from the SMOTE model; reused by the cells below.
prediction_smote=grid_smote.predict(x_test)
prediction_smote

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [87]:
# Confusion matrix of true test labels against the SMOTE model's predictions.
confusion_matrix(y_test,prediction_smote)

array([[2255,   99],
       [ 583,   63]], dtype=int64)

In [89]:
# Per-class precision, recall and F1 of the SMOTE model on the test split.
print(classification_report(y_test,prediction_smote))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87      2354
           1       0.39      0.10      0.16       646

    accuracy                           0.77      3000
   macro avg       0.59      0.53      0.51      3000
weighted avg       0.71      0.77      0.72      3000



## Save the Models

In [92]:
import joblib

filename_nos='../Models/svm_nos.sav'
filename_smote='../Models/svm_smote.sav'

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure the output folder exists first.
os.makedirs(os.path.dirname(filename_nos),exist_ok=True)

# Persist both fitted GridSearchCV objects (baseline and SMOTE variants).
joblib.dump(grid,filename_nos)
joblib.dump(grid_smote,filename_smote)

['../Models/svm_smote.sav']

## Loading the Model

In [95]:
# Reload both persisted searches and re-score them on the test split to check
# that the round trip through disk reproduces the scores printed earlier.
# NOTE: joblib.load unpickles the file, so only load model files you created or trust.
model_nos=joblib.load(filename_nos)
print(model_nos.score(x_test,y_test))

# Bound to a new name on purpose: `model_smote` already refers to the unfitted
# SMOTE pipeline that the GridSearchCV cell wraps, and rebinding it to the loaded
# search object would break that cell if it were re-run afterwards.
model_smote_loaded=joblib.load(filename_smote)
print(model_smote_loaded.score(x_test,y_test))

0.8443333333333334
0.7726666666666666
