In [195]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [131]:
df=pd.read_csv('../data/processed_data.csv',usecols=lambda column:not column.startswith('Unnamed'))
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,614,Spain,Female,40,3,113348.5,1,1,1,77789.01,0
1,758,France,Female,34,1,154139.45,1,1,1,60728.89,0
2,541,Germany,Female,39,9,100116.67,1,1,1,199808.1,1
3,481,France,NotSpecified,37,8,152303.66,2,1,1,175082.2,0
4,711,France,Female,37,8,113899.92,1,0,0,80215.2,0


## Models Training
### Let's start by splitting our dataframe into separate objects:
### y for the target varibale
### X for the input features

In [134]:
x=df.drop('Exited',axis=1)
y=df['Exited']

In [136]:
print(x.shape,y.shape)

(10000, 10) (10000,)


In [138]:
num_col=x.select_dtypes(include='number').columns.to_list()
num_col

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [140]:
cat_col=x.select_dtypes(include='object').columns.to_list()
cat_col

['Geography', 'Gender']

In [142]:
def perc_target(a):
    counter=Counter(a)
    kv=[list(counter.keys()),list(counter.values())]
    df2=pd.DataFrame(np.array(kv).T,columns=['Exited','Count'])
    df2['Count']=df2['Count'].astype('int64')
    df2['%']=round(df2['Count']/a.shape*100,2)
    return df2.sort_values('Count',ascending=False)

In [144]:
perc_target(y)

Unnamed: 0,Exited,Count,%
0,0,7963,79.63
1,1,2037,20.37


## Create a Train Test Split
### We will continue with splitting our data into separate training and test sets.
### 30% of observations will be set aside for the test set
### the rest, 70%, will be used as the training set

In [147]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=10)

In [149]:
print(len(x_train),len(x_test),len(y_train),len(y_test))

7000 3000 7000 3000


In [151]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 5663 to 1289
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      7000 non-null   int64  
 1   Geography        7000 non-null   object 
 2   Gender           7000 non-null   object 
 3   Age              7000 non-null   int64  
 4   Tenure           7000 non-null   int64  
 5   Balance          7000 non-null   float64
 6   NumOfProducts    7000 non-null   int64  
 7   HasCrCard        7000 non-null   int64  
 8   IsActiveMember   7000 non-null   int64  
 9   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB


## Pre-processing Pipeline
### Scale numerical data and encode categorical data
#### Construct a pre-processing pipeline from the given transformers: MinMaxScaler and OneHotEncoder Create lists of indexes from the list of column names Need to be numeric not string to specify columns name in column transformer

In [156]:
num_feature=[]
for i in num_col:
    index=x.columns.get_loc(i)
    num_feature.append(index)

In [158]:
num_feature

[0, 3, 4, 5, 6, 7, 8, 9]

In [160]:
cat_feature=[]
for i in cat_col:
    index=x.columns.get_loc(i)
    cat_feature.append(index)

In [162]:
cat_feature

[1, 2]

In [168]:
preprocess=make_column_transformer(
    (MinMaxScaler(),num_feature),
    (OneHotEncoder(sparse_output=False),cat_feature)
)
preprocess

In [170]:
from sklearn.linear_model import LogisticRegression

In [172]:
model=make_pipeline(
    preprocess,
    SMOTE(random_state=10),
    LogisticRegression(random_state=10)

)

In [174]:
model

In [207]:
parameters={
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C' : [0.01, 0.05, 0.1, 0.5, 1, 5],
    'logisticregression__solver' : ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
}

In [178]:
from sklearn.model_selection import GridSearchCV

In [209]:
grid_model=GridSearchCV(model,parameters,cv=5,scoring='accuracy',verbose=3)

In [211]:
grid_model.fit(x_train,y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END logisticregression__C=0.01, logisticregression__penalty=l1, logisticregression__solver=liblinear;, score=0.656 total time=   0.0s
[CV 2/5] END logisticregression__C=0.01, logisticregression__penalty=l1, logisticregression__solver=liblinear;, score=0.674 total time=   0.0s
[CV 3/5] END logisticregression__C=0.01, logisticregression__penalty=l1, logisticregression__solver=liblinear;, score=0.650 total time=   0.0s
[CV 4/5] END logisticregression__C=0.01, logisticregression__penalty=l1, logisticregression__solver=liblinear;, score=0.694 total time=   0.0s
[CV 5/5] END logisticregression__C=0.01, logisticregression__penalty=l1, logisticregression__solver=liblinear;, score=0.638 total time=   0.0s
[CV 1/5] END logisticregression__C=0.01, logisticregression__penalty=l1, logisticregression__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END logisticregression__C=0.01, logisticregression__penalty=l1, logisticr

In [212]:
print(grid_model.best_params_)

{'logisticregression__C': 5, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'saga'}


In [215]:
print(grid_model.best_score_)

0.7118571428571429


In [221]:
print(f"Training Data Score: {grid_model.score(x_train, y_train)}")
print(f"Testing Data Score: {grid_model.score(x_test, y_test)}")

Training Data Score: 0.7121428571428572
Testing Data Score: 0.707


In [225]:
prediction=grid_model.predict(x_test)

In [231]:
pd.DataFrame({'Actual':y_test,'Predicted':prediction}).reset_index(drop=True)

Unnamed: 0,Actual,Predicted
0,1,0
1,0,0
2,0,0
3,0,0
4,0,1
...,...,...
2995,0,0
2996,0,0
2997,0,1
2998,1,1


In [237]:
print(confusion_matrix(y_test,prediction))

[[1667  687]
 [ 192  454]]


In [239]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.90      0.71      0.79      2354
           1       0.40      0.70      0.51       646

    accuracy                           0.71      3000
   macro avg       0.65      0.71      0.65      3000
weighted avg       0.79      0.71      0.73      3000



In [241]:
import joblib
filename='../Models/LogisticRegression.sav'
joblib.dump(grid_model,filename)

['../Models/LogisticRegression.sav']

In [243]:
model_lib=joblib.load(filename)
print(model_lib.score(x_test,y_test))

0.707
