In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
print("Imported")

Imported


In [84]:
bank_df = pd.read_csv('bank/bank-full.csv', sep=';')
print(bank_df.head(3))
print(f"Number of rows: {len(bank_df)}") 

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
Number of rows: 45211


In [93]:
correlations = x.corrwith(y)
print(correlations.sort_values(ascending=False))

duration             0.394521
pdays                0.103621
previous             0.093236
education_encoded    0.066241
balance              0.052838
marital_encoded      0.045588
job_encoded          0.040438
age                  0.025155
default_encoded     -0.022419
month_encoded       -0.024471
day                 -0.028348
loan_encoded        -0.068185
campaign            -0.073172
poutcome_encoded    -0.077840
housing_encoded     -0.139173
contact_encoded     -0.148395
dtype: float64


In [86]:
print(bank_df.columns.tolist())

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [87]:
#check if any columns have null values
missing_counts = bank_df.isnull().sum()
print(missing_counts[missing_counts > 0])
print(bank_df.groupby('poutcome').count())

Series([], dtype: int64)
            age    job  marital  education  default  balance  housing   loan  \
poutcome                                                                       
failure    4901   4901     4901       4901     4901     4901     4901   4901   
other      1840   1840     1840       1840     1840     1840     1840   1840   
success    1511   1511     1511       1511     1511     1511     1511   1511   
unknown   36959  36959    36959      36959    36959    36959    36959  36959   

          contact    day  month  duration  campaign  pdays  previous      y  
poutcome                                                                     
failure      4901   4901   4901      4901      4901   4901      4901   4901  
other        1840   1840   1840      1840      1840   1840      1840   1840  
success      1511   1511   1511      1511      1511   1511      1511   1511  
unknown     36959  36959  36959     36959     36959  36959     36959  36959  


In [88]:
#drop duration feature
bank_df.drop('duration', axis=1)
bank_df.info

<bound method DataFrame.info of        age           job   marital  education default  balance housing loan  \
0       58    management   married   tertiary      no     2143     yes   no   
1       44    technician    single  secondary      no       29     yes   no   
2       33  entrepreneur   married  secondary      no        2     yes  yes   
3       47   blue-collar   married    unknown      no     1506     yes   no   
4       33       unknown    single    unknown      no        1      no   no   
...    ...           ...       ...        ...     ...      ...     ...  ...   
45206   51    technician   married   tertiary      no      825      no   no   
45207   71       retired  divorced    primary      no     1729      no   no   
45208   72       retired   married  secondary      no     5715      no   no   
45209   57   blue-collar   married  secondary      no      668      no   no   
45210   37  entrepreneur   married  secondary      no     2971      no   no   

         contact  d

In [89]:
#hot encode values
#binary columns
for col in ['default', 'housing', 'loan', 'y']:
    bank_df[col + "_encoded"] = bank_df[col].map({'yes': 1, 'no': 0})

#categorical columns
for col in ['job', 'marital', 'education', 'contact', 'month', 'poutcome']:
    encoder = LabelEncoder()
    bank_df[col + "_encoded"] = encoder.fit_transform(bank_df[col])
#print(bank_df['job'].unique())
#print(bank_df['job_encoded'].unique())

In [94]:
#drop predictor column and have it as a separate df
x = bank_df.drop(['default', 'housing', 'loan', 'y', 'job', 'marital', 'education', 'contact', 'month', 'poutcome', 'y_encoded'], axis=1)
y = bank_df['y_encoded']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)
print(f"Number of rows in x_train: {len(x_train)}, Number of rows in x_test: {len(x_test)}")

Number of rows in x_train: 31647, Number of rows in x_test: 13564


In [95]:
print(x_train.info)

<bound method DataFrame.info of        age  balance  day  duration  campaign  pdays  previous  \
42867   53     6571    3       208         1     -1         0   
27173   50      231   21       397         2     -1         0   
8875    39      518    4       608         2     -1         0   
18330   42      634   31       451         2     -1         0   
8306    58     1604    2       358         3     -1         0   
...    ...      ...  ...       ...       ...    ...       ...   
5520    54     -630   23       173         1     -1         0   
35814   47      603    8      1080         1     -1         0   
20463   58     1463   12       110         3     -1         0   
18638   26        1   31        14        20     -1         0   
35683   59     6237    8        63         1    169         2   

       default_encoded  housing_encoded  loan_encoded  job_encoded  \
42867                0                0             0            4   
27173                0                1        

In [97]:
#creating the random forest model
random_forest_model = RandomForestClassifier(random_state=5)
random_forest_model.fit(x_train, y_train)
y_pred = random_forest_model.predict(x_test)
print("Done creating model")

Done creating model


In [98]:
#checking accuracy of random forest model
accuracy_rf = accuracy_score(y_test, y_pred)
precision_rf = precision_score(y_test, y_pred)
recall_rf = recall_score(y_test, y_pred)
f1_rf = f1_score(y_test, y_pred)
cm_rf = confusion_matrix(y_test, y_pred)
print(accuracy)
print(precision)
print(f1)
print(cm)

0.9068858743733412
0.6690140845070423
0.5129193983802546
[[11636   329]
 [  934   665]]


In [100]:
#creating the logistic regression model
log_model = LogisticRegression(random_state=5,max_iter=10000)
log_model.fit(x_train, y_train)
y_pred = log_model.predict(x_test)
print("Done creating model")

Done creating model


In [101]:
#checking accuracy of logistic regression model
accuracy_log = accuracy_score(y_test, y_pred)
precision_log = precision_score(y_test, y_pred)
recall_log = recall_score(y_test, y_pred)
f1_log = f1_score(y_test, y_pred)
cm_log = confusion_matrix(y_test, y_pred)
print(accuracy)
print(precision)
print(f1)
print(cm)

0.8925833087584784
0.6334586466165414
0.3162834350070389
[[11770   195]
 [ 1262   337]]
