In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [4]:
raw_data =pd.read_csv("bank-full.csv", delimiter=';')
raw_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 4.1+ MB


In [7]:
raw_data.shape

(45211, 17)

In [8]:
raw_data['job'].value_counts()

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64

In [9]:
raw_data['marital'].value_counts()

married     27214
single      12790
divorced     5207
Name: marital, dtype: int64

In [10]:
raw_data['education'].value_counts()

secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64

In [11]:
raw_data['default'].value_counts()

no     44396
yes      815
Name: default, dtype: int64

In [12]:
raw_data['contact'].value_counts()

cellular     29285
unknown      13020
telephone     2906
Name: contact, dtype: int64

In [13]:
raw_data['day'].value_counts()

20    2752
18    2308
21    2026
17    1939
6     1932
5     1910
14    1848
8     1842
28    1830
7     1817
19    1757
29    1745
15    1703
12    1603
13    1585
30    1566
9     1561
11    1479
4     1445
16    1415
2     1293
27    1121
3     1079
26    1035
23     939
22     905
25     840
31     643
10     524
24     447
1      322
Name: day, dtype: int64

In [14]:
raw_data['campaign'].value_counts()

1     17544
2     12505
3      5521
4      3522
5      1764
6      1291
7       735
8       540
9       327
10      266
11      201
12      155
13      133
14       93
15       84
16       79
17       69
18       51
19       44
20       43
21       35
22       23
25       22
23       22
24       20
28       16
29       16
26       13
31       12
27       10
32        9
30        8
33        6
34        5
35        4
36        4
38        3
43        3
37        2
50        2
41        2
51        1
58        1
39        1
55        1
44        1
46        1
63        1
Name: campaign, dtype: int64

In [15]:
raw_data['pdays'].value_counts()

-1      36954
 182      167
 92       147
 91       126
 183      126
        ...  
 465        1
 529        1
 18         1
 434        1
 831        1
Name: pdays, Length: 559, dtype: int64

In [16]:
raw_data['previous'].value_counts()

0      36954
1       2772
2       2106
3       1142
4        714
5        459
6        277
7        205
8        129
9         92
10        67
11        65
12        44
13        38
15        20
14        19
17        15
16        13
19        11
20         8
23         8
18         6
22         6
27         5
24         5
21         4
29         4
25         4
30         3
26         2
37         2
28         2
38         2
40         1
275        1
51         1
55         1
35         1
32         1
58         1
41         1
Name: previous, dtype: int64

In [17]:
raw_data['poutcome'].value_counts()

unknown    36959
failure     4901
other       1840
success     1511
Name: poutcome, dtype: int64

In [18]:
raw_data['balance'].value_counts()

 0        3514
 1         195
 2         156
 4         139
 3         134
          ... 
 16869       1
-467         1
 8649        1
 4527        1
 3662        1
Name: balance, Length: 7168, dtype: int64

In [19]:
raw_data['balance'].describe()

count     45211.000000
mean       1362.272058
std        3044.765829
min       -8019.000000
25%          72.000000
50%         448.000000
75%        1428.000000
max      102127.000000
Name: balance, dtype: float64

In [20]:
raw_data['balance']=pd.cut(raw_data['balance'],bins=[-3313,0,10000,20000,30000,40000,50000,58544],labels=[0,1,2,3,4,5,6])
raw_data['balance']

0        1
1        1
2        1
3        1
4        1
        ..
45206    1
45207    1
45208    1
45209    1
45210    1
Name: balance, Length: 45211, dtype: category
Categories (7, int64): [0 < 1 < 2 < 3 < 4 < 5 < 6]

In [21]:
raw_data['balance'].value_counts()

1    37102
0     7275
2      636
3      141
4       24
5       10
6        8
Name: balance, dtype: int64

In [22]:
raw_data['duration'].describe()

count    45211.000000
mean       258.163080
std        257.527812
min          0.000000
25%        103.000000
50%        180.000000
75%        319.000000
max       4918.000000
Name: duration, dtype: float64

In [23]:
raw_data['duration']=pd.cut(raw_data['duration'],bins=[1,500,1000,1500,2000,2500,3000,3500],labels=[0,1,2,3,4,5,6])
raw_data['duration']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    0
45208    2
45209    1
45210    0
Name: duration, Length: 45211, dtype: category
Categories (7, int64): [0 < 1 < 2 < 3 < 4 < 5 < 6]

In [24]:
raw_data['duration'].value_counts()

0    39835
1     4313
2      831
3      168
4       36
6       11
5        9
Name: duration, dtype: int64

In [25]:
raw_data.isnull().sum()

age           0
job           0
marital       0
education     0
default       0
balance      15
housing       0
loan          0
contact       0
day           0
month         0
duration      8
campaign      0
pdays         0
previous      0
poutcome      0
y             0
dtype: int64

In [26]:
raw_data=raw_data.drop(columns=['campaign','pdays','previous','poutcome','contact'],axis=1)

In [27]:
raw_data=raw_data.dropna(axis=0)

In [28]:
raw_data=raw_data[['y','age','job','marital','education','default','balance','housing','loan','day','month','duration']]

In [29]:
raw_data

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,day,month,duration
0,no,58,management,married,tertiary,no,1,yes,no,5,may,0
1,no,44,technician,single,secondary,no,1,yes,no,5,may,0
2,no,33,entrepreneur,married,secondary,no,1,yes,yes,5,may,0
3,no,47,blue-collar,married,unknown,no,1,yes,no,5,may,0
4,no,33,unknown,single,unknown,no,1,no,no,5,may,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45206,yes,51,technician,married,tertiary,no,1,no,no,17,nov,1
45207,yes,71,retired,divorced,primary,no,1,no,no,17,nov,0
45208,yes,72,retired,married,secondary,no,1,no,no,17,nov,2
45209,no,57,blue-collar,married,secondary,no,1,no,no,17,nov,1


In [30]:
bank=pd.get_dummies(raw_data,columns=['job','marital','education','default','balance','housing','loan','day','month','duration'])

In [32]:
label_encoder = preprocessing.LabelEncoder()
bank['y']=label_encoder.fit_transform(bank['y'])
bank

Unnamed: 0,y,age,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,...,month_nov,month_oct,month_sep,duration_0,duration_1,duration_2,duration_3,duration_4,duration_5,duration_6
0,0,58,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,44,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,33,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,47,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,33,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,1,51,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
45207,1,71,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
45208,1,72,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
45209,0,57,0,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [33]:
X=bank.iloc[:,1:]
Y=bank.iloc[:,0]

In [34]:
X

Unnamed: 0,age,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,month_nov,month_oct,month_sep,duration_0,duration_1,duration_2,duration_3,duration_4,duration_5,duration_6
0,58,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,44,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,33,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,47,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,33,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
45207,71,0,0,0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
45208,72,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
45209,57,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [35]:
Y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45188, dtype: int32

In [36]:
Y.value_counts()

0    39902
1     5286
Name: y, dtype: int64

# Model Building

In [39]:
model=LogisticRegression()
model.fit(X,Y)

In [40]:
model.intercept_

array([-0.03676557])

In [41]:
model.coef_

array([[ 9.94393444e-04,  1.72711505e-01, -2.19097488e-01,
        -3.51541113e-01, -3.97568871e-01, -3.55341259e-02,
         4.20592611e-01, -1.65630564e-01, -2.06623059e-02,
         6.74687537e-01, -1.37475947e-02,  5.37583104e-02,
        -1.50854322e-01,  4.70109447e-03, -1.61727258e-01,
         1.24139744e-01, -2.44777730e-01, -5.81315721e-03,
         2.41605237e-01, -2.39007700e-02,  5.60405910e-02,
        -8.89270111e-02, -3.65417336e-01,  1.22188689e-01,
         2.22826289e-01,  4.02375019e-03, -4.04347187e-02,
        -5.53357980e-03,  2.94604865e-02,  3.52011126e-01,
        -3.84897546e-01,  2.30907606e-01, -2.63794026e-01,
         3.13151953e-01, -4.87025163e-02,  1.45779942e-01,
         2.06565204e-01, -1.98101025e-01, -2.41795160e-01,
        -2.15613910e-01,  4.36022474e-02, -6.64242869e-02,
         7.12313417e-01,  1.08154001e-01,  4.86001742e-01,
         6.36046095e-01,  1.88139289e-01,  3.61698773e-01,
        -1.13571370e-01, -7.06616313e-01, -9.46854676e-0

In [42]:
Y_pred=model.predict(X)

In [43]:
Y_pred_df=pd.DataFrame({"Actaual_Y":Y,"Predicted_Y":Y_pred})
Y_pred_df

Unnamed: 0,Actaual_Y,Predicted_Y
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
45206,1,0
45207,1,0
45208,1,1
45209,0,0


In [44]:
confusion_matrix = confusion_matrix(Y,Y_pred)

In [45]:
confusion_matrix

array([[38866,  1036],
       [ 4048,  1238]], dtype=int64)

In [50]:
accuracy_score(Y,Y_pred)

0.8874922545808622

In [51]:
print(classification_report(Y,Y_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94     39902
           1       0.54      0.23      0.33      5286

    accuracy                           0.89     45188
   macro avg       0.73      0.60      0.63     45188
weighted avg       0.86      0.89      0.87     45188

