In [7]:
import pandas as pd
import numpy as np
import zipfile

In [4]:
data = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

In [16]:
!wget $data

--2024-10-14 21:27:33--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [            <=>     ] 999.85K   173KB/s    in 6.2s    

2024-10-14 21:27:41 (160 KB/s) - ‘bank+marketing.zip’ saved [1023843]



In [17]:
!unzip 'bank+marketing.zip'

Archive:  bank+marketing.zip
 extracting: bank.zip                
 extracting: bank-additional.zip     


In [18]:
!unzip bank.zip

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                


In [21]:
df = pd.read_csv('bank-full.csv', delimiter=';')

In [23]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [24]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


## Data Preparation

In [25]:
# format columns

In [26]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [27]:
df.iloc[:15]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [31]:
selected_columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y']

In [40]:
df = df[selected_columns]

In [41]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [42]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [43]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [51]:
numerical = list(df.dtypes[df.dtypes == 'int64'].index)

In [52]:
df[numerical]

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [53]:
df[['age', 'balance']].corr()

Unnamed: 0,age,balance
age,1.0,0.097783
balance,0.097783,1.0


In [54]:
df[['day', 'campaign']].corr()

Unnamed: 0,day,campaign
day,1.0,0.16249
campaign,0.16249,1.0


In [55]:
df[['day', 'pdays']].corr()

Unnamed: 0,day,pdays
day,1.0,-0.093044
pdays,-0.093044,1.0


In [56]:
df[['pdays', 'previous']].corr()

Unnamed: 0,pdays,previous
pdays,1.0,0.45482
previous,0.45482,1.0


In [57]:
## Target encoding

In [59]:
df.y.head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

In [60]:
df.y == 'yes'

0        False
1        False
2        False
3        False
4        False
         ...  
45206     True
45207     True
45208     True
45209    False
45210    False
Name: y, Length: 45211, dtype: bool

In [61]:
(df.y == 'yes').astype(int).head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [62]:
df.y = (df.y == 'yes').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.y = (df.y == 'yes').astype(int)


In [63]:
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [64]:
## Split the data

In [65]:
from sklearn.model_selection import train_test_split

In [67]:
df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,1
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,1
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,0


In [66]:
train_test_split(df, test_size=0.2, random_state=1)

[       age          job   marital  education  balance housing   contact  day  \
 22468   52   management   married   tertiary        0      no  cellular   22   
 6896    50       admin.    single  secondary      330     yes   unknown   28   
 28408   45  blue-collar   married    primary      300     yes  cellular   29   
 8481    38   technician   married  secondary       27     yes   unknown    3   
 28753   31     services    single  secondary      887      no  cellular   30   
 ...    ...          ...       ...        ...      ...     ...       ...  ...   
 43723   50   technician   married   tertiary     1830      no  cellular   14   
 32511   34   management  divorced   tertiary      528     yes  cellular   17   
 5192    48   management   married   tertiary     4191      no   unknown   21   
 12172   45       admin.   married    unknown     1783      no   unknown   20   
 33003   45  blue-collar   married    unknown      349     yes  cellular   17   
 
       month  duration  ca

In [75]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [76]:
df_full_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3344,41,blue-collar,married,primary,849,yes,unknown,15,may,72,1,-1,0,unknown,0
17965,49,technician,married,primary,1415,yes,cellular,30,jul,269,2,-1,0,unknown,0
18299,42,admin.,married,secondary,3842,no,cellular,31,jul,130,4,-1,0,unknown,0
10221,37,management,single,tertiary,-119,yes,unknown,11,jun,375,11,-1,0,unknown,0
32192,56,blue-collar,married,primary,3498,no,cellular,15,apr,264,2,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,44,housemaid,single,primary,1059,no,unknown,18,jun,2093,1,-1,0,unknown,1
44732,23,student,single,tertiary,508,no,cellular,8,sep,210,1,92,1,failure,0
38158,34,technician,divorced,tertiary,1317,yes,cellular,15,may,239,1,-1,0,unknown,0
860,33,retired,married,secondary,165,no,unknown,7,may,111,1,-1,0,unknown,0


In [80]:
df_test

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3776,40,blue-collar,married,secondary,580,yes,unknown,16,may,192,1,-1,0,unknown,0
9928,47,services,single,secondary,3644,no,unknown,9,jun,83,2,-1,0,unknown,0
33409,25,student,single,tertiary,538,yes,cellular,20,apr,226,1,-1,0,unknown,0
31885,42,management,married,tertiary,1773,no,cellular,9,apr,311,1,336,1,failure,0
15738,56,management,married,tertiary,217,no,cellular,21,jul,121,2,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13353,47,management,married,tertiary,1890,no,cellular,8,jul,161,1,-1,0,unknown,0
38732,32,blue-collar,single,secondary,217,yes,cellular,15,may,692,3,-1,0,unknown,1
5654,52,admin.,divorced,secondary,0,yes,unknown,26,may,206,1,-1,0,unknown,0
3779,40,admin.,divorced,secondary,783,yes,unknown,16,may,171,2,-1,0,unknown,0


In [81]:
len(df_full_train), len(df_test)

(36168, 9043)

In [82]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [83]:
len(df), len(df_train), len(df_val), len(df_test)

(45211, 27126, 9042, 9043)

In [87]:
del df_train['y']

In [89]:
del df_val['y']
del df_test['y']

In [90]:
## Mutual information

In [91]:
from sklearn.metrics import mutual_info_score

In [102]:
list(df_train.columns)

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [116]:
df_train.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
dtype: object

In [130]:
categorical = list(df_train.dtypes[df_train.dtypes == 'object'].index)

In [131]:
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [135]:
score = df_full_train[categorical].apply(mutual_info_churn_score)

In [138]:
score.round(2)

job          0.01
marital      0.00
education    0.00
housing      0.01
contact      0.01
month        0.02
poutcome     0.03
dtype: float64

# Logistic Regression

In [140]:
# one hot encoding

In [141]:
from sklearn.feature_extraction import DictVectorizer