In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the semicolon-delimited bank-full.csv file into a DataFrame.
data = pd.read_csv('bank-full.csv', sep=';')

In [3]:
# Bind the loaded table to the working name `df`; later cells select columns
# from `df` and modify it, while `data` is not reassigned afterwards.
df = pd.DataFrame(data)

In [4]:
# Features kept for the analysis, followed by the target column 'y'.
columns = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y',
]

In [5]:
# Restrict df to the columns listed in `columns`, in that order.
df = df[columns]

In [6]:
# Number of missing values in each column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1

In [7]:
# Frequency of each education level, most common first.
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

Secondary is the mode

### Question 2

In [8]:
# Names of the integer-typed columns, in frame order.
numerical = df.select_dtypes(int).columns.tolist()
numerical

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [9]:
n = len(numerical)
dict = {}

for i in range(n - 1):
    var1 = numerical[i]
    for j in range(i + 1, n):
        var2 = numerical[j]
        corr = df[[var1]].corrwith(df[var2]).values[0]
        dict[f'{var1}-{var2}'] = corr 

In [10]:
pd.Series(dict).sort_values(ascending=False)

pdays-previous       0.454820
day-campaign         0.162490
age-balance          0.097783
balance-duration     0.021560
balance-previous     0.016674
age-campaign         0.004760
balance-day          0.004503
balance-pdays        0.003435
age-previous         0.001288
duration-previous    0.001203
duration-pdays      -0.001565
age-duration        -0.004648
age-day             -0.009120
balance-campaign    -0.014578
age-pdays           -0.023758
day-duration        -0.030206
campaign-previous   -0.032855
day-previous        -0.051710
duration-campaign   -0.084570
campaign-pdays      -0.088628
day-pdays           -0.093044
dtype: float64

pdays and previous have the highest correlation

### Target encoding

In [11]:
# Encode the target as 1 for 'yes' and 0 for anything else.
# The labels are read from the untouched `data` frame rather than from `df`,
# so re-running this cell reproduces the same column instead of comparing the
# already-encoded integers with 'yes' and turning the whole target into zeros.
df = df.assign(y=(data['y'] == 'yes').astype(int))

### Data splitting

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows as the test set; the seed keeps the split repeatable.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Split the remaining 80% into train and validation: 25% of it goes to
# validation, i.e. 20% of the full data set.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [15]:
# Pull the target column out of each split before it is removed from the
# feature frames.
y_train, y_val, y_test = df_train['y'], df_val['y'], df_test['y']

In [16]:
# Remove the target from every feature frame (in place) so it cannot be used
# as an input feature.
for split in (df_train, df_val, df_test):
    del split['y']

### Question 3: Mutual information

In [17]:
# Names of the object-typed columns of df, in frame order.
categorical = df.select_dtypes(object).columns.tolist()

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
def mutual_info_subscribe_score(series):
    """Return the mutual information between the training target and `series`.

    `series` is scored against the module-level `y_train`, so it must hold one
    value per training row.
    """
    score = mutual_info_score(labels_true=y_train, labels_pred=series)
    return score

In [20]:
# Mutual information of each categorical feature with the target, computed on
# the training split and shown from highest to lowest.
mi = pd.Series(
    {col: mutual_info_subscribe_score(df_train[col]) for col in categorical}
)
mi.round(3).sort_values(ascending=False)

poutcome     0.030
month        0.025
contact      0.013
housing      0.010
job          0.007
education    0.003
marital      0.002
dtype: float64

poutcome has the highest mutual information with the target variable

### Question 4

### One-hot encoding

In [21]:
from sklearn.feature_extraction import DictVectorizer

In [22]:
dv = DictVectorizer(sparse=False)

# The vectorizer learns its feature set from the training rows only ...
dict_train = df_train.to_dict(orient='records')
X_train = dv.fit_transform(dict_train)

# ... and that same mapping is then applied to validation and test.
dict_val = df_val.to_dict(orient='records')
X_val = dv.transform(dict_val)

dict_test = df_test.to_dict(orient='records')
X_test = dv.transform(dict_test)

### Logistic Regression Training

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Logistic regression with the liblinear solver, C=1.0 and a fixed seed,
# fitted on the one-hot encoded training matrix.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [25]:
# Predicted class labels for the validation feature matrix.
y_pred = model.predict(X_val)

In [26]:
# Share of validation rows predicted correctly, rounded to one decimal place.
np.round((y_pred == y_val).mean(), 1)

0.9

The accuracy is approximately 90% on the validation dataset

### Question 5

In [27]:
# Rebuild the full-feature matrices: fit the vectorizer on the training rows,
# then apply the same mapping to the validation rows.
dict_train = df_train.to_dict(orient='records')
X_train = dv.fit_transform(dict_train)

dict_val = df_val.to_dict(orient='records')
X_val = dv.transform(dict_val)

In [28]:
# Refit the existing `model` on the rebuilt full-feature training matrix.
model.fit(X_train, y_train)

In [29]:
# Predicted class labels for the validation matrix with all features present.
y_pred = model.predict(X_val)

In [30]:
# Validation accuracy with every feature included; the feature-elimination
# cell compares against this value.
global_accuracy = np.mean(y_pred == y_val)

In [31]:
# Feature elimination: drop one candidate feature at a time, retrain, and
# record how far validation accuracy moves from the all-features baseline
# (`global_accuracy`).
exclude = ['age', 'balance', 'marital', 'previous']
all_features = list(df_train.columns)
ans = {}

for feature in exclude:
    kept = [col for col in all_features if col != feature]

    dict_train = df_train[kept].to_dict(orient='records')
    dict_val = df_val[kept].to_dict(orient='records')

    # Fit the vectorizer on the training rows only and reuse that mapping for
    # validation. Refitting on the validation rows would derive the columns
    # from validation data, so they would not be guaranteed to match the
    # columns the model was trained on.
    X_train = dv.fit_transform(dict_train)
    X_val = dv.transform(dict_val)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = (y_pred == y_val).mean()

    # Absolute change in accuracy caused by removing this feature.
    ans[feature] = abs(global_accuracy - accuracy)

pd.Series(ans).sort_values()

age         0.000111
balance     0.000332
previous    0.000442
marital     0.000664
dtype: float64

Age has the smallest difference

### Question 6

In [32]:
# Candidate values for the regularisation parameter C passed to
# LogisticRegression.
C = [0.01, 0.1, 1, 10, 100]

dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')

ans = {}

# Fit the vectorizer on the training rows only and reuse that mapping for
# validation. Refitting on the validation rows would derive the columns from
# validation data, so they would not be guaranteed to match the columns the
# model was trained on.
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

# Train one model per C and record its validation accuracy.
for c in C:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    ans[c] = (y_pred == y_val).mean()

pd.Series(ans).sort_values(ascending=False)

100.00    0.901460
10.00     0.900907
1.00      0.900464
0.10      0.900354
0.01      0.897810
dtype: float64

C = 100 has the best accuracy