In [18]:
import pandas as pd 
import numpy as np

In [19]:
df = pd.read_csv('bank-full.csv')

In [20]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [7]:
del df['default']

### Question 1

In [25]:
df.education.mode()

0    secondary
Name: education, dtype: object

### Question 2

In [27]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [28]:
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [29]:
df[numerical_columns].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


### Question 3

In [30]:
df.y = (df.y == 'yes').astype(int)

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [34]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)

In [35]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [36]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [37]:
del df_train['y']
del df_val['y']
del df_test['y']

In [38]:
df_full_train = df_full_train.reset_index(drop=True)

In [47]:
categorical_columns = ['job', 'marital', 'education', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [40]:
from sklearn.metrics import mutual_info_score

In [41]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.y)

In [48]:
mi = df_full_train[categorical_columns].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
loan         0.002509
education    0.002458
marital      0.002019
dtype: float64

### Question 4

In [49]:
from sklearn.feature_extraction import DictVectorizer

In [51]:
train_dicts = df_train[categorical_columns +  numerical_columns].to_dict(orient='records')

In [52]:
dv = DictVectorizer(sparse=False)

In [53]:
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [54]:
val_dicts = df_val[categorical_columns +  numerical_columns].to_dict(orient='records')

In [55]:
X_val = dv.transform(val_dicts)

In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [58]:
model.fit(X_train, y_train)

In [64]:
y_pred = model.predict(X_val)

In [62]:
from sklearn.metrics import accuracy_score

In [66]:
round(accuracy_score(y_val, y_pred), 2)


0.9

### Question 5

In [75]:
cols = categorical_columns + numerical_columns
accuracy = {}
col = 1
while col < len(cols):
    remaining_cols = cols[col : len(cols)]
    train_dicts = df_train[remaining_cols].to_dict(orient='records')
    dv.fit(train_dicts)
    X_train = dv.transform(train_dicts)
    val_dicts = df_val[remaining_cols].to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy[cols[col]] = round(accuracy_score(y_val, y_pred), 2)
    col+=1
accuracy
    

{'marital': 0.9,
 'education': 0.9,
 'housing': 0.9,
 'loan': 0.9,
 'contact': 0.9,
 'month': 0.9,
 'poutcome': 0.9,
 'age': 0.89,
 'balance': 0.89,
 'day': 0.89,
 'duration': 0.89,
 'campaign': 0.88,
 'pdays': 0.88,
 'previous': 0.88}

### Question 6

In [77]:
C = [0.01, 0.1, 1, 10, 100]
accuracy = {}
for c in C:
    train_dicts = df_train[categorical_columns +  numerical_columns].to_dict(orient='records')
    dv.fit(train_dicts)
    X_train = dv.transform(train_dicts)
    X_val = dv.transform(val_dicts)
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy[c] = round(accuracy_score(y_val, y_pred), 3)

accuracy
    

{0.01: 0.882, 0.1: 0.882, 1: 0.882, 10: 0.882, 100: 0.882}