In [71]:
import pandas as pd
import numpy as np

In [72]:
# Read the semicolon-separated bank marketing file and preview the first rows
df = pd.read_csv("data/bank-full.csv", sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [73]:
# Columns used in this analysis (features plus the target `y`)
req_cols = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
    'y',
]

# .copy() makes the subset an independent frame, so later column assignments
# on `df` (such as re-encoding `y`) do not trigger SettingWithCopyWarning.
df = df[req_cols].copy()

### DATA PREPARATION

In [74]:
# Number of missing (NaN) entries in each column
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

- No missing values in the data

### QUESTION 1

In [75]:
# Most frequent value of the education column
df.education.mode()

0    secondary
Name: education, dtype: object

In [76]:
# Frequency of every education level, to cross-check the mode
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

### QUESTION 2

In [77]:
from pandas.api.types import is_numeric_dtype

# Partition the columns by dtype: numeric ones feed the correlation matrix,
# the remaining ones are treated as categorical.
numerical_cols = [col for col in df.columns if is_numeric_dtype(df[col])]

# Build the categorical list in DataFrame column order instead of via a set
# difference: iteration order of a set of strings changes between interpreter
# sessions, which made the order of any per-column output non-reproducible.
cat_cols = [col for col in df.columns if col not in numerical_cols]

In [78]:
# Pairwise correlation matrix of the numeric columns (pandas' default method)
corr_df = df.loc[:, numerical_cols].corr(method="pearson")

In [79]:
# Report the correlation for each feature pair of interest
for left, right in [('age', 'balance'), ('day', 'campaign'), ('day', 'pdays'), ('pdays', 'previous')]:
    print(f"correlation between {left} and {right}: {corr_df.loc[left, right]}")

correlation between age and balance: 0.09778273937134807
correlation between day and campaign: 0.1624902163261922
correlation between day and pdays: -0.09304407377294048
correlation between pdays and previous: 0.4548196354805043


### TARGET ENCODING

In [80]:
# Encode the target as integers: "yes" -> 1, "no" -> 0.
# Series.map is used instead of Series.replace because replace relies on an
# implicit object -> int downcast that pandas 2.2 deprecates (FutureWarning).
# Any label other than "yes"/"no" maps to NaN, so astype(int) raises instead
# of silently leaving a mixed-type target column.
df['y'] = df['y'].map({"yes": 1, "no": 0}).astype(int)

In [81]:
# Detach the target from the feature frame: pop returns the column and removes it from df
y = df.pop('y')

# The target must not be treated as a categorical feature
cat_cols.remove('y')

In [121]:
from sklearn.model_selection import train_test_split
# First hold out 20% of the rows as the test set ...
X_full_train, X_test, y_full_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)
# ... then carve a validation set with exactly as many rows as the test set out of the remainder.
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=len(X_test), random_state=42
)

In [122]:
# Sanity check: (rows, columns) of the train, validation and test partitions
print(*[part.shape for part in (X_train, X_val, X_test)])

(27125, 14) (9043, 14) (9043, 14)


### QUESTION 3

In [123]:
from sklearn.metrics import mutual_info_score
import warnings
# NOTE: this silences every warning for the rest of the session, not just this cell
warnings.filterwarnings("ignore")

# Mutual information between the target and each categorical feature (training split only)
for col in cat_cols:
    score = mutual_info_score(y_train, X_train[col])
    print(f"Mutual Information Score between y and {col} is {score}")

Mutual Information Score between y and marital is 0.002051023469964469
Mutual Information Score between y and job is 0.007316045349524453
Mutual Information Score between y and poutcome is 0.029532873038678076
Mutual Information Score between y and contact is 0.013357917272815392
Mutual Information Score between y and housing is 0.010341817487667673
Mutual Information Score between y and education is 0.002698335063014118
Mutual Information Score between y and month is 0.025090765672140206


### QUESTION 4

In [124]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [125]:
# One-hot encode the training rows via DictVectorizer (fitted on train only).
# NOTE: X_train is rebound from a DataFrame to the encoded feature matrix here,
# so this cell cannot be re-run without re-running the split first.
dv = DictVectorizer(sparse=False)

train_dict = X_train.to_dict("records")
X_train = dv.fit_transform(train_dict)

In [126]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no refit). X_val is rebound to the encoded matrix.
val_dict = X_val.to_dict("records")
X_val = dv.transform(val_dict)

In [127]:
# Baseline classifier: liblinear logistic regression with C=1.0 and a fixed seed
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

In [128]:
# Fit on the training matrix and predict labels for the validation matrix
# (fit returns the estimator itself, so the calls can be chained)
pred_val = model.fit(X_train, y_train).predict(X_val)

In [129]:
from sklearn.metrics import accuracy_score

In [130]:
# Validation accuracy of the baseline model; reused later as the reference value
acc_model = accuracy_score(y_true=y_val, y_pred=pred_val)
acc_model

0.9009178370009953

In [153]:
# Module-level result stores filled by linear_model_accuracy():
#   diff_lst -> (removed column, |baseline accuracy - accuracy|), one entry per feature-elimination run
#   lst      -> (C, accuracy), one entry per run on the full feature set
diff_lst = []
lst = []

def linear_model_accuracy(col_removed=None, C=1.0):
    """Train a logistic regression and score it on the validation split.

    Relies on notebook globals: `df` (feature frame), `y` (target) and
    `acc_model` (baseline validation accuracy used as the reference value).
    The train/validation/test split uses the same sizes and random_state as
    the baseline cells, so results are directly comparable.

    Parameters
    ----------
    col_removed : str or None
        Column of `df` to leave out before training. None keeps every column.
    C : float
        Inverse regularisation strength passed to LogisticRegression.

    Returns
    -------
    float
        Accuracy on the validation split.

    Side effects
    ------------
    Each run is recorded in exactly one result list, so the two experiments
    do not pollute each other when the notebook is run top to bottom:
    feature-elimination runs (col_removed given) append
    (col_removed, |acc_model - accuracy|) to `diff_lst`; full-feature runs
    append (C, accuracy) to `lst`.
    """
    # drop() returns a new frame, so the global df is never modified
    features = df if col_removed is None else df.drop(columns=[col_removed])

    X_full_train, X_test, y_full_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_full_train, y_full_train, test_size=X_test.shape[0], random_state=42
    )

    # Fit the encoder on the training rows only, then reuse it for validation
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(X_train.to_dict(orient="records"))
    X_val = dv.transform(X_val.to_dict(orient="records"))

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    accuracy = accuracy_score(y_val, model.predict(X_val))

    if col_removed is None:
        lst.append((C, accuracy))
    else:
        diff_lst.append((col_removed, abs(acc_model - accuracy)))

    return accuracy

In [145]:
# Refit once per feature, leaving that feature out each time; linear_model_accuracy
# appends each result to the module-level result lists
for col in list(df.columns):
    linear_model_accuracy(col_removed=col)

In [146]:
# Display the (removed column, absolute accuracy difference vs. baseline) pairs collected so far
diff_lst

[('age', 0.00022116554240847464),
 ('job', 0.00033174831361271195),
 ('marital', 0.00011058277120423732),
 ('education', 0.00011058277120423732),
 ('balance', 0.0),
 ('housing', 0.0008846621696340096),
 ('contact', 0.00033174831361271195),
 ('day', 0.0005529138560212976),
 ('month', 0.0008846621696340096),
 ('duration', 0.011611190976445918),
 ('campaign', 0.0018799071104722564),
 ('pdays', 0.00022116554240847464),
 ('previous', 0.00011058277120423732),
 ('poutcome', 0.007851376755501516)]

In [150]:
# Entry with the smallest accuracy difference (first one wins on ties).
# Compare the float values directly: np.array(diff_lst) would coerce the mixed
# (str, float) tuples into a string array, making argmin compare the numbers
# lexicographically as text (e.g. '1e-05' sorts after '0.5').
min(diff_lst, key=lambda pair: pair[1])

('balance', 0.0)

In [154]:
# Sweep the regularisation parameter C on the full feature set
for reg_strength in (0.01, 0.1, 1, 10, 100):
    linear_model_accuracy(C=reg_strength)

In [155]:
# Display the (C, validation accuracy) pairs collected so far
lst

[(0.01, 0.898595598805706),
 (0.1, 0.9006966714585868),
 (1, 0.9009178370009953),
 (10, 0.9016919163994249),
 (100, 0.9006966714585868)]

In [158]:
# Entry with the highest validation accuracy (first one wins on ties)
max(lst, key=lambda pair: pair[1])

(10, 0.9016919163994249)