<a href="https://colab.research.google.com/github/spencer18001/machine-learning-zoomcamp/blob/main/03/hw_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!wget -O bank+marketing.zip wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
!unzip bank+marketing.zip && unzip bank.zip 'bank-full.csv' -d .

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# The file is semicolon-delimited, so the separator must be given explicitly.
df_path = "bank-full.csv"
df = pd.read_csv(df_path, sep=";")
# Preview the first rows.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(45211, 17)

In [None]:
# Keep only the columns used in this analysis; 'y' (the target) stays last
# because later cells address the target positionally with iloc[:, -1].
# .copy() makes the result an independent frame, so later assignments to it
# are unambiguous and do not trigger SettingWithCopyWarning.
df = df[['age', 'job', 'marital', 'education', 'balance',
         'housing', 'contact', 'day', 'month', 'duration',
         'campaign', 'pdays', 'previous', 'poutcome', 'y']].copy()
df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [None]:
# Column dtypes: object columns are the categorical features, int64 the numeric ones.
df.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
balance,int64
housing,object
contact,object
day,int64
month,object
duration,int64


In [None]:
# Count missing values per column.
df.isnull().sum() # no missing values

Unnamed: 0,0
age,0
job,0
marital,0
education,0
balance,0
housing,0
contact,0
day,0
month,0
duration,0


In [None]:
# Most frequent value of the 'education' column.
df.education.mode() # Q1

Unnamed: 0,education
0,secondary


In [None]:
# Pairwise Pearson correlations between the numeric feature columns.
# The last column (the target) is left out before correlating.
corr_matrix = df.drop(columns=df.columns[-1]).corr(numeric_only=True)
corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [None]:
# Boolean mask selecting the strict upper triangle (k=1 skips the diagonal),
# so each unordered feature pair is considered exactly once and
# self-correlations of 1.0 are ignored.
corr_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_matrix_upper_tri = corr_matrix.abs().where(corr_mask)
# stack() drops the masked-out NaN cells; idxmax() returns the (row, column)
# label pair of the largest absolute correlation.
max_corr_features = corr_matrix_upper_tri.stack().idxmax()
# Look the winning value up by its labels instead of stacking a second time.
max_corr = corr_matrix_upper_tri.loc[max_corr_features]
max_corr_features, max_corr # Q2

(('pdays', 'previous'), 0.4548196354805043)

In [None]:
# Class balance of the target (last column).
df.iloc[:, -1].value_counts()

Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
no,39922
yes,5289


In [None]:
# Encode the target as integers: "yes" -> 1, "no" -> 0.
# A new frame is built with assign() rather than writing through iloc: an
# in-place write into the existing column can leave it with object dtype,
# and writing into a frame derived from another frame may raise
# SettingWithCopyWarning.
# The dtype guard makes the cell safe to re-run; without it a second run
# would map the already-encoded 0/1 values to NaN.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df = df.assign(y=df["y"].map({"yes": 1, "no": 0}).astype("int64"))

In [None]:
def data_split(df, random_state, test_size=0.2, val_size=0.2):
    """Split ``df`` into train, validation and test frames.

    The split happens in two stages: ``test_size`` of all rows is held out as
    the test set, then ``val_size`` of the *remaining* rows becomes the
    validation set. With the defaults (0.2 and 0.2) the overall proportions
    are therefore 64% / 16% / 20%, not 60/20/20; pass ``val_size=0.25`` for
    an overall 60/20/20 split.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    random_state : int
        Seed passed to both ``train_test_split`` calls.
    test_size : float, default 0.2
        Fraction of ``df`` held out for the test set.
    val_size : float, default 0.2
        Fraction of the non-test rows held out for the validation set.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test)``.
    """
    df_full_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    df_train, df_val = train_test_split(
        df_full_train, test_size=val_size, random_state=random_state
    )
    return df_train, df_val, df_test

In [None]:
# Fixed seed so the train/validation/test partition is reproducible.
df_train, df_val, df_test = data_split(df, random_state=42)

In [None]:
# Mutual information between each categorical feature and the target,
# computed on the training split only.
cat_cols = df.iloc[:, :-1].select_dtypes(include=['object']).columns
mi_scores_raw = [
    mutual_info_score(df_train[col], df_train.iloc[:, -1]) for col in cat_cols
]
# Rounded copy for display only.
mi_scores = [round(score, 2) for score in mi_scores_raw]

# Pick the winner from the unrounded scores: after rounding to 2 decimals
# several features can tie, and argmax would then just return the first one.
cat_cols[np.argmax(mi_scores_raw)], list(zip(cat_cols, mi_scores)) # Q3

('poutcome',
 [('job', 0.01),
  ('marital', 0.0),
  ('education', 0.0),
  ('housing', 0.01),
  ('contact', 0.01),
  ('month', 0.02),
  ('poutcome', 0.03)])

In [None]:
def logistic_regression_fit(df, C):
    """Fit a logistic regression on ``df`` and return the fitted model.

    All columns except the last are treated as features and one-hot encoded
    with a ``DictVectorizer``; the last column is the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data, target in the last column.
    C : float
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    LogisticRegression
        The fitted model. The fitted vectorizer is attached to it as
        ``model.dict_vectorizer_`` so that new data can be encoded into the
        same feature space the model was trained on.
    """
    dv = DictVectorizer(sparse=False)
    train_dict = df.iloc[:, :-1].to_dict(orient='records')
    X = dv.fit_transform(train_dict)

    # `.values` is a property, not a method: calling it raised TypeError.
    # Rebuilding the array from a plain list lets NumPy infer a concrete
    # dtype even if the column is stored as object; scikit-learn may reject
    # object-dtype targets.
    y = np.asarray(df.iloc[:, -1].tolist())

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X, y)
    model.dict_vectorizer_ = dv
    return model

In [None]:
def accuracy(y, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y``.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y``.

    Returns
    -------
    float
        Share of matching labels, between 0 and 1.
    """
    # Compare against the `y` argument; the previous version read the global
    # `y_val` and ignored the labels it was given.
    return (np.asarray(y) == np.asarray(y_pred)).mean()

In [None]:
def logistic_regression_predict(model, df):
    """Score ``model`` on ``df`` and return its accuracy.

    All columns except the last are treated as features; the last column
    holds the true labels.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict``. If it carries a fitted vectorizer in a
        ``dict_vectorizer_`` attribute, that vectorizer is used to encode
        ``df``.
    df : pandas.DataFrame
        Evaluation data, target in the last column.

    Returns
    -------
    float
        Accuracy of the predictions on ``df``.
    """
    records = df.iloc[:, :-1].to_dict(orient='records')

    dv = getattr(model, "dict_vectorizer_", None)
    if dv is not None:
        # Encode with the vectorizer the model was trained with, so columns
        # line up even when a category is missing from the evaluation data.
        X = dv.transform(records)
    else:
        # No stored vectorizer: fit one on the evaluation data. This only
        # lines up with the model if `df` contains exactly the same set of
        # feature values as the training data.
        dv = DictVectorizer(sparse=False)
        X = dv.fit_transform(records)

    y_pred = model.predict(X)
    # `.values` is a property, not a method: calling it raised TypeError.
    return accuracy(df.iloc[:, -1].to_numpy(), y_pred)

In [None]:
# Baseline: train on all features and score on the validation split.
model = logistic_regression_fit(df_train, C=1.0)
acc_full = logistic_regression_predict(model, df_val)
round(acc_full, 2) # Q4

In [None]:
# Leave-one-feature-out: retrain without each feature in turn and record how
# far validation accuracy moves away from the all-features baseline.
features = df_train.iloc[:, :-1].columns
accuracies_diff = []
for col in features:
    # Drop the same column from both splits so train and validation match.
    model = logistic_regression_fit(df_train.drop([col], axis=1), C=1.0)
    acc = logistic_regression_predict(model, df_val.drop([col], axis=1))
    accuracies_diff.append(acc_full - acc)

# accuracies_diff is a plain list, which has no .abs(); use np.abs instead.
# The feature with the smallest absolute difference is the least useful one.
features[np.argmin(np.abs(accuracies_diff))], list(zip(features, accuracies_diff)) # Q5

In [None]:
# Sweep the C parameter of the logistic regression and compare validation
# accuracy for each value.
accuracies_r = []
C_list = [0.01, 0.1, 1, 10, 100]
for C in C_list:
    # Train on the full feature set. The previous version dropped a stale
    # `col` left over from an earlier loop while scoring on all validation
    # columns, so the train and validation features did not match.
    model = logistic_regression_fit(df_train, C=C)
    acc = logistic_regression_predict(model, df_val)
    accuracies_r.append(acc)

# np.argmax returns the first maximum; C_list is ascending, so ties resolve
# to the smallest C.
C_list[np.argmax(accuracies_r)], accuracies_r # Q6