In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

#### Load the data

In [32]:
df = pd.read_csv('bank-full.csv',sep = ';')

In [33]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [34]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [35]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [36]:
# Select only the desired columns
selected_columns = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y'
]

In [37]:
df_selected = df[selected_columns]

In [38]:
df_selected.loc[:, 'y'] = (df_selected['y'] == 'yes').astype(int)

In [39]:
df_full_train, df_test = train_test_split(df_selected, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [40]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [41]:
y_train = df_train['y'].values
y_val = df_val['y'].values
y_test = df_test['y'].values

In [42]:
y_train , y_val , y_test

(array([0, 0, 0, ..., 0, 0, 0], dtype=object),
 array([0, 0, 0, ..., 0, 0, 0], dtype=object),
 array([0, 0, 0, ..., 0, 0, 1], dtype=object))

In [43]:
y_train = y_train.astype(int)
y_val = y_val.astype(int)
y_test = y_test.astype(int)

In [44]:
y_train , y_val , y_test

(array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 1]))

In [45]:
del df_train['y']
del df_val['y']
del df_test['y']

In [46]:
numerical_columns = list(df_train.dtypes[df_train.dtypes == 'int64'].index)
categorical_columns = list(df_train.dtypes[df_train.dtypes == 'object'].index)

In [47]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [48]:
val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

### Question 1

In [49]:
numerical_columns

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [50]:
from sklearn.metrics import roc_auc_score
auc_scores = {}

for col in numerical_columns:
    auc = roc_auc_score(y_train, df_train[col])
    
    if auc < 0.5:
        auc = roc_auc_score(y_train, -df_train[col])
    
    auc_scores[col] = auc

In [51]:
# Determine the variable with the highest AUC
highest_auc_col = max(auc_scores, key=auc_scores.get)
highest_auc_value = auc_scores[highest_auc_col]

# Print the AUC scores for each variable and the one with the highest AUC
print("AUC scores:", auc_scores)
print(f"Numerical variable with the highest AUC: {highest_auc_col} (AUC = {highest_auc_value:.4f})")


AUC scores: {'age': 0.512185717527344, 'balance': 0.5888313805382317, 'day': 0.525957882383908, 'duration': 0.8147002759670778, 'campaign': 0.5714543015682159, 'pdays': 0.5901276247352144, 'previous': 0.5985653242764153}
Numerical variable with the highest AUC: duration (AUC = 0.8147)


In [52]:
# The following code is just for testing purposes

from sklearn.metrics import roc_curve, auc

auc_scores_2 = {}
for col in numerical_columns:
    feature_values = df_train[col].values

    # Calculate the ROC curve
    fpr, tpr, thresholds = roc_curve(y_train, feature_values)

    # Calculate the AUC
    calculated_auc = auc(fpr, tpr)

    if calculated_auc < 0.5:
        # Invert feature values if AUC is less than 0.5
        fpr, tpr, thresholds = roc_curve(y_train, -feature_values)
        calculated_auc = auc(fpr, tpr)

    auc_scores_2[col] = calculated_auc

print("AUC scores:", auc_scores)


AUC scores: {'age': 0.512185717527344, 'balance': 0.5888313805382317, 'day': 0.525957882383908, 'duration': 0.8147002759670778, 'campaign': 0.5714543015682159, 'pdays': 0.5901276247352144, 'previous': 0.5985653242764153}


### Question 2

Apply one-hot-encoding using DictVectorizer and train the logistic regression with these parameters:\
LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)\
What's the AUC of this model on the validation dataset? (round to 3 digits)

In [53]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

In [54]:
y_pred_val_prob = model.predict_proba(X_val)[:, 1] # Probability of the positive class
auc = roc_auc_score(y_val, y_pred_val_prob)
round(auc, 3)

0.901

### Question 3

Now let's compute precision and recall for our model.\
Evaluate the model on all thresholds from 0.0 to 1.0 with step 0.01\
For each threshold, compute precision and recall\
Plot them and At which threshold precision and recall curves intersect?

In [55]:
from sklearn.metrics import precision_score, recall_score

In [59]:
precisons = []
recalls = []
thresholds = np.arange(0.0, 1.01, 0.01)

for threshold in thresholds:
    y_pred_val = (y_pred_val_prob >= threshold).astype(int)
    precisons.append(precision_score(y_val, y_pred_val , zero_division= 0))
    recalls.append(recall_score(y_val, y_pred_val , zero_division= 0))
    
# Plot precision and recall
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precisions, label='Precision', color='blue')
plt.plot(thresholds, recalls, label='Recall', color='orange')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs Threshold')
plt.legend()
plt.grid()
plt.axhline(0, color='black', lw=0.5, ls='--')
plt.axvline(0, color='black', lw=0.5, ls='--')
plt.show()

NameError: name 'precisions' is not defined

<Figure size 1000x600 with 0 Axes>