In [344]:
import pandas as pd
from zipfile import ZipFile
import numpy as np

### Dataset

In this homework, we will use the Bank Marketing dataset. Download it from [here](https://archive.ics.uci.edu/static/public/222/bank+marketing.zip).

Or you can do it with `wget`:

```bash
wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
```

We need to take `bank/bank-full.csv` file from the downloaded zip-file.  
In this dataset our desired target for classification task will be `y` variable - has the client subscribed a term deposit or not. 

In [345]:
data = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

In [346]:
!wget $data

--2024-10-14 11:24:45--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [            <=>     ] 999.85K   215KB/s    in 4.8s    

2024-10-14 11:24:51 (208 KB/s) - ‘bank+marketing.zip’ saved [1023843]



In [347]:
zf = ZipFile('bank+marketing.zip', 'r')
zf.extractall('bank+marketing')
zf.close()

In [352]:
zf = ZipFile('bank+marketing/bank.zip', 'r')
zf.extractall('bank')
zf.close()

In [433]:
import pandas as pd
import numpy as np

In [434]:
df = pd.read_csv("bank/bank-full.csv", sep = ";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Features

For the rest of the homework, you'll need to use only these columns:

* `age`,
* `job`,
* `marital`,
* `education`,
* `balance`,
* `housing`,
* `contact`,
* `day`,
* `month`,
* `duration`,
* `campaign`,
* `pdays`,
* `previous`,
* `poutcome`,
* `y`

### Data preparation

* Select only the features from above.
* Check if the missing values are presented in the features.

In [435]:
features = ['age','job','marital','education','balance','housing','contact','day','month','duration','campaign',
            'pdays','previous','poutcome','y']
df_new = df[features].copy()

In [436]:
df_new.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [437]:
(df_new=="-").sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [438]:
df_new.columns = df_new.columns.str.lower().str.replace(' ','_')

In [439]:
df_new.dtypes[df_new.dtypes=='object'].index

Index(['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome', 'y'],
      dtype='object')

In [440]:
categorical_columns = list(df_new.dtypes[df_new.dtypes=='object'].index)

for c in categorical_columns:
    df_new[c]=df_new[c].str.lower().str.replace(' ','_')
categorical_columns.remove('y')

In [441]:
df_new.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


### Question 1

What is the most frequent observation (mode) for the column `education`?

- `unknown`
- `primary`
- `secondary`
- `tertiary`


In [442]:
print(f"The most most frequent observation (mode) for the column 'education' is '{df['education'].mode()[0]}'.")

The most most frequent observation (mode) for the column 'education' is 'secondary'.


### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `age` and `balance`
- `day` and `campaign`
- `day` and `pdays`
- `pdays` and `previous`

In [443]:
categorical_columns

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [444]:
numerical_columns = list(set(features)-set(categorical_columns))
numerical_columns.remove('y')

In [445]:
corr_matrix = df_new[numerical_columns].corr()
corr_matrix

Unnamed: 0,pdays,age,campaign,balance,previous,duration,day
pdays,1.0,-0.023758,-0.088628,0.003435,0.45482,-0.001565,-0.093044
age,-0.023758,1.0,0.00476,0.097783,0.001288,-0.004648,-0.00912
campaign,-0.088628,0.00476,1.0,-0.014578,-0.032855,-0.08457,0.16249
balance,0.003435,0.097783,-0.014578,1.0,0.016674,0.02156,0.004503
previous,0.45482,0.001288,-0.032855,0.016674,1.0,0.001203,-0.05171
duration,-0.001565,-0.004648,-0.08457,0.02156,0.001203,1.0,-0.030206
day,-0.093044,-0.00912,0.16249,0.004503,-0.05171,-0.030206,1.0


In [446]:
corr_matrix['age']['balance'], corr_matrix['day']['campaign'], corr_matrix['day']['pdays'], corr_matrix['pdays']['previous']

(0.09778273937134807,
 0.16249021632619218,
 -0.0930440737729405,
 0.4548196354805043)

In [447]:
s = corr_matrix.unstack()
s.sort_values(kind='quicksort')

day       pdays      -0.093044
pdays     day        -0.093044
campaign  pdays      -0.088628
pdays     campaign   -0.088628
campaign  duration   -0.084570
duration  campaign   -0.084570
previous  day        -0.051710
day       previous   -0.051710
previous  campaign   -0.032855
campaign  previous   -0.032855
duration  day        -0.030206
day       duration   -0.030206
pdays     age        -0.023758
age       pdays      -0.023758
balance   campaign   -0.014578
campaign  balance    -0.014578
day       age        -0.009120
age       day        -0.009120
          duration   -0.004648
duration  age        -0.004648
pdays     duration   -0.001565
duration  pdays      -0.001565
previous  duration    0.001203
duration  previous    0.001203
age       previous    0.001288
previous  age         0.001288
balance   pdays       0.003435
pdays     balance     0.003435
balance   day         0.004503
day       balance     0.004503
campaign  age         0.004760
age       campaign    0.004760
balance 

In [448]:
print(f"The features 'pdays' and 'previous' have the biggest correlation: {corr_matrix['pdays']['previous']}")

The features 'pdays' and 'previous' have the biggest correlation: 0.4548196354805043


### Target encoding

* Now we want to encode the `y` variable.
* Let's replace the values `yes`/`no` with `1`/`0`.

In [449]:
df_new.y = (df_new.y=='yes').astype(int)
df_new.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.

In [450]:
from sklearn.model_selection import train_test_split

In [451]:
df_full_train, df_test = train_test_split(df_new, test_size=0.2, random_state=42)

In [452]:
len(df_full_train), len(df_test)

(36168, 9043)

In [453]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [454]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [455]:
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [456]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [457]:
del df_train['y']
del df_val['y']
del df_test['y']

In [458]:
df_full_train.reset_index(drop=True, inplace=True)

### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
  
- `contact`
- `education`
- `housing`
- `poutcome`


In [459]:
from sklearn.metrics import mutual_info_score

In [460]:
def mutual_info_y_scores(series):
    return mutual_info_score(series,df_full_train.y)

In [461]:
mi = df_full_train[categorical_columns].apply(mutual_info_y_scores)
mi.sort_values(ascending=False)

poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
education    0.002458
marital      0.002019
dtype: float64

In [462]:
print(f"The variable 'poutcome' has the biggest mutual information score of {mi.poutcome}.")

The variable 'poutcome' has the biggest mutual information score of 0.02925655626647966.


### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [463]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [464]:
dv = DictVectorizer(sparse = False)

In [465]:
dicts_train = df_train.to_dict(orient='records')

In [466]:
X_train = dv.fit_transform(dicts_train)

In [467]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [468]:
dicts_val = df_val.to_dict(orient='records')

In [469]:
X_val = dv.transform(dicts_val)

In [470]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.01384133, 0.00991416, 0.15855571, ..., 0.05156528, 0.0091288 ,
       0.27121895])

In [471]:
subcription_decision = (y_pred>=0.5).astype(int)

In [472]:
original_accuracy = (subcription_decision == y_val).mean()

In [473]:
original_accuracy

0.9010174740101747

In [474]:
print(f"The accuracy of the model is {round(original_accuracy,2)}")

The accuracy of the model is 0.9


### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `age`
- `balance`
- `marital`
- `previous`

> **Note**: The difference doesn't have to be positive.

In [475]:
from decimal import Decimal
original_accuracy_decimal = Decimal(original_accuracy)

In [493]:
accuracy_diff_list = []
columns = ['age', 'balance', 'marital', 'previous']
for c in columns:
    df_train_drop = df_train.copy()
    df_train_drop = df_train_drop.drop(c,axis=1)
    dicts_train_drop = df_train_drop.to_dict(orient = 'records')
    dv_drop = DictVectorizer(sparse=False)
    X_train_drop = dv_drop.fit_transform(dicts_train_drop)

    model_drop = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_drop.fit(X_train_drop,y_train)

    df_val_drop = df_val.copy()
    df_val_drop = df_val_drop.drop(c, axis = 1)
    dicts_val_drop = df_val_drop.to_dict(orient = 'records')
    X_val_drop = dv_drop.transform(dicts_val_drop)

    y_pred_drop = model_drop.predict_proba(X_val_drop)[:,1]
    subscription_decision_drop = (y_pred_drop>=0.5).astype(int)

    accuracy_drop = (subscription_decision_drop==y_val).mean()
    print(accuracy_drop)
    accuracy_diff_list.append(abs(Decimal(original_accuracy)-Decimal(accuracy_drop)))

0.9007962840079629
0.9007962840079629
0.9011280690112807
0.900575094005751


In [494]:
accuracy_diff_list

[Decimal('0.0002211900022118795661185686185'),
 Decimal('0.0002211900022118795661185686185'),
 Decimal('0.0001105950011059952942105155671'),
 Decimal('0.0004423800044237591322371372371')]

In [495]:
print(f"The feature with the smallest difference is {columns[2]} with difference of {accuracy_diff_list[2]}")

The feature with the smallest difference is marital with difference of 0.0001105950011059952942105155671


### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.

In [503]:
C = [0.01, 0.1, 1, 10, 100]

dv_reg = DictVectorizer(sparse=False)

df_train_reg = df_train.copy()
dicts_train_reg = df_train_reg.to_dict(orient = 'records')
X_train_reg = dv_reg.fit_transform(dicts_train_reg)

df_val_reg = df_val.copy()
dicts_val_reg = df_val_reg.to_dict(orient = 'records')
X_val_reg = dv_reg.transform(dicts_val_reg)

accuracy_reg_list=[]
for c in C:
    model_reg = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model_reg.fit(X_train_reg,y_train)

    y_pred_reg = model_reg.predict_proba(X_val_reg)[:,1]
    subscription_decision_reg = (y_pred_reg>0.5).astype(int)

    accuracy_reg = (subscription_decision_reg==y_val).mean()
    accuracy_reg_list.append(round(accuracy_reg,3))

In [504]:
accuracy_reg_list

[0.898, 0.901, 0.901, 0.901, 0.901]

In [502]:
max(accuracy_reg_list)

0.901

In [508]:
print(f"C = {C[1]} is the smallest c which leads to the best accuracy on the validation set")

C = 0.1 is the smallest c which leads to the best accuracy on the validation set
