# Homework #3

**Set up the environment**

In [153]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

from sklearn.linear_model import LogisticRegression

**Dataset**

In [156]:
import urllib.request

url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
filename = "bank+marketing.zip"
urllib.request.urlretrieve(url, filename)

('bank+marketing.zip', <http.client.HTTPMessage at 0x25922b97b90>)

In [158]:
df = pd.read_csv("C:/Users/MONSTER/machine-learning-zoomcamp-2024/bank+marketing/bank/bank-full.csv", sep = ";")

In [160]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


**Data Preparation**

In [164]:
# Select the features
base = ["age","job","marital","education","balance","housing","contact","day","month","duration","campaign","pdays","previous","poutcome","y"]
df_new = df[base]

# Check if the missing values are presented in the features
df_new.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [166]:
df_new.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

**Question 1** <br>
What is the most frequent observation (mode) for the column education?

In [169]:
mode_education = df['education'].mode()[0]
print("Most frequent observation for education:", mode_education)

Most frequent observation for education: secondary


**Question 2** <br>
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compete the correlation coeffiecient between every pair of features. <br>
What are two features that have the biggest correlation?

In [202]:
# Selecting only numerical columns
numerical_features = df_new.select_dtypes(include=["int64"])

# Creating the correlation matrix
correlation_matrix = numerical_features.corr()

# Display the correlation matrix
print(correlation_matrix)

# Find the two features with the highest correlation (ignoring self-correlation)
correlation_pairs = correlation_matrix.unstack().sort_values(kind = "quicksort", ascending = False)

# Filtering out self-correlation (correlation of a feature with itself)
high_corr = correlation_pairs[correlation_pairs < 1].idxmax()

print("Two features with the highest correlation:", high_corr)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
Two features with the highest correlation: ('previous', 'pdays')


**Target Encoding**

In [175]:
df_new["y"]

0         no
1         no
2         no
3         no
4         no
        ... 
45206    yes
45207    yes
45208    yes
45209     no
45210     no
Name: y, Length: 45211, dtype: object

In [177]:
# Encode y variable
# Replace 'yes' with 1 and 'no' with 0 in the target column 'y'
df_new.loc[:, "y"] = df_new["y"].replace({"yes": 1, "no": 0})
df_new["y"]

  df_new.loc[:, "y"] = df_new["y"].replace({"yes": 1, "no": 0})


0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: object

**Split the data**

In [180]:
# Separate the features (X) and the target (y)
X = df_new.drop(columns=['y'])
y = df_new['y']

# First split: 60% train, 40% (to be split further into val and test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split: 50% of the temp set goes to val and 50% to test (results in 20% each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

Training set: 27126 samples
Validation set: 9042 samples
Test set: 9043 samples


**Question 3** <br>
* Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?
* contact
* education
* housing
* poutcome

In [183]:
# Define the categorical variables of interest
categorical = X_train.select_dtypes(include=['object']).columns

# Create a function to calculate mutual information
def calculate_mi(series):
    return mutual_info_score(series, y_train)  # Use y_train as the target

# Apply the function to the specified categorical variables
mi_scores = X_train[categorical].apply(calculate_mi)

# Convert the series to a DataFrame and round the scores to 2 decimals
mi_df = mi_scores.to_frame(name='MI Score').round(2)

# Sort the mutual information scores in descending order
mi_df = mi_df.sort_values(by='MI Score', ascending=False)

# Display the results
display(mi_df)

# Get the variable with the highest mutual information score
highest_mi_variable = mi_df.index[0]  # Get the index of the first row (variable name)
highest_mi_score = mi_df['MI Score'].max()  # Get the highest score

# Print the final output in the desired format
print(f"The variable with the highest mutual information score is: {highest_mi_variable} ({highest_mi_score})")

Unnamed: 0,MI Score
poutcome,0.03
month,0.02
job,0.01
housing,0.01
contact,0.01
marital,0.0
education,0.0


The variable with the highest mutual information score is: poutcome (0.03)


**Question 4** <br>
* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  - model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


In [258]:
# Split the data into features (X) and target (y)
X = df_new.drop(columns=['y'])
y = df_new['y']  # Target variable

# Replace 'yes' and 'no' in target variable with 1 and 0
y = y.replace({'yes': 1, 'no': 0})

  y = y.replace({'yes': 1, 'no': 0})


In [260]:
# One hot encoding
X_train_dict = X.to_dict(orient='records')
dv = DictVectorizer(sparse = False) # Initialize DictVectorizer
X_train_dict[0]

{'age': 58,
 'job': 'management',
 'marital': 'married',
 'education': 'tertiary',
 'balance': 2143,
 'housing': 'yes',
 'contact': 'unknown',
 'day': 5,
 'month': 'may',
 'duration': 261,
 'campaign': 1,
 'pdays': -1,
 'previous': 0,
 'poutcome': 'unknown'}

In [262]:
# Fit and transform the training data using DictVectorizer
X_encoded = dv.fit_transform(X_train_dict)
X_encoded

array([[5.800e+01, 2.143e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [4.400e+01, 2.900e+01, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.300e+01, 2.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [7.200e+01, 5.715e+03, 5.000e+00, ..., 1.000e+00, 0.000e+00,
        3.000e+00],
       [5.700e+01, 6.680e+02, 4.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.700e+01, 2.971e+03, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        1.100e+01]])

In [266]:
# Split the data into train and validation sets (60% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model with specified parameters
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Fit the model on the training dataset
model.fit(X_train, y_train)

# Predict on the validation dataset
y_val_pred = model.predict(X_val)

# Calculate accuracy on the validation dataset
accuracy = accuracy_score(y_val, y_val_pred)

# Round the accuracy to 2 decimal digits
rounded_accuracy = round(accuracy, 2)

# Print the accuracy
print(f"Accuracy on the validation dataset: {rounded_accuracy}")

Accuracy on the validation dataset: 0.9


**Question 5** <br>
* Let's find the least useful feature using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?
* age
* balance
* marital
* previous



In [269]:
## Same as in Q4
# Split the data into features (X) and target (y)
X = df_new.drop(columns=['y'])
y = df_new['y']  # Target variable

# Replace 'yes' and 'no' in target variable with 1 and 0
y = y.replace({'yes': 1, 'no': 0})

# One hot encoding
X_train_dict = X.to_dict(orient='records')
dv = DictVectorizer(sparse = False) # Initialize DictVectorizer

# Fit and transform the training data using DictVectorizer
X_encoded = dv.fit_transform(X_train_dict)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Train the model with all features
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)

  y = y.replace({'yes': 1, 'no': 0})


In [275]:
# Calculate the original accuracy
original_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Original accuracy: {original_accuracy}")

# List of specific features to check
features_to_check = ['age', 'balance', 'marital', 'previous']

# List to store accuracy differences
accuracy_differences = []

# Iterate over each feature in the specific list
for feature in features_to_check:
    
    if feature in dv.feature_names_:
        # Drop the specific feature
        X_train_dropped = np.delete(X_train, dv.feature_names_.index(feature), axis=1)
        X_val_dropped = np.delete(X_val, dv.feature_names_.index(feature), axis=1)
    
        # Train model without the feature
        model.fit(X_train_dropped, y_train)
    
        # Predict and calculate accuracy without the feature
        y_val_pred_dropped = model.predict(X_val_dropped)
        accuracy_dropped = accuracy_score(y_val, y_val_pred_dropped)
    
        # Calculate the difference in accuracy
        accuracy_diff = original_accuracy - accuracy_dropped
        accuracy_differences.append((feature, accuracy_diff))

# Sort the features by accuracy difference in ascending order
accuracy_differences.sort(key=lambda x: x[1])

# Display all specified features and their accuracy differences
for feature, diff in accuracy_differences:
    print(f"Feature: {feature}, Accuracy Difference: {diff:.4f}")

# Display the feature with the smallest accuracy difference
print(f"\nThe feature with the smallest accuracy difference: {accuracy_differences[0][0]}")

Original accuracy: 0.8992590954329316
Feature: balance, Accuracy Difference: -0.0001
Feature: age, Accuracy Difference: 0.0000
Feature: previous, Accuracy Difference: 0.0003

The feature with the smallest accuracy difference: balance


**Question 6** <br>
* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?
* 0.01
* 0.1
* 1
* 10
* 100

In [282]:
## Same as in Q4
# Split the data into features (X) and target (y)
X = df_new.drop(columns=['y'])
y = df_new['y']  # Target variable

# Replace 'yes' and 'no' in target variable with 1 and 0
y = y.replace({'yes': 1, 'no': 0})

# One hot encoding
X_train_dict = X.to_dict(orient='records')
dv = DictVectorizer(sparse = False) # Initialize DictVectorizer

# Fit and transform the training data using DictVectorizer
X_encoded = dv.fit_transform(X_train_dict)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Train the model with all features
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)

  y = y.replace({'yes': 1, 'no': 0})


In [284]:
# List of C values to test
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store the accuracy for each value of C
accuracy_dict = {}

# Train a model for each value of C and calculate accuracy
for C in C_values:
    # Create the logistic regression model with the current C
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    
    # Fit the model on the training set
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate the accuracy
    accuracy = accuracy_score(y_val, y_val_pred)
    
    # Store the accuracy in the dictionary
    accuracy_dict[C] = round(accuracy, 3)
    
    print(f"C = {C}: Validation Accuracy = {accuracy:.3f}")

# Find the C value with the highest accuracy
best_C = max(accuracy_dict, key=accuracy_dict.get)
best_accuracy = accuracy_dict[best_C]

# Display the best C value and its accuracy
print(f"\nThe best C value is {best_C}, with an accuracy of {best_accuracy:.3f}")

C = 0.01: Validation Accuracy = 0.898
C = 0.1: Validation Accuracy = 0.899
C = 1: Validation Accuracy = 0.899
C = 10: Validation Accuracy = 0.899
C = 100: Validation Accuracy = 0.899

The best C value is 0.1, with an accuracy of 0.899
