In [6]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

import zipfile

In [7]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-11 13:01:26--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: 'bank+marketing.zip.2'

bank+marketing.zip.     [    <=>             ] 999.85K  1.03MB/s    in 0.9s    

2024-10-11 13:01:28 (1.03 MB/s) - 'bank+marketing.zip.2' saved [1023843]



In [8]:
# unziping file

with zipfile.ZipFile('bank+marketing.zip.2', 'r') as zip_ref:
    zip_ref.extractall('bank-marketing')

In [9]:
# listing all files in the extracted folder to check name

extracted_files = os.listdir('bank-marketing')
print(extracted_files)

['bank.zip', 'bank-additional.zip']


In [10]:
# Unzip bank.zip
with zipfile.ZipFile('bank-marketing/bank.zip', 'r') as zip_ref:
    zip_ref.extractall('bank-marketing')

# Unzip bank-additional.zip
with zipfile.ZipFile('bank-marketing/bank-additional.zip', 'r') as zip_ref:
    zip_ref.extractall('bank-marketing')

In [11]:
# Listing all files in the directory after unzipping
extracted_files = os.listdir('bank-marketing')
print(extracted_files)

['bank-names.txt', 'bank.csv', 'bank.zip', 'bank-additional', 'bank-full.csv', '__MACOSX', 'bank-additional.zip']


In [12]:
# Loading the dataset and checking first rows

df = pd.read_csv('bank-marketing/bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [15]:
# checking the data tyeps

df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [16]:
# List of columns to keep
selected_columns = [
    'age', 'job', 'marital', 'education', 'balance', 
    'housing', 'contact', 'day', 'month', 'duration', 
    'campaign', 'pdays', 'previous', 'poutcome', 'y'
]


In [17]:
# Select only the specified columns
df_selected = df[selected_columns]

In [18]:
df_selected.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Question 1

What is the most frequent observation (mode) for the column education?

In [19]:
# Find the most frequent observation (mode) for the 'education' column
df_selected['education'].mode()[0]

'secondary'

# Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [20]:
# Select only the numerical features
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [21]:
# Compute the correlation matrix
correlation_matrix = df_selected[numerical_features].corr()

In [22]:
# Display the correlation matrix
print(correlation_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [23]:
# Find the pair of features with the highest correlation (excluding self-correlation)
corr_pairs = correlation_matrix.unstack().sort_values(ascending=False)

In [24]:
# Exclude self-correlation (correlation of 1)
corr_pairs = corr_pairs[corr_pairs < 1]

In [26]:
# Get the pair with the highest correlation
highest_corr = corr_pairs.idxmax()
print("The two features with the highest correlation are:", highest_corr)

The two features with the highest correlation are: ('previous', 'pdays')


# Target encoding

Now we want to encode the y variable.
Let's replace the values yes/no with 1/0

In [27]:
# Encode the 'y' variable: replace 'yes' with 1 and 'no' with 0
df_selected['y'] = df_selected['y'].replace({'yes': 1, 'no': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['y'] = df_selected['y'].replace({'yes': 1, 'no': 0})


In [28]:
# Verify the changes
print(df_selected['y'].head())

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64


# Split the data

Split your data in train/val/test sets with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value y is not in your dataframe.

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Separate the features and target
X = df_selected.drop(columns=['y'])  # Features
y = df_selected['y']                 # Target

In [31]:
# Set the random seed
random_seed = 42

In [32]:
# First split: 60% training and 40% temporary (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=random_seed)

In [33]:
# Second split: 20% validation and 20% test (from the 40% temporary set)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=random_seed)

In [34]:
# Check the size of the splits
print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Test set size:", X_test.shape)

Training set size: (27126, 14)
Validation set size: (9042, 14)
Test set size: (9043, 14)


# Question 3

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

- contact
- education
- housing
- poutcome

In [35]:
from sklearn.feature_selection import mutual_info_classif

In [36]:
# List of categorical features
categorical_features = ['contact', 'education', 'housing', 'poutcome']

In [37]:
# Calculate mutual information scores for categorical features
mi_scores = mutual_info_classif(X_train[categorical_features], y_train, discrete_features=True)

ValueError: could not convert string to float: 'unknown'

### I got this error since the mutual_info_classif function requires numerical data, and the categorical features are still in string format. To resolve this, I need to encode the categorical variables before calculating the mutual information score.

In [38]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [39]:
# Encode categorical features
encoded_X_train = X_train[categorical_features].apply(LabelEncoder().fit_transform)

In [40]:
# Calculate mutual information scores for the encoded categorical features
mi_scores = mutual_info_classif(encoded_X_train, y_train, discrete_features=True)

In [41]:
# Round the scores to 2 decimals
mi_scores = [round(score, 2) for score in mi_scores]

In [42]:
# Create a dictionary to pair feature names with their mutual information scores
mi_scores_dict = dict(zip(categorical_features, mi_scores))


In [43]:
# Print the mutual information scores
print("Mutual Information Scores:")
for feature, score in mi_scores_dict.items():
    print(f"{feature}: {score}")

Mutual Information Scores:
contact: 0.01
education: 0.0
housing: 0.01
poutcome: 0.03


In [44]:
# Find the feature with the highest mutual information score
most_informative_feature = max(mi_scores_dict, key=mi_scores_dict.get)
print("\nThe variable with the biggest mutual information score is:", most_informative_feature)


The variable with the biggest mutual information score is: poutcome


Question 4

- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
- What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [46]:
# Step 1: One-hot encoding for categorical features
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [47]:
# Define the preprocessor: one-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'  # Keep the numerical features as they are
)

In [48]:
# Step 2: Create the logistic regression pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

In [49]:
# Step 3: Fit the model on the training data
model.fit(X_train, y_train)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['job', 'marital',
                                                   'education', 'housing',
                                                   'contact', 'month',
                                                   'poutcome'])])),
                ('classifier',
                 LogisticRegression(max_iter=1000, random_state=42,
                                    solver='liblinear'))])

In [50]:
# Step 4: Make predictions on the validation set
y_pred = model.predict(X_val)

In [51]:
# Step 5: Calculate the accuracy on the validation dataset
accuracy = accuracy_score(y_val, y_pred)

In [52]:
# Round the accuracy to 2 decimal places
accuracy = round(accuracy, 2)
print("Accuracy on the validation set:", accuracy)

Accuracy on the validation set: 0.9


Question 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?

- age
- balance
- marital
- previous

Note: The difference doesn't have to be positive.

In [53]:
# Step 1: Train the model with all features and get the original accuracy
original_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

In [54]:
# Fit the model on the training data
original_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['job', 'marital',
                                                   'education', 'housing',
                                                   'contact', 'month',
                                                   'poutcome'])])),
                ('classifier',
                 LogisticRegression(max_iter=1000, random_state=42,
                                    solver='liblinear'))])

In [55]:
# Calculate the original accuracy on the validation set
original_accuracy = accuracy_score(y_val, original_model.predict(X_val))

In [56]:
# Step 2: Initialize a dictionary to store accuracy differences for each feature
accuracy_differences = {}

In [57]:
# Step 3: Evaluate the model excluding each feature one by one
for feature in X_train.columns:
    # Exclude the current feature
    X_train_excluded = X_train.drop(columns=[feature])
    X_val_excluded = X_val.drop(columns=[feature])
    
    # Update the preprocessor to fit only the remaining features
    temp_preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), [col for col in categorical_features if col != feature])
        ],
        remainder='passthrough'  # Keep the numerical features as they are, except the excluded one
    )
    
    # Train a new model without the current feature
    model_excluded = Pipeline(steps=[
        ('preprocessor', temp_preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])
    
    # Fit the model on the training data with the excluded feature
    model_excluded.fit(X_train_excluded, y_train)
    
    # Calculate the accuracy without the feature
    accuracy_without_feature = accuracy_score(y_val, model_excluded.predict(X_val_excluded))
    
    # Calculate the difference from the original accuracy
    accuracy_difference = original_accuracy - accuracy_without_feature
    accuracy_differences[feature] = round(accuracy_difference, 2)

In [58]:
# Step 4: Find the feature with the smallest difference
least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)

In [59]:
print("Differences in accuracy without each feature:", accuracy_differences)
print("\nThe feature with the smallest difference is:", least_useful_feature)

Differences in accuracy without each feature: {'age': 0.0, 'job': 0.0, 'marital': -0.0, 'education': 0.0, 'balance': 0.0, 'housing': -0.0, 'contact': 0.0, 'day': -0.0, 'month': -0.0, 'duration': 0.01, 'campaign': 0.0, 'pdays': 0.0, 'previous': 0.0, 'poutcome': 0.01}

The feature with the smallest difference is: age


Question 6

- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
- Which of these C leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

Note: If there are multiple options, select the smallest C.

In [60]:
# List of C values to try
C_values = [0.01, 0.1, 1, 10, 100]

In [61]:
# Dictionary to store accuracies for each C
accuracy_scores = {}

In [62]:
# Step 1: Train and evaluate models for each C value
for C in C_values:
    # Create a logistic regression model with the current C value
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42))
    ])
    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    
    # Calculate the accuracy on the validation set
    accuracy = accuracy_score(y_val, y_pred)
    
    # Round the accuracy to 3 decimal places
    accuracy_rounded = round(accuracy, 3)
    
    # Store the accuracy in the dictionary
    accuracy_scores[C] = accuracy_rounded

In [63]:
# Step 2: Find the C value with the best accuracy
best_C = max(accuracy_scores, key=accuracy_scores.get)

In [64]:
# If multiple C values have the same accuracy, choose the smallest one
best_C = min([c for c in accuracy_scores if accuracy_scores[c] == accuracy_scores[best_C]])


In [65]:
# Print the results
print("Accuracies for each C value:", accuracy_scores)
print("\nThe best C value is:", best_C)

Accuracies for each C value: {0.01: 0.899, 0.1: 0.9, 1: 0.901, 10: 0.901, 100: 0.901}

The best C value is: 1
