In [41]:
import pandas as pd
import numpy as np
import requests
import zipfile
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [42]:
url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
response = requests.get(url)

# zip-data
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    # open "bank.zip"
    with z.open("bank.zip") as inner_zip:
        # open inner ZIP (bank.zip)
        with zipfile.ZipFile(inner_zip) as bank_zip:
            # open "bank-full.csv" data from "bank.zip"
            with bank_zip.open("bank-full.csv") as csv_file:
                # reading
                df = pd.read_csv(csv_file, sep=";")
                
# show
df.head(2)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no


In [43]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [44]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


In [45]:
# unique value of each column
for col in df.columns:
    print(col)
    print(df[col].nunique())

age
77
job
12
marital
3
education
4
default
2
balance
7168
housing
2
loan
2
contact
3
day
31
month
12
duration
1573
campaign
48
pdays
559
previous
41
poutcome
4
y
2


In [46]:
df_values = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day',
'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df_reduced = df[df_values]
df_reduced.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [47]:
#checking for missing values
missing_values = df_reduced.isnull().sum()
print(missing_values)

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [48]:
# Question 1

In [49]:
# most frequent mode for 'education' 
education_mode = df_reduced['education'].mode()[0]
print(f"The most frequent observation mode for 'education' is: {education_mode}")

The most frequent observation mode for 'education' is: secondary


In [50]:
# Question 2

In [51]:
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [52]:
# correlation with one single feature
df_reduced[numerical_columns].corrwith(df_reduced.age).abs()

age         1.000000
balance     0.097783
day         0.009120
duration    0.004648
campaign    0.004760
pdays       0.023758
previous    0.001288
dtype: float64

In [53]:
# correlation matrix of all the feaatures
correlation_matrix = df_reduced[numerical_columns].corr()
print(correlation_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [54]:
# transform them in series
corr_pairs = correlation_matrix.unstack()

# remove duplikates / self correlation
corr_pairs = corr_pairs[corr_pairs != 1].sort_values(ascending=False)

# two highest corr
highest_corr = corr_pairs.head(1)
print(f"The two features with highest correlation are : {highest_corr}")

The two features with highest correlation are : previous  pdays    0.45482
dtype: float64


In [55]:
# replacing yes and no with 1 and 0
pd.set_option('future.no_silent_downcasting', True)
df_reduced['y'] = df_reduced['y'].replace({'yes': 1, 'no': 0}).astype(int)

# printing unique values and data types
print(df_reduced['y'].dtype) 
print(df_reduced['y'].unique())

int64
[0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['y'] = df_reduced['y'].replace({'yes': 1, 'no': 0}).astype(int)


In [56]:
# splitting dataframe
df_full_train, df_test = train_test_split(df_reduced, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [57]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [58]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [59]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

In [60]:
# Question 3

In [61]:
# mutual information score
features = ['contact', 'education', 'housing', 'poutcome']
for feature in features:
    score = mutual_info_score(y_train, df_train[feature])
    rounded_score = round(score, 2)
    print(f"Mutual Information score between y and {feature}: {rounded_score}")

Mutual Information score between y and contact: 0.01
Mutual Information score between y and education: 0.0
Mutual Information score between y and housing: 0.01
Mutual Information score between y and poutcome: 0.03


In [62]:
mutual_info_score(y_train, df_train.contact)

np.float64(0.013356062198247219)

In [63]:
mutual_info_score(y_train, df_train.education)

np.float64(0.0026967549991295282)

In [64]:
mutual_info_score(y_train, df_train.housing)

np.float64(0.010343105891750026)

In [65]:
mutual_info_score(y_train, df_train.poutcome)

np.float64(0.029532821290436224)

In [66]:
# Question 4

In [67]:
# one hot encoding
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [68]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

print(X_train.shape)  
print(X_val.shape)    

(27126, 47)
(9042, 47)


In [69]:
# fit model on train data set
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [70]:
# accuracy score on val set
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

# rounding
rounded_accuracy = round(accuracy, 2)

print(f"Accuracy on the validation dataset: {rounded_accuracy}")

Accuracy on the validation dataset: 0.9


In [71]:
# Question 5

In [72]:
from copy import deepcopy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Original accuracy with all features
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
original_accuracy = accuracy_score(y_val, y_pred)
print(original_accuracy)

# Store the results
feature_accuracies = {}

# Loop through each feature
for feature in categorical_columns + numerical_columns:
    # Create a copy of the original dataframe without the current feature
    reduced_train = deepcopy(df_train)
    reduced_val = deepcopy(df_val)
    
    # Drop the current feature from both train and val sets
    if feature in reduced_train.columns:  # Ensure the feature is in the DataFrame
        reduced_train = reduced_train.drop(columns=[feature])
        reduced_val = reduced_val.drop(columns=[feature])
    
    # Update the list of columns to avoid accessing removed features
    reduced_categorical_columns = [f for f in categorical_columns if f in reduced_train.columns]
    reduced_numerical_columns = [f for f in numerical_columns if f in reduced_train.columns]

    # Transform the reduced dataset
    train_dict = reduced_train[reduced_categorical_columns + reduced_numerical_columns].to_dict(orient='records')
    X_train_reduced = dv.fit_transform(train_dict)
    
    val_dict = reduced_val[reduced_categorical_columns + reduced_numerical_columns].to_dict(orient='records')
    X_val_reduced = dv.transform(val_dict)
    
    # Train model on reduced dataset
    model.fit(X_train_reduced, y_train)
    y_pred_reduced = model.predict(X_val_reduced)
    
    # Calculate accuracy
    reduced_accuracy = accuracy_score(y_val, y_pred_reduced)
    feature_accuracies[feature] = original_accuracy - reduced_accuracy

# Output the feature differences
for feature, acc_diff in feature_accuracies.items():
    print(f"Feature: {feature}, Accuracy difference: {acc_diff:.4f}")


0.9012386640123866
Feature: job, Accuracy difference: 0.0000
Feature: marital, Accuracy difference: 0.0000
Feature: education, Accuracy difference: 0.0004
Feature: housing, Accuracy difference: 0.0010
Feature: contact, Accuracy difference: 0.0009
Feature: month, Accuracy difference: 0.0014
Feature: poutcome, Accuracy difference: 0.0073
Feature: age, Accuracy difference: 0.0001
Feature: balance, Accuracy difference: 0.0001
Feature: day, Accuracy difference: 0.0001
Feature: duration, Accuracy difference: 0.0117
Feature: campaign, Accuracy difference: 0.0011
Feature: pdays, Accuracy difference: 0.0002
Feature: previous, Accuracy difference: 0.0003


In [73]:
# Question 6

In [74]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

#differnt c values
C = [0.01, 0.1, 1, 10, 100]

for val in C:
    # fit model on train data set
    model = LogisticRegression(solver='liblinear', C=val, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # accuracy score on val set
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    
    # rounding
    rounded_accuracy = round(accuracy, 3)
    
    print(f"Accuracy on the validation dataset with C={val}: {rounded_accuracy}") #c0.1

Accuracy on the validation dataset with C=0.01: 0.898
Accuracy on the validation dataset with C=0.1: 0.901
Accuracy on the validation dataset with C=1: 0.901
Accuracy on the validation dataset with C=10: 0.901
Accuracy on the validation dataset with C=100: 0.9
