In [1]:
import numpy as np
import pandas as pd


In [3]:
# Read the semicolon-separated file `bank-full.csv` into `df`.
file_path = "bank-full.csv"
df = pd.read_csv(filepath_or_buffer=file_path, delimiter=";")

In [11]:
# Columns kept for the analysis; any column of `df` not listed here is left
# out of `df_selected`.
columns_to_use = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y',
]
df_selected = df[columns_to_use]

# Per-column count of missing values. The result is stored in
# `missing_values`; this cell does not display it.
missing_values = df_selected.isna().sum()

# The cell output is the original, unfiltered frame.
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [13]:
# Narrow `df` to the analysis columns. Reuses `columns_to_use` (defined in
# the column-selection cell above) instead of repeating the same 15 names, so
# `df` and `df_selected` cannot drift apart if the list is edited.
df = df[columns_to_use]

In [None]:
Question 1: What is the most frequent observation (mode) for the column education?

In [21]:
# Most frequent value of `education`. `mode()` returns a Series rather than a
# scalar because several values can tie for most frequent.
df_selected.loc[:, 'education'].mode()


0    secondary
Name: education, dtype: object

In [None]:
Question 2: What are the two features that have the biggest correlation?

In [27]:

# Restrict the analysis to the numeric columns.
numerical_features = df_selected[['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']]

# Pairwise correlation between every pair of numeric columns.
correlation_matrix = numerical_features.corr()

print("Correlation matrix:")
print(correlation_matrix)

# Keep each unordered pair exactly once by masking everything except the
# strict upper triangle. Self-correlations are removed by position, so two
# distinct features that happen to be perfectly correlated are not discarded
# the way a `< 1` value filter would discard them.
is_upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
# `dropna()` removes the masked cells regardless of whether `stack()` itself
# drops NaN in the installed pandas version.
correlation_pairs = correlation_matrix.where(is_upper_triangle).stack().dropna()

# Rank by magnitude so that a strong negative correlation can win, but report
# the signed coefficient of the winning pair.
highest_correlation = correlation_pairs.abs().idxmax()
highest_correlation_value = correlation_pairs.loc[highest_correlation]

print(f"The two features with the biggest correlation are: {highest_correlation} with a correlation of {highest_correlation_value:.2f}")

Correlation matrix:
               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
The two features with the biggest correlation are: ('pdays', 'previous') with a correlation of 0.45


In [None]:
Encode the target variable: convert `y` from "yes"/"no" to 1/0

In [29]:
# Binarize the target: "yes" -> 1, every other value -> 0.
# The dtype guard makes the cell idempotent: once `y` is numeric, comparing
# it with "yes" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df = df.assign(y=(df["y"] == "yes").astype(int))

In [39]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: 20% of `df` is held out as the test set,
# then 25% of the remaining rows become the validation set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# For each split, pull the target out as a NumPy array and then remove the
# target column from the feature frame.
y_full_train = df_full_train["y"].to_numpy()
df_full_train = df_full_train.drop("y", axis=1)

y_train = df_train["y"].to_numpy()
df_train = df_train.drop("y", axis=1)

y_test = df_test["y"].to_numpy()
df_test = df_test.drop("y", axis=1)

y_val = df_val["y"].to_numpy()
df_val = df_val.drop("y", axis=1)

In [41]:
from sklearn.metrics import mutual_info_score

# Mutual information between the training target and each listed column,
# printed in list order and rounded to two decimals.
for col in ("contact", "education", "housing", "poutcome"):
    score = mutual_info_score(y_train, df_train[col])
    print(round(score, 2))

0.01
0.0
0.01
0.03


In [None]:
Question 3: Which variable has the biggest mutual information score?
Answer: `poutcome` (its mutual information score, about 0.03, is the highest of the four columns compared).

In [None]:
Question 4: What accuracy did you get?

In [43]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


# Convert every split to a list of per-row dicts, the input format that
# DictVectorizer consumes.
dicts_full_train, dicts_train, dicts_test, dicts_val = (
    frame.to_dict(orient="records")
    for frame in (df_full_train, df_train, df_test, df_val)
)

# Fit the vectorizer on the training rows only, then apply the same fitted
# mapping to the validation rows.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(dicts_val)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)

# Validation accuracy (share of matching predictions), rounded to 2 decimals.
np.mean(y_pred_val == y_val).round(2)

0.9

In [45]:
# Unrounded validation accuracy of the model trained on all features, used as
# the baseline for later comparisons. Relies on `y_pred_val` still holding the
# baseline predictions, so run this before any later cell reassigns it.
accuracy_all = np.mean(y_pred_val == y_val)

In [None]:
Question 5: Which feature has the smallest difference?
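Answer: `previous` (excluding it leaves the validation accuracy unchanged, difference 0.0; note that `day` ties with it in the results table).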

In [53]:
def _val_accuracy_without(feature):
    """Return validation accuracy of a model trained without `feature`.

    Drops `feature` from `df_train` and `df_val`, one-hot encodes the
    remaining columns with a fresh DictVectorizer fitted on the training rows,
    trains a logistic regression with the same hyperparameters as the
    baseline model, and returns the unrounded share of correct predictions
    on the validation set.

    All intermediate objects are local, so the baseline `dv`, `X_train`,
    `X_val`, `model` and `y_pred_val` globals are not overwritten.
    """
    train_records = df_train.drop(columns=feature).to_dict(orient="records")
    val_records = df_val.drop(columns=feature).to_dict(orient="records")

    vectorizer = DictVectorizer(sparse=False)
    features_train = vectorizer.fit_transform(train_records)
    features_val = vectorizer.transform(val_records)

    classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    classifier.fit(features_train, y_train)

    return (classifier.predict(features_val) == y_val).mean()


# One (feature, accuracy, |accuracy - baseline|) record per excluded feature.
results = []
for feature_to_exclude in df_train.columns:
    accuracy = _val_accuracy_without(feature_to_exclude)
    difference = abs(accuracy - accuracy_all)
    results.append((feature_to_exclude, accuracy, difference))

# Build the summary frame once, after the loop, so it is defined even when
# there are no features and is not rebuilt on every iteration.
df_results = pd.DataFrame(data=results, columns=["excluded feature", "accuracy", "difference"])
   

In [54]:
# Smallest accuracy change first.
df_results.sort_values("difference", ascending=True)

Unnamed: 0,excluded feature,accuracy,difference
7,day,0.901017,0.0
12,previous,0.901017,0.0
1,job,0.900907,0.000111
4,balance,0.90146,0.000442
11,pdays,0.90146,0.000442
6,contact,0.900575,0.000442
3,education,0.900464,0.000553
2,marital,0.900243,0.000774
0,age,0.900133,0.000885
5,housing,0.900022,0.000995


In [None]:
Question 6: Which of these C leads to the best accuracy on the validation set?

In [57]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


# Rebuild the full-feature records and design matrices here so this cell does
# not depend on whatever `dv` / `X_train` / `X_val` an earlier cell left behind.
dicts_full_train = df_full_train.to_dict(orient="records")
dicts_train = df_train.to_dict(orient="records")
dicts_test = df_test.to_dict(orient="records")
dicts_val = df_val.to_dict(orient="records")

# Fit the vectorizer on the training rows only; reuse it for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(dicts_val)

# One (C, validation accuracy rounded to 3 decimals) record per candidate.
results = []

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    accuracy = (y_pred_val == y_val).mean().round(3)

    results.append((c, accuracy))

# Build the summary frame once, after the loop, rather than on every
# iteration; this also keeps it defined if the candidate list is empty.
df_results = pd.DataFrame(data=results, columns=["C", "accuracy"])

In [59]:
# Row with the highest accuracy (the first such row when several tie).
# `idxmax()` returns an index *label*, so it must be paired with the
# label-based `.loc`; `.iloc` only worked while the index was the default
# 0..n-1 range.
df_results.loc[df_results["accuracy"].idxmax()]

C           10.000
accuracy     0.902
Name: 3, dtype: float64