# Import Modules and Create Pandas Dataframe

In [158]:
# Import modules
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Load the semicolon-delimited dataset and discard the two columns
# ('default' and 'loan') that the homework does not use.
df = pd.read_csv('bank/bank-full.csv', sep=';').drop(columns=['default', 'loan'])

# Report the dimensions: the recorded run shows 45,211 rows and 15 columns.
print(df.shape)
# Transpose the first five rows so each column is displayed as a row.
df.head().T

(45211, 15)


Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


In [159]:
# Count the null entries per column; the recorded output is 0 for every
# column, i.e. the dataset has no missing values.
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [160]:
# Inspect the dtype of each column to tell numeric (int64) features apart
# from categorical (object) ones.
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

## Question 1: What is the most frequent observation (mode) for the `education` column?

In [161]:
# Compute the mode of the 'education' column only (df.mode() would compute it
# for every column), and evaluate it outside the f-string: reusing the
# enclosing quote character inside an f-string replacement field is a
# SyntaxError on Python versions before 3.12.
education_mode = df['education'].mode().iloc[0]
print(f'Mode of the education column is "{education_mode}"')

Mode of the education column is "secondary"


## Question 2: Create the correlation matrix for the numerical features in your dataset.
### Compute the correlation coefficient between every pair of features.

In [162]:
# Feature names grouped by type; the target column 'y' appears in neither list.
numerical = 'age balance day duration campaign pdays previous'.split()
categorical = 'job marital education housing contact month poutcome'.split()

In [163]:
# Pairwise correlation coefficients between the numeric features.
# Pairs of interest, read from the resulting table:
#   age   & balance:   0.097783
#   day   & campaign:  0.162490
#   day   & pdays:    -0.093044
#   pdays & previous:  0.454820  <- largest correlation among these pairs
corr_matrix = df.loc[:, numerical].corr()
corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


### Target Encoding

In [164]:
# Target Encoding: convert the target column yes/no to 1/0.
# The guard makes the cell idempotent: mapping an already-encoded column a
# second time would match none of the keys and silently turn every value
# into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})

In [166]:
# Sanity check: after encoding, the target should contain only 0 and 1.
df.y.unique()

array([0, 1])

### Split the data with seed=42

In [170]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state fixes the shuffle
# so the split is reproducible.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [175]:
# Split the remaining rows into training and validation sets.
# NOTE(review): 20% of the remaining 80% yields a 64%/16%/20%
# train/val/test split overall; if a 60%/20%/20% split was intended,
# test_size would need to be 0.25 here - confirm against the assignment.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.2,
    random_state=42,
)

In [176]:
# Extract the target of each split as a NumPy array.
y_train = df_train['y'].to_numpy()
y_val = df_val['y'].to_numpy()

In [177]:
# Remove the target from the feature frames so it cannot leak into the model.
df_train = df_train.drop(columns='y')
df_val = df_val.drop(columns='y')

In [178]:
# Report the (rows, columns) of every split. The train/val frames have one
# column fewer because their target column was removed.
print(
    'Size of datasets (row,col): '
    f'training_full dataset - {df_train_full.shape}, '
    f'train - {df_train.shape}, '
    f'val: {df_val.shape}, '
    f'test: {df_test.shape}'
)

Size of datasets (row,col): training_full dataset - (36168, 15), train - (28934, 14), val: (7234, 14), test: (9043, 15)


## Question 3: Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only. Round the scores to 2 decimals.

In [182]:
# Mutual information between each categorical feature and the target,
# computed on the training split only (df_train / y_train) as the question
# requires. The earlier version scored against df_train_full, which still
# contains the validation rows.
# In the recorded run (on df_train_full) 'poutcome' had the biggest score;
# re-run this cell to refresh the table for the training split.
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    """Return the mutual information between `series` and `y_train`, rounded to 2 decimals.

    `series` must be a column of `df_train` so that its rows line up
    positionally with `y_train`.
    """
    return round(mutual_info_score(series, y_train), 2)

df_mi = df_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
poutcome,0.03
month,0.02
job,0.01
housing,0.01
contact,0.01
marital,0.0
education,0.0


## Question 4: Logistic Regression - Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

### One-Hot Encoding of categorical variables

In [183]:
# Turn the training features into one dict per row (column name -> value),
# the input format DictVectorizer expects.
train_dict = df_train.loc[:, categorical + numerical].to_dict(orient='records')

In [185]:
# Use DictVectorizer
from sklearn.feature_extraction import DictVectorizer
# Learn the feature vocabulary from the training records only.
# sparse=False requests a dense array from transform().
dv = DictVectorizer(sparse=False)
dv.fit(X=train_dict)

In [189]:
# Encode the training records into the feature matrix using the vocabulary
# fitted above.
X_train = dv.transform(X=train_dict)

In [190]:
# Hand-written logistic regression scoring, kept for reference. These helpers
# are not called elsewhere in the visible notebook; the model that is trained
# below is scikit-learn's LogisticRegression.
import math
def sigmoid(score):
    """Map a real-valued score to a probability in [0, 1].

    Uses the algebraically equivalent form exp(s) / (1 + exp(s)) for negative
    scores so that math.exp never receives a large positive argument; the
    naive 1 / (1 + exp(-s)) raises OverflowError for very negative scores.
    """
    if score >= 0:
        return 1/(1 + math.exp(-score))
    exp_score = math.exp(score)
    return exp_score/(1 + exp_score)

def logistic_regression(xi, weights=None, intercept=None):
    """Return the predicted probability for a single feature vector `xi`.

    Parameters
    ----------
    xi : indexable sequence of numbers
        Feature values of one observation.
    weights : indexable sequence of numbers, optional
        One weight per feature. When omitted, the module-level `w` is used
        and the first `n` features are scored (the original behaviour, which
        requires the globals `w` and `n` to exist).
    intercept : number, optional
        Bias term. When omitted, the module-level `bias` is used.

    Returns
    -------
    float
        sigmoid(intercept + sum_j xi[j] * weights[j]).
    """
    if weights is None:
        # Backward-compatible path: rely on the notebook-level globals.
        weights, n_features = w, n
    else:
        n_features = len(weights)
    if intercept is None:
        intercept = bias

    score = intercept
    for j in range(n_features):
        score = score + xi[j]*weights[j]
    prob = sigmoid(score)
    return prob

In [191]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier on the encoded training data.
# random_state is fixed for reproducibility.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [193]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only - the vocabulary is not re-fitted here).
val_dict = df_val.loc[:, categorical + numerical].to_dict(orient='records')
X_val = dv.transform(X=val_dict)

In [195]:
# Keep column 1 of the predicted probabilities for the validation rows -
# presumably the probability of the positive class (y == 1); confirm
# against model.classes_.
y_pred = model.predict_proba(X_val)[:,1]

In [197]:
# Threshold the probabilities at 0.5 to obtain boolean class predictions.
array = np.greater_equal(y_pred, 0.5)

In [201]:
# Measure accuracy: the share of validation rows whose thresholded prediction
# equals the true label, rounded to 2 decimals.
print(f'Accuracy is {round(np.mean(y_val == array),2)}')

Accuracy is 0.9
