In [457]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction  import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [458]:
data = pd.read_csv('bank-full.csv')

In [459]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [460]:
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

# Data Preparation

In [461]:
df = pd.DataFrame()
df = data[['age', 'job', 'marital', 'education', 'balance', 'housing',
        'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y']]

In [462]:
df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [463]:
df.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')

In [464]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Question 1
What is the most frequent observation (mode) for the column education?

In [465]:
df['education'].mode()

0    secondary
Name: education, dtype: object

## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

In [466]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [467]:
(df.dtypes == 'int64').sum()

7

In [468]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [469]:
corr_matrix = df[numerical].corr()
print(corr_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


## Target encoding

In [470]:
df['y']

0         no
1         no
2         no
3         no
4         no
        ... 
45206    yes
45207    yes
45208    yes
45209     no
45210     no
Name: y, Length: 45211, dtype: object

In [471]:
df['y'].value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [472]:
df['y'] = df['y'].str.replace('yes', '1')
df['y'] = df['y'].str.replace('no', '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y'].str.replace('yes', '1')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y'].str.replace('no', '0')


In [473]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [474]:
df['y'] = df['y'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y'].astype(int)


In [475]:
df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

In [476]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int32
dtype: object

## Split the data

In [477]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [478]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

In [479]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [480]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [481]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [482]:
del df_train['y']
del df_val['y']
del df_test['y']

## Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).

In [483]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int32
dtype: object

In [484]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [485]:
df_full_train['y'].value_counts()

y
0    31970
1     4198
Name: count, dtype: int64

In [486]:
mutual_info_score(df_full_train.age, df_full_train.y)

0.013488488870579372

In [487]:
def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

In [488]:
mi = df_full_train[categorical].apply(mutual_info_y_score)
mi.sort_values(ascending = False)

poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
education    0.002458
marital      0.002019
dtype: float64

## Question 4
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

## OnhotEncoding

In [489]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')

In [490]:
train_dicts[0]

{'job': 'technician',
 'marital': 'single',
 'education': 'tertiary',
 'housing': 'yes',
 'contact': 'cellular',
 'month': 'aug',
 'poutcome': 'unknown',
 'age': 32,
 'balance': 1100,
 'day': 11,
 'duration': 67,
 'campaign': 1,
 'pdays': -1,
 'previous': 0}

In [491]:
dv = DictVectorizer(sparse=False)

In [492]:
X_train = dv.fit_transform(train_dicts)

In [493]:
X_train.shape

(27126, 47)

In [494]:
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

## Logistic Regression

In [495]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [496]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.0125799 , 0.00939575, 0.14708588, ..., 0.04843197, 0.00881176,
       0.27923652])

In [497]:
model_decision = (y_pred >= 0.5)

In [498]:
model_decision.astype(int)

array([0, 0, 0, ..., 0, 0, 0])

## Accuracy

In [499]:
(y_val == model_decision).mean().round(2)

0.9

## Question 5

In [500]:
original_acc = (y_val == model_decision).mean()
original_acc

0.9015704490157045

### Without age

In [501]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month',
                'poutcome']
numerical = ['balance', 'day', 'duration', 'campaign',
              'pdays', 'previous']

In [502]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [503]:
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:,1]
model_decision = (y_pred >= 0.5)

In [504]:
new_acc = (y_val == model_decision).mean()
new_acc

0.9007962840079629

In [505]:
acc = original_acc - new_acc
acc

0.000774165007741634

### Without balance

In [506]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month',
                'poutcome']
numerical = ['age', 'day', 'duration', 'campaign',
              'pdays', 'previous']

In [507]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [508]:
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:,1]
model_decision = (y_pred >= 0.5)

In [509]:
new_acc = (y_val == model_decision).mean()
new_acc

0.9007962840079629

In [510]:
acc = original_acc - new_acc
acc

0.000774165007741634

### Without marital

In [511]:
categorical = ['job', 'education', 'housing', 'contact', 'month',
                'poutcome']
numerical = ['age', 'balance', 'day', 'duration', 'campaign',
              'pdays', 'previous']

In [512]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [513]:
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:,1]
model_decision = (y_pred >= 0.5)

In [514]:
new_acc = (y_val == model_decision).mean()
new_acc

0.9010174740101747

In [515]:
acc = original_acc - new_acc
acc

0.0005529750055297544

### Without previous

In [516]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month',
                'poutcome']
numerical = ['age', 'balance', 'day', 'duration', 'campaign',
              'pdays']

In [517]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [518]:
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:,1]
model_decision = (y_pred >= 0.5)

In [519]:
new_acc = (y_val == model_decision).mean()
new_acc

0.9007962840079629

In [520]:
acc = original_acc - new_acc
acc

0.000774165007741634

## Question 6
- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

In [521]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month',
                'poutcome']
numerical = ['age', 'balance', 'day', 'duration', 'campaign',
              'pdays', 'previous']

In [522]:
C = [0.01, 0.1, 1, 10, 100]
l=[]
for c in C:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:,1]
    model_decision = (y_pred >= 0.5)
    l.append((c, (y_val == model_decision).mean().round(3)))
print(max(l))
    

(100, 0.901)
