# Classification

## Dataset

In [1]:
#http_data = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

In [2]:
#!wget $http_data -O bank-full.csv

--2024-10-14 22:35:24--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank-full.csv’

bank-full.csv           [          <=>       ] 999.85K   382KB/s    in 2.6s    

2024-10-14 22:35:28 (382 KB/s) - ‘bank-full.csv’ saved [1023843]



In [8]:
import pandas as pd

In [13]:
df = pd.read_csv('bank-full.csv', sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## Features

In [16]:
features = ["age","job","marital","education","balance","housing","contact","day","month","duration","campaign","pdays",
            "previous","poutcome", "y"]

## Data preparation

In [17]:
df_new = df[features]
df_new.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [19]:
df_new.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [21]:
df_new.columns = df_new.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df_new.dtypes[df_new.dtypes == 'object'].index)

for c in categorical_columns:
    df_new[c] = df_new[c].str.lower().str.replace(' ', '_')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new[c] = df_new[c].str.lower().str.replace(' ', '_')


## Question 1 

In [22]:
df_new["education"].mode()

0    secondary
Name: education, dtype: object

## Question 2

In [31]:
df_new.corr(numeric_only=True)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


The two features that have the biggest correlation are :  **pdays** and **previous**


## Target encoding

In [34]:
df_new.y = (df_new.y == 'yes').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.y = (df_new.y == 'yes').astype(int)


## Split the data

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
df_full_train, df_test = train_test_split(df_new, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [41]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [42]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']


## Question 3

In [43]:
from sklearn.metrics import mutual_info_score

In [44]:
df_full_train = df_full_train.reset_index(drop=True)

In [47]:
def mutual_info_churn_score(series):
    score = mutual_info_score(series, df_full_train.y)
    return round(score, 2)

In [48]:
categorical = ["contact", "education", "housing", "poutcome"]

In [49]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

poutcome     0.03
contact      0.01
housing      0.01
education    0.00
dtype: float64

## Question 4

### (a) One-Hot Encoder

In [50]:
from sklearn.feature_extraction import DictVectorizer

In [53]:
categorical_columns = list(df_train.dtypes[df_train.dtypes == 'object'].index)
numerical_columns = list(df_train.dtypes[df_train.dtypes != 'object'].index)

In [55]:
numerical_columns

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [56]:
categorical_columns

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [57]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

### (b) Model

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [60]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [61]:
model.fit(X_train, y_train)

In [85]:
y_pred = model.predict_proba(X_val)[:, 1]

In [86]:
decision = (y_pred >= 0.5)

In [87]:
score = (y_val == decision).mean()
float(round(score, 2))

0.9

In [88]:
score

np.float64(0.8999115239991152)

## Question 5

In [82]:
list_features = ["age", "balance", "marital", "previous"]
list_features

['age', 'balance', 'marital', 'previous']

In [75]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'age': np.float64(-0.003),
 'balance': np.float64(0.0),
 'campaign': np.float64(-0.109),
 'contact=cellular': np.float64(0.061),
 'contact=telephone': np.float64(0.032),
 'contact=unknown': np.float64(-1.082),
 'day': np.float64(-0.005),
 'duration': np.float64(0.004),
 'education=primary': np.float64(-0.519),
 'education=secondary': np.float64(-0.286),
 'education=tertiary': np.float64(-0.069),
 'education=unknown': np.float64(-0.115),
 'housing=no': np.float64(-0.065),
 'housing=yes': np.float64(-0.923),
 'pdays': np.float64(-0.001),
 'poutcome=failure': np.float64(-0.732),
 'poutcome=other': np.float64(-0.37),
 'poutcome=success': np.float64(1.48),
 'poutcome=unknown': np.float64(-1.366),
 'previous': np.float64(0.001)}

In [80]:
def calculate_accuracy(feature_to_drop):
    columns = df_train.columns.tolist()
    columns.remove(feature_to_drop)

    df_train[columns].to_dict(orient='records')
    dicts_train_small = df_train[columns].to_dict(orient='records')
    dicts_val_small = df_val[columns].to_dict(orient='records')

    dv_small = DictVectorizer(sparse=False)
    dv_small.fit(dicts_train_small)

    dv_small.get_feature_names_out()

    X_train_small = dv_small.transform(dicts_train_small)
    X_val_small = dv_small.transform(dicts_val_small)
    
    

    model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_small.fit(X_train_small, y_train)

    y_pred = model_small.predict_proba(X_val_small)[:, 1]

    decision = (y_pred >= 0.5)
    
    score = (y_val == decision).mean()

    return score

In [81]:
score_test = calculate_accuracy("age")
score_test

np.float64(0.9010174740101747)

In [84]:
for e in list_features:
    accuracy_feature = calculate_accuracy(e)
    print(f"For the feature {e}, we have accuracy equal to {accuracy_feature}") 
    print(" ") 

For the feature age, we have accuracy equal to 0.9010174740101747
 
For the feature balance, we have accuracy equal to 0.9007962840079629
 
For the feature marital, we have accuracy equal to 0.9002433090024331
 
For the feature previous, we have accuracy equal to 0.9012386640123866
 


## Question 6

In [89]:
dicts_full_train = df_full_train[categorical_columns + numerical_columns].to_dict(orient='records')

In [90]:
list_c = [0.01, 0.1, 1, 10, 100]

In [91]:
def train_model(param_C):
    model = LogisticRegression(solver='liblinear', C=param_C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]

    decision = (y_pred >= 0.5)
    
    score = (y_val == decision).mean()

    return round(score, 3)

In [93]:
for C in list_c:
    ac_C = train_model(C)
    print(f"For C = {C}, we have accuracy equal to {ac_C}") 
    print(" ") 

For C = 0.01, we have accuracy equal to 0.898
 
For C = 0.1, we have accuracy equal to 0.9
 
For C = 1, we have accuracy equal to 0.9
 
For C = 10, we have accuracy equal to 0.9
 
For C = 100, we have accuracy equal to 0.9
 
