# Regularization - Titanic dataset

In [6]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

In [2]:
data = pd.read_csv("data/titanic_encoded.csv")
data.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,class_First,class_Third,who_child,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,0,1,0,0,0,1
1,1,1,38.0,1,0,71.2833,1,1,0,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,1,0,0,0,0,1
4,0,3,35.0,0,0,8.05,0,0,1,0,0,0,1


In [3]:
y = data["survived"]
X = data.drop(columns="survived")

In [5]:
# We MinMaxScale our features for you
scaler = MinMaxScaler().fit(X)
X_train_scaled = scaler.transform(X)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)

## Logistic Regression without regularization

In [7]:
log_model = LogisticRegression(max_iter=10000).fit(X_train_scaled, y)

permutation_score = permutation_importance(log_model, X_train_scaled, y, n_repeats=10)

importance_df = pd.DataFrame(np.vstack((X.columns,
                                        permutation_score.importances_mean)).T)

importance_df.columns=['feature','score decrease']

importance_df.sort_values(by="score decrease", ascending = False)

Unnamed: 0,feature,score decrease
5,sex_female,0.207703
1,age,0.027031
7,class_Third,0.017787
8,who_child,0.013305
0,pclass,0.011485
2,sibsp,0.009944
6,class_First,0.005042
3,parch,0.003081
10,embark_town_Queenstown,0.0007
9,embark_town_Cherbourg,0.00056


In [8]:
log_model = LogisticRegression(max_iter=1000, penalty='none').fit(X_train_scaled, y)
pd.DataFrame({'features': X.columns, 'coefs': log_model.coef_[0]})

Unnamed: 0,features,coefs
0,pclass,2.547187
1,age,-2.196151
2,sibsp,-2.477131
3,parch,-0.89382
4,fare,1.358812
5,sex_female,2.671883
6,class_First,2.360417
7,class_Third,-2.456891
8,who_child,1.336356
9,embark_town_Cherbourg,-11.221671


## Logistic Regression - L2

In [9]:
log_model = LogisticRegression().fit(X_train_scaled, y) # Fit model
pd.DataFrame({'features': X.columns, 'coefs': log_model.coef_[0]})

Unnamed: 0,features,coefs
0,pclass,-0.715694
1,age,-1.516539
2,sibsp,-1.561901
3,parch,-0.539578
4,fare,0.482459
5,sex_female,2.482402
6,class_First,0.64899
7,class_Third,-0.78623
8,who_child,1.126462
9,embark_town_Cherbourg,0.26033


## Logistic Regression - L1

In [10]:
log_model = LogisticRegression(solver='liblinear', penalty='l1').fit(X_train_scaled, y) # Fit model
pd.DataFrame({'features': X.columns, 'coefs': log_model.coef_[0]}).sort_values(by='coefs', ascending=False)

Unnamed: 0,features,coefs
5,sex_female,2.538275
8,who_child,1.093132
6,class_First,0.659536
9,embark_town_Cherbourg,0.338242
4,fare,0.0
11,embark_town_Southampton,0.0
10,embark_town_Queenstown,-0.082457
3,parch,-0.202271
7,class_Third,-0.73642
0,pclass,-0.893533
