## Adding Regularization to the Model

In [8]:
import pandas as pd
# Load the feature table and the target table from their CSV files.
feats, target = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/OSI_feats_e3.csv', '../data/OSI_target_e2.csv')
)

In [9]:
from sklearn.model_selection import train_test_split
# Split settings: test_size is the held-out fraction, random_state pins the
# shuffle so the same partition is produced on every run.
test_size, random_state = 0.2, 13
X_train, X_test, y_train, y_test = train_test_split(
    feats,
    target,
    random_state=random_state,
    test_size=test_size,
)

In [10]:
# Report the dimensions of every split as a quick sanity check on the partition.
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
# The last line reports the test target, so it must be labelled y_test
# (it was previously mislabelled as y_train).
print(f'Shape of y_test: {y_test.shape}')

Shape of X_train: (9864, 68)
Shape of y_train: (9864, 1)
Shape of X_test: (2466, 68)
Shape of y_train: (2466, 1)


In [12]:
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
# Candidate values for C: nine points from 1e-2 to 1e6, one per decade.
Cs = np.logspace(-2, 6, 9)

# Settings common to both cross-validated models (10-fold CV over the Cs grid).
shared_cv_settings = dict(Cs=Cs, cv=10, random_state=42, max_iter=10000)

# The l1 model names the liblinear solver explicitly; the l2 model keeps the
# estimator's default solver.
model_l1 = LogisticRegressionCV(penalty='l1', solver='liblinear', **shared_cv_settings)
model_l2 = LogisticRegressionCV(penalty='l2', **shared_cv_settings)

# Both models are fitted on the same training rows against the 'Revenue' column.
revenue_train = y_train['Revenue']
model_l1.fit(X_train, revenue_train)
model_l2.fit(X_train, revenue_train)

In [14]:
# C_ stores the C value selected by cross-validation; read its first entry
# for each fitted model and report it.
best_c_l1 = model_l1.C_[0]
best_c_l2 = model_l2.C_[0]
print(f'Best hyperparameter for l1 regularization model: {best_c_l1}')
print(f'Best hyperparameter for l2 regularization model: {best_c_l2}')

Best hyperparameter for l1 regularization model: 10.0
Best hyperparameter for l2 regularization model: 0.01


In [15]:
# Predict labels for the held-out rows with each fitted model (l1 first, then l2).
y_pred_l1, y_pred_l2 = (
    fitted_model.predict(X_test) for fitted_model in (model_l1, model_l2)
)

In [16]:
from sklearn import metrics
# Accuracy of each model's predictions against the held-out labels.
accuracy_l1, accuracy_l2 = (
    metrics.accuracy_score(y_test, predictions)
    for predictions in (y_pred_l1, y_pred_l2)
)
# Scores are fractions in [0, 1]; scale by 100 to print them as percentages.
print(f'Accuracy of the model with l1 regularization is {accuracy_l1*100:.4f}%')
print(f'Accuracy of the model with l2 regularization is {accuracy_l2*100:.4f}%')

Accuracy of the model with l1 regularization is 89.1727%
Accuracy of the model with l2 regularization is 89.3350%


In [18]:
# Precision, recall and F-score with average='binary'. The fourth return value
# is not used, so it is discarded into `_`.
precision_l1, recall_l1, fscore_l1, _ = metrics.precision_recall_fscore_support(y_pred=y_pred_l1, y_true=y_test, average='binary')
precision_l2, recall_l2, fscore_l2, _ = metrics.precision_recall_fscore_support(y_pred=y_pred_l2, y_true=y_test, average='binary')
# All three metrics use the same '.4f' spec. The fscore spec previously had a
# stray space flag (': .4f'), which padded positive values with an extra blank
# and left that line misaligned with the other two.
print(f'l1\nPrecision: {precision_l1:.4f}\nRecall: {recall_l1:.4f}\nfscore: {fscore_l1:.4f}\n\n')
print(f'l2\nPrecision: {precision_l2:.4f}\nRecall: {recall_l2:.4f}\nfscore: {fscore_l2:.4f}')

l1
Precision: 0.7286
Recall: 0.4050
fscore:  0.5206


l2
Precision: 0.7363
Recall: 0.4134
fscore:  0.5295


In [19]:
# List the l1 model's coefficients in ascending order, each labelled with its
# feature name. Sorting (coefficient, name) tuples means equal coefficients
# are ordered by feature name.
feature_names = X_train.columns.values.tolist()
ranked_pairs = sorted(zip(model_l1.coef_[0], feature_names))
coef_list = [f'{name}: {weight}' for weight, name in ranked_pairs]
for line in coef_list:
    print(line)

ExitRates: -16.009484846390706
Browser_11: -3.592835707785565
TrafficType_15: -3.040126119856472
TrafficType_19: -2.790744186945984
OperatingSystems_6: -2.0377534512813567
TrafficType_18: -1.818598979356581
Browser_3: -1.6664330423984195
OperatingSystems_3: -1.442939575286418
OperatingSystems_1: -1.3661862180291797
OperatingSystems_4: -1.3338862020698212
Browser_13: -1.3327031980935446
OperatingSystems_2: -1.1757977131630206
Browser_6: -1.10493105695464
Browser_7: -0.922122032654981
TrafficType_13: -0.9015419721588587
OperatingSystems_8: -0.8251399623701103
TrafficType_14: -0.7220271174248961
Browser_2: -0.6169689114766488
TrafficType_3: -0.5849748832151968
Browser_4: -0.5339172467939178
Browser_5: -0.5244472855932206
Browser_1: -0.5058656933293494
Browser_8: -0.4791484495744584
TrafficType_1: -0.3743204284388919
TrafficType_6: -0.3727852499814628
VisitorType_Returning_Visitor: -0.31105967077558194
TrafficType_4: -0.2751499744031358
BounceRates: -0.270187111115926
TrafficType_2: -0.231

In [20]:
# Same listing for the l2 model: coefficients in ascending order, each labelled
# with its feature name (ties on the coefficient fall back to name order).
l2_weights = model_l2.coef_[0]
column_labels = X_train.columns.values.tolist()
coef_list = [
    f'{label}: {value}'
    for value, label in sorted(zip(l2_weights, column_labels))
]
for entry in coef_list:
    print(entry)

Month_May: -0.2543517004725788
TrafficType_13: -0.21678870746777357
VisitorType_Returning_Visitor: -0.1933752912344788
TrafficType_3: -0.18858378940440287
Month_Dec: -0.18519132710158503
Month_Mar: -0.17140547271732584
OperatingSystems_3: -0.1408866170032516
ExitRates: -0.11740773716263075
TrafficType_1: -0.11055967513199383
SpecialDay: -0.10761089763698553
BounceRates: -0.09227826561858632
Region_4: -0.06902841083784055
Browser_6: -0.06437675283093477
Region_9: -0.06200945039187729
Browser_3: -0.055886277167606864
Month_June: -0.03242660249187416
TrafficType_6: -0.03180154669417395
TrafficType_15: -0.02341020614782407
Browser_2: -0.023165062104160278
Browser_13: -0.01881506346817186
Browser_7: -0.017168648101402142
Region_1: -0.016296983376358248
OperatingSystems_6: -0.015498026889709242
TrafficType_19: -0.013026555756653416
Region_7: -0.012330181180759129
Region_3: -0.010248409052295094
OperatingSystems_8: -0.007972449754517877
OperatingSystems_4: -0.007934530103885649
Informational: