<a href="https://colab.research.google.com/github/riddlemeS4m/machine-learning-scientist-datacamp/blob/dev-google/module-5-linear-classifiers/regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regularization in Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning for the rest of the session so cell output stays compact.
# NOTE(review): this blanket filter also hides any warnings raised while fitting the
# models further down the notebook — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
!pip install scikit-learn



In [None]:
# regularization combats overfitting by making the model coefficients smaller

In [None]:
# hyperparameter C is inverse of regularization strength: larger C, less regularization

In [None]:
# Build two classifiers that differ only in C (the inverse regularization strength):
# C=100 regularizes weakly, C=0.01 regularizes strongly.
lr_weak_reg, lr_strong_reg = (LogisticRegression(C=strength) for strength in (100, 0.01))

In [None]:
# Fit both models on the training data, then report each model's training accuracy.
# A notebook cell only displays its last expression, so the scores are printed explicitly
# instead of relying on the cell's implicit output.
lr_weak_reg.fit(X_train, y_train)
lr_strong_reg.fit(X_train, y_train)

print("Training accuracy (weak regularization):", lr_weak_reg.score(X_train, y_train))  # recorded: 1.0
print("Training accuracy (strong regularization):", lr_strong_reg.score(X_train, y_train))  # recorded: 0.92

In [None]:
# regularization is an extra term added to the original loss function; it penalizes large coefficient values, which pulls the optimizer away from the sole goal of maximizing training accuracy
# without regularization, the model is free to maximize training accuracy
# the smaller the value of C, the stronger the penalty and the more it pulls away from maximizing training accuracy

In [None]:
# regularized loss = original loss + large coefficient penalty
# more regularization: lower training accuracy
# more regularization: (almost always) higher test accuracy

In [None]:
# Compare the held-out (test) accuracy of the two models.
# Both scores are printed because a notebook cell only displays its last expression;
# a bare first `score(...)` call would be computed and silently discarded.
print("Test accuracy (weak regularization):", lr_weak_reg.score(X_test, y_test))  # recorded: 0.86
print("Test accuracy (strong regularization):", lr_strong_reg.score(X_test, y_test))  # recorded: 0.88

In [None]:
# regularizing is compromising between not using a feature at all and fully using it
# if using a feature too heavily was causing overfitting, then regularization causes you to "fit less"
# in this case, the test accuracy improves because we overfitted less than without regularization

In [None]:
# lasso - linear regression with L1 regularization
# ridge - linear regression with L2 regularization

In [None]:
# everything about l1 and l2 also applies to logistic regression, both reduce overfitting
# l1 can perform feature selection

In [None]:
# L1-penalized model. `solver` selects the optimization method used to find the
# coefficients; liblinear is requested because the default solver is not compatible with L1.
lr_L1 = LogisticRegression(penalty='l1', solver='liblinear')
lr_L1.fit(X_train, y_train)

# L2-penalized model: no arguments are needed, since L2 is the default penalty.
lr_L2 = LogisticRegression()
lr_L2.fit(X_train, y_train)

In [None]:
# Draw the flattened coefficient vectors of both models on one set of axes so the
# effect of each penalty can be compared directly. Labels, a legend and a title are
# added so the two lines can be told apart and the figure stands on its own.
plt.plot(lr_L1.coef_.flatten(), label="L1 penalty")
plt.plot(lr_L2.coef_.flatten(), label="L2 penalty")
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient value")
plt.title("Logistic regression coefficients: L1 vs. L2 regularization")
plt.legend()
plt.show()  # also suppresses the list-of-Line2D repr the bare plot call would display

In [None]:
# result: L1 sets some coefficients exactly to zero, performing feature selection
# result: L2 just shrinks the coefficients

Exercise

In [None]:
# Sweep the inverse regularization strength C and compare training vs. validation error.
# The candidate values are defined once so the loop and the plot's x-axis cannot drift
# apart (the plot call needs `C_values`, which must exist before it is used).
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]  # small C = strong regularization

# Train and validation errors initialized as empty lists
train_errs = list()
valid_errs = list()

# Loop over values of C_value
for C_value in C_values:
    # Create LogisticRegression object and fit
    lr = LogisticRegression(C=C_value)
    lr.fit(X_train, y_train)

    # Evaluate error rates (1 - accuracy) and append to lists
    train_errs.append( 1.0 - lr.score(X_train, y_train) )
    valid_errs.append( 1.0 - lr.score(X_valid, y_valid) )

# Plot results on a log-scaled x-axis, since C spans several orders of magnitude
plt.semilogx(C_values, train_errs, C_values, valid_errs)
plt.legend(("train", "validation"))
plt.xlabel("C (inverse regularization strength)")
plt.ylabel("Error rate (1 - accuracy)")
plt.show()

Exercise

In [None]:
# Candidate values of C for the cross-validated search
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# L1-penalized logistic regression (liblinear solver)
lr = LogisticRegression(penalty='l1', solver='liblinear')

# Build the grid search over C and fit it on the training data
searcher = GridSearchCV(estimator=lr, param_grid=param_grid)
searcher.fit(X_train, y_train)

# Report the winning hyperparameters
print("Best CV params", searcher.best_params_)

# Count how many coefficients of the best model are nonzero (the selected features)
best_lr = searcher.best_estimator_
coefs = best_lr.coef_
print("Total number of features:", coefs.size)
print("Number of selected features:", np.count_nonzero(coefs))

Exercise

In [None]:
# NOTE(review): the most recent `lr` assigned in the visible cells is the estimator handed
# to GridSearchCV, which is never fitted directly, and `vocab` is not defined in view —
# confirm that a fitted `lr` and a matching `vocab` are provided before this cell runs.

# Rank the coefficients: argsort yields indices from the smallest coefficient to the largest
inds_ascending = np.argsort(lr.coef_.flatten())
inds_descending = inds_ascending[::-1]

# For each heading, print the vocabulary entries for the first five indices of its ranking:
# largest coefficients for the positive words, smallest for the negative words
for heading, ranking in (
    ("Most positive words: ", inds_descending),
    ("Most negative words: ", inds_ascending),
):
    print(heading, end="")
    for rank in range(5):
        print(vocab[ranking[rank]], end=", ")
    print("\n")