In [20]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [21]:
# Read the wine-quality CSV into a DataFrame and list the available columns
df = pd.read_csv(filepath_or_buffer='wine_quality.csv')
print(df.columns)

In [22]:
# Separate the predictor columns from the 'quality' label column
features = df.drop('quality', axis=1)
y = df['quality']

In [23]:
## 1. Data transformation
# Scaling the features using StandardScaler
from sklearn.preprocessing import StandardScaler
standard_scaler_fit = StandardScaler().fit(features)
X = standard_scaler_fit.transform(features)

In [24]:
## 2. Train-test split
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

In [25]:
## 3. Fitting a logistic regression classifier without regularization
# C is the inverse regularization strength, so C=np.inf switches the penalty
# off. The string penalty='none' was deprecated in scikit-learn 1.2 and is
# rejected from 1.4 onwards, whereas an infinite C is accepted by old and new
# releases alike.
from sklearn.linear_model import LogisticRegression

clf_no_reg = LogisticRegression(C=np.inf)
clf_no_reg.fit(X_train, y_train)


In [26]:
## 4. Plotting the coefficients
# Bar chart of the fitted weights, ordered from smallest to largest
predictors = features.columns
# Flatten coef_ to 1-D so each weight lines up with one column name
coefficients = clf_no_reg.coef_.ravel()
coef = pd.Series(data=coefficients, index=predictors).sort_values()
coef.plot.bar(title='Coefficients (no regularization)')
plt.tight_layout()
plt.show()
plt.clf()

In [27]:
## 5. Training and test performance
# F1 score of clf_no_reg on the training split and on the held-out test split
from sklearn.metrics import f1_score
y_pred_train = clf_no_reg.predict(X_train)
y_pred_test = clf_no_reg.predict(X_test)
print('Training Score', f1_score(y_true=y_train, y_pred=y_pred_train))
print('Testing Score', f1_score(y_true=y_test, y_pred=y_pred_test))

In [28]:
## 6. Default Implementation (L2-regularized!)
# LogisticRegression() with no arguments applies an L2 penalty; fit it on the training split
clf_default = LogisticRegression()
clf_default.fit(X=X_train, y=y_train)

In [29]:
## 7. Ridge Scores
# F1 score of the default (L2) model on the training and test splits
y_pred_test_ridge = clf_default.predict(X_test)
y_pred_train_ridge = clf_default.predict(X_train)
print(' Ridge-regularized Training Score', f1_score(y_true=y_train, y_pred=y_pred_train_ridge))
print('Ridge-regularized Testing Score', f1_score(y_true=y_test, y_pred=y_pred_test_ridge))

In [30]:
## 8. Coarse-grained hyperparameter tuning
# Try five values of C from 1e-4 to 1, recording train and test F1 for each
C_array = [1e-4, 1e-3, 1e-2, 1e-1, 1]
training_array, test_array = [], []
for c_value in C_array:
    clf = LogisticRegression(C=c_value)
    clf.fit(X_train, y_train)
    # Predict on both splits with the model fitted for this C
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    training_array.append(f1_score(y_train, y_pred_train))
    test_array.append(f1_score(y_test, y_pred_test))

In [31]:
## 9. Plotting training and test scores as a function of C
# Line plot of the F1 scores gathered for each C, with C on a log-scaled
# x-axis. Both axes and the figure are labelled so the chart can be read on
# its own when the notebook is skimmed.
plt.plot(C_array, training_array, label='Training Score')
plt.plot(C_array, test_array, label='Test Score')
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('F1 score')
plt.title('Training vs. test F1 score as a function of C')
plt.legend()
plt.show()
plt.clf()

In [32]:
## 10. Making a parameter grid for GridSearchCV
# 100 log-spaced candidate values for C between 1e-4 and 1e-2
C_array = np.logspace(start=-4, stop=-2, num=100)
# GridSearchCV's param_grid takes a mapping of parameter name -> candidate values
tuning_C = dict(C=C_array)


In [33]:
## 11. Implementing GridSearchCV with l2 penalty
# 5-fold cross-validated search over the C grid, ranking candidates by F1
from sklearn.model_selection import GridSearchCV
clf_gs = LogisticRegression()
gs = GridSearchCV(estimator=clf_gs, param_grid=tuning_C, cv=5, scoring='f1')
gs.fit(X_train, y_train)

In [34]:
## 12. Optimal C value and the score corresponding to it
# Show the parameter setting picked by the grid search next to its score
best_params, best_score = gs.best_params_, gs.best_score_
print(best_params, best_score)

In [35]:
## 13. Validating the "best classifier"
# Refit with the C selected by the grid search and score it on the test split
best_C = gs.best_params_['C']
clf_best = LogisticRegression(C=best_C)
clf_best.fit(X_train, y_train)
y_pred_best = clf_best.predict(X_test)
print(f1_score(y_test, y_pred_best))

In [36]:
## 14. Implementing L1 hyperparameter tuning with LogisticRegressionCV
# Cross-validated L1 search over 100 log-spaced C values from 1e-2 to 1e2, scored by F1
from sklearn.linear_model import LogisticRegressionCV
C_array = np.logspace(start=-2, stop=2, num=100)
clf_l1 = LogisticRegressionCV(
    Cs=C_array,
    penalty='l1',
    solver='liblinear',
    scoring='f1',
    cv=5,
)
# Fit on all rows (X, y) rather than the training split; cv=5 is handled by the estimator
clf_l1.fit(X, y)

In [37]:

## 15. Optimal C value and corresponding coefficients
# Report the C value stored on the fitted L1 model together with its coefficients
for label, value in (
    ('Best C value', clf_l1.C_),
    ('Best fit coefficients', clf_l1.coef_),
):
    print(label, value)

In [38]:
## 16. Plotting the tuned L1 coefficients
# Bar chart of the L1 model's weights, ordered from smallest to largest
# Flatten coef_ to 1-D so each weight lines up with one predictor name
coefficients = clf_l1.coef_.ravel()
coef = pd.Series(data=coefficients, index=predictors).sort_values()
coef.plot.bar(title='Coefficients for tuned L1')
plt.tight_layout()
plt.show()
plt.clf()