In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Red wine quality: standardize -> PCA -> linear regression.
#
# Pipeline overview:
#   1. Read the semicolon-delimited CSV and separate the 'quality' target
#      from the remaining feature columns.
#   2. Hold out 20% of the rows as a test set (fixed seed for repeatability).
#   3. Standardize features using statistics learned from the training split
#      only, so no information from the test split leaks into the scaler.
#   4. Project onto the principal components that together explain 95% of
#      the training variance.
#   5. Fit an ordinary least-squares model on the components and report
#      RMSE / R2 for both splits.

# Load the red wine dataset (file uses ';' as the field separator)
red_wine = pd.read_csv('winequality-red.csv', delimiter=';')

# Prepare the data: every column except 'quality' is a feature
X_red = red_wine.drop(columns=['quality'])
y_red = red_wine['quality']

# Split the data into training and test sets (80/20, seeded)
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(
    X_red, y_red, test_size=0.2, random_state=42
)

# Standardize the data. The scaler is fitted on the training split and the
# same transformation is then applied to the test split.
scaler_red = StandardScaler()
X_train_red = scaler_red.fit_transform(X_train_red)
X_test_red = scaler_red.transform(X_test_red)

# Perform PCA. A float n_components asks PCA to keep as many components as
# are needed to explain that fraction of the variance.
pca_red = PCA(n_components=0.95)  # Keep 95% of variance
X_train_red_pca = pca_red.fit_transform(X_train_red)
X_test_red_pca = pca_red.transform(X_test_red)

# Train a regression model using the principal components
regression_red = LinearRegression()
regression_red.fit(X_train_red_pca, y_train_red)

# Predict on both splits
y_pred_train_red = regression_red.predict(X_train_red_pca)
y_pred_test_red = regression_red.predict(X_test_red_pca)

# Evaluate.
# RMSE is computed as sqrt(MSE) explicitly rather than through
# mean_squared_error(..., squared=False): the `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises
# TypeError. Taking the root ourselves works on every scikit-learn version.
rmse_train_red = mean_squared_error(y_train_red, y_pred_train_red) ** 0.5
rmse_test_red = mean_squared_error(y_test_red, y_pred_test_red) ** 0.5

print("Red Wine PCA + Regression Train RMSE:", rmse_train_red)
print("Red Wine PCA + Regression Test RMSE:", rmse_test_red)
print("Red Wine PCA + Regression Train R2:", r2_score(y_train_red, y_pred_train_red))
print("Red Wine PCA + Regression Test R2:", r2_score(y_test_red, y_pred_test_red))


Red Wine PCA + Regression Train RMSE: 0.6525031354487609
Red Wine PCA + Regression Test RMSE: 0.62650510361538
Red Wine PCA + Regression Train R2: 0.34558068824698185
Red Wine PCA + Regression Test R2: 0.3993800679438354
