In [2]:
# Wine Quality Prediction Project

## Problem Description
#We want to predict the quality score (0-10) of red wines based on chemical properties like acidity, sugar, and alcohol. This helps winemakers assess quality early without relying solely on expensive human tasters. It's a regression problem because quality is a numerical score.

## How ML Helps
#ML models can learn patterns from data (e.g., higher alcohol often means better quality). We use supervised learning: train on features to predict scores. Benefits: Automation, consistency. Limitations: Scores are subjective and imbalanced (mostly 5-6).

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv")

In [5]:
df

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
...,...
1594,6.2;0.6;0.08;2;0.09;32;44;0.9949;3.45;0.58;10.5;5
1595,5.9;0.55;0.1;2.2;0.062;39;51;0.99512;3.52;0.76...
1596,6.3;0.51;0.13;2.3;0.076;29;40;0.99574;3.42;0.7...
1597,5.9;0.645;0.12;2;0.075;32;44;0.99547;3.57;0.71...


In [6]:
df.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


In [7]:
# Quick checks
print(df.head())  # First 5 rows
print(df.info())  # Data types, no nulls
print(df.describe())  # Stats like mean, min/max
print(df.isnull().sum())  # Confirm no missing values

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [9]:
# Split features/target and train/test (80/20)
X = df.drop('quality', axis=1)
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

KeyError: "['quality'] not found in axis"

In [10]:
# Scale features (important for regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

NameError: name 'X_train' is not defined

In [11]:
# Exploratory data anaylsis

In [12]:
# Target distribution
plt.figure(figsize=(8, 6))
sns.histplot(df['quality'], bins=6, kde=True)
plt.title('Wine Quality Distribution')
plt.show()
# Insight: Mostly 5-6, imbalanced—models might bias toward average.

# Correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlations')
plt.show()
# Insight: Alcohol (+0.48) good for quality; volatile acidity (-0.39) bad.

# Boxplots for outliers
plt.figure(figsize=(12, 6))
sns.boxplot(data=X)
plt.xticks(rotation=45)
plt.title('Feature Boxplots')
plt.show()
# Insight: Some outliers in sulfur—could clip if needed, but skip for now.

# Pairplot for key features
sns.pairplot(df[['alcohol', 'volatile acidity', 'quality']], hue='quality')
plt.show()
# Insight: Alcohol increases with quality—non-linear?

KeyError: 'quality'

<Figure size 800x600 with 0 Axes>

In [13]:
#training

In [14]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Models and params
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'RandomForest': RandomForestRegressor(random_state=42)
}

params = {
    'Linear': {},
    'Ridge': {'alpha': [0.1, 1.0, 10.0]},
    'RandomForest': {'n_estimators': [50, 100], 'max_depth': [5, 10]}
}

results = []
best_model = None
best_rmse = float('inf')

for name, model in models.items():
    if params[name]:
        grid = GridSearchCV(model, params[name], cv=5, scoring='neg_mean_squared_error')
        grid.fit(X_train_scaled, y_train)
        model = grid.best_estimator_
        print(f'Best params for {name}: {grid.best_params_}')
    else:
        model.fit(X_train_scaled, y_train)
    
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    results.append({'Model': name, 'RMSE': rmse, 'R²': r2})
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model

# Results table
results_df = pd.DataFrame(results)
print(results_df)

# Save best model and scaler
import joblib
joblib.dump(best_model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Plot actual vs predicted for best
y_pred = best_model.predict(X_test_scaled)
plt.scatter(y_test, y_pred)
plt.plot([3, 8], [3, 8], 'r--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()

NameError: name 'X_train_scaled' is not defined