<a href="https://colab.research.google.com/github/sergekamanzi/wine_quality/blob/main/red_wine_quality_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**importing PANDAS for my dataset**

In [3]:
import pandas as pd

In [4]:
#%%capture
# Load datasets for red and white wines
red = pd.read_csv('/content/winequality-red.csv', sep=';')
red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [37]:
%%capture
red.describe()

In [21]:
%%capture
red.isnull().sum()

In [23]:
%%capture
red.info()

**Statistical Summary and Feature Correlation of Red Wine Data**

In [5]:
%%capture
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation heatmap to see relationships between features
plt.figure(figsize=(10, 8))
sns.heatmap(red.corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Red Wine Features')
plt.show()


**standard my datas**



In [6]:
from sklearn.preprocessing import StandardScaler


# Features that need scaling
X=red.drop(['quality'], axis=1)
Y=red['quality']


# Initialize the scaler
scaler = StandardScaler()


# Fit and transform the features
X_scaled = scaler.fit_transform(X)


# Now X_scaled contains the standardized features


In [7]:
# Check the shape of the NumPy arrays
X.shape,Y.shape

((1599, 11), (1599,))

**random forest model**

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
import numpy as np

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

**Linear Regression Model for Predicting Red Wine Quality**

In [11]:
model_fit= RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_fit.fit(x_train, y_train)

In [17]:
test_pred = model_fit.predict(x_test)

In [52]:
# R squared error
error_score = r2_score(y_test, test_pred)
print("R squared error: ", error_score)

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, test_pred)
print("Mean Squared Error (MSE): ", mse)

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE): ", rmse)


R squared error:  0.04840179919006571
Mean Squared Error (MSE):  0.621875
Root Mean Squared Error (RMSE):  0.7885905147793751


In [25]:
%%capture
plt.scatter(y_test, test_pred)
plt.xlabel("Actual Quality")
plt.ylabel("Predicted Quality")
plt.title("Actual vs Predicted Quality for training")
plt.show()

**Linear Regression model**

In [53]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [29]:
model = LinearRegression()
model.fit(X_train, y_train)

In [30]:
test_pred = model.predict(X_test)

In [54]:
# R-squared error
error_score = metrics.r2_score(y_test, test_pred)
print("R squared error:", error_score)

# Mean Squared Error (MSE)
mse = metrics.mean_squared_error(y_test, test_pred)
print("Mean Squared Error (MSE):", mse)

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

R squared error: 0.04840179919006571
Mean Squared Error (MSE): 0.621875
Root Mean Squared Error (RMSE): 0.7885905147793751


In [33]:
%%capture
plt.scatter(y_test, test_pred)
plt.xlabel("Actual Quality")
plt.ylabel("Predicted Quality")
plt.title("Actual vs Predicted Quality")
plt.show()

**Decision Trees model**

In [58]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import numpy as np

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [46]:
# i used decisiontreeregressor because im solving regression problems
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [55]:
test_pred = model.predict(X_test)

In [59]:
# R² score
r2 = r2_score(y_test, test_pred)
print("R squared error (R²): ", r2)

# MSE (Mean Squared Error)
mse = mean_squared_error(y_test, test_pred)
print("Mean Squared Error (MSE): ", mse)

# RMSE (Root Mean Squared Error)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE): ", rmse)

R squared error (R²):  0.04840179919006571
Mean Squared Error (MSE):  0.621875
Root Mean Squared Error (RMSE):  0.7885905147793751


In [44]:
%%capture
plt.scatter(y_test, test_pred)
plt.xlabel("Actual Quality")
plt.ylabel("Predicted Quality")
plt.title("Actual vs Predicted Quality")
plt.show()

**Predicting Red Wine Quality Using Linear Regression: User-Input Model**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore', message='X does not have valid feature names')

# Define the features and target for training
selected_features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'pH', 'alcohol']
X = red[selected_features]
Y = red['quality']

# Initialize and fit the scaler (use the same scaler as during training)
scaler = StandardScaler()
X_scaled_selected = scaler.fit_transform(X)

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X_scaled_selected, Y, test_size=0.2, random_state=42)

# Train the RandomForestRegressor model (this should already be done, but ensuring it's retrained here)
model_fit = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_fit.fit(x_train, y_train)

# Function to predict wine quality based on user input
def predict_wine_quality():
    # Get input values from the user
    fixed_acidity = float(input("Enter Fixed Acidity: "))
    volatile_acidity = float(input("Enter Volatile Acidity: "))
    citric_acid = float(input("Enter Citric Acid: "))
    residual_sugar = float(input("Enter Residual Sugar: "))
    ph = float(input("Enter pH level: "))
    alcohol = float(input("Enter Alcohol Content: "))

    # Construct the input array for prediction
    features = np.array([[fixed_acidity, volatile_acidity, citric_acid, residual_sugar, ph, alcohol]])

    # Scale the input features using the same scaler
    features_scaled = scaler.transform(features)

    # Predict the quality using the trained model
    prediction = model_fit.predict(features_scaled)[0]

    # Ensure the predicted quality is within the 0-10 range (round to nearest value)
    predicted_quality = round(np.clip(prediction, 0, 10), 2)

    print(f"Predicted Wine Quality (out of 10): {predicted_quality}")

# Call the function to interact with the user
predict_wine_quality()