<a href="https://colab.research.google.com/github/sergekamanzi/wine_quality/blob/main/red_wine_quality_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing pandas to load the dataset**

In [46]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [47]:
#%%capture
# Load the red wine dataset; the file uses ';' as its field separator
red = pd.read_csv('/content/winequality-red.csv', sep=';')
# Preview the first rows
red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [8]:
%%capture
# Descriptive statistics per column (display is suppressed by this cell's %%capture magic)
red.describe()

In [9]:
%%capture
# Count of missing values in each column (display is suppressed by this cell's %%capture magic)
red.isnull().sum()

In [10]:
%%capture
# Column dtypes and non-null counts (printed text is swallowed by this cell's %%capture magic)
red.info()

**Statistical Summary and Feature Correlation of Red Wine Data**

In [45]:
%%capture
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between all columns, drawn as an annotated heatmap
correlations = red.corr()
heatmap_style = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.figure(figsize=(10, 8))
sns.heatmap(correlations, **heatmap_style)
plt.title('Correlation Heatmap of Red Wine Features')
plt.show()


**Standardizing the features**



In [44]:
from sklearn.preprocessing import StandardScaler


# Target column and the remaining columns used as predictors
Y = red['quality']
X = red.drop(columns=['quality'])

# Standardize every predictor column.
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so statistics of the later test rows influence the scaling. Consider fitting
# on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# X_scaled now holds the standardized predictors


In [13]:
# Check the dimensions of the feature table X and the target Y
X.shape,Y.shape

((1599, 11), (1599,))

**Linear regression using gradient descent**

In [43]:
%%capture
import numpy as np

#Gradient Descent Linear Regression

# Add an intercept term (bias) to the feature matrix.
# The assignment rebinds X_scaled, so it is guarded: re-running this cell must
# not prepend a second column of ones. The scaler centers every feature, so a
# genuine feature column can never consist solely of 1.0 values.
# Later cells that read X_scaled also receive this bias column.
if not np.all(X_scaled[:, 0] == 1.0):
    X_scaled = np.c_[np.ones(X_scaled.shape[0]), X_scaled]  # column of ones for the intercept term

def compute_cost(X, y, theta):
    """Return the linear-regression cost of `theta` on (X, y).

    The cost is the sum of squared residuals of ``X.dot(theta)`` against ``y``
    divided by ``2 * len(y)``, i.e. half the mean squared error.
    """
    n_samples = len(y)
    residuals = X.dot(theta) - y
    squared_error_total = np.sum(np.square(residuals))
    return (1 / (2 * n_samples)) * squared_error_total

def gradient_descent(X, y, theta, learning_rate, num_iters):
    """Run `num_iters` batch gradient-descent updates starting from `theta`.

    Returns a tuple ``(theta, cost_history)``: the parameters after the last
    update and an array with the cost measured right after each update.
    """
    n_samples = len(y)
    costs = np.zeros(num_iters)

    for step in range(num_iters):
        residuals = X.dot(theta) - y
        gradients = (1 / n_samples) * X.T.dot(residuals)
        theta = theta - learning_rate * gradients
        # Record the cost of the freshly updated parameters
        costs[step] = compute_cost(X, y, theta)

    return theta, costs

# One parameter per column of the design matrix, all starting at zero
m, n = X_scaled.shape  # (number of examples, number of columns)
theta = np.zeros(n)

# Optimizer settings
learning_rate, num_iters = 0.01, 1000

# Fit the parameters with batch gradient descent
theta_optimized, cost_history = gradient_descent(
    X_scaled, Y, theta, learning_rate, num_iters
)

# Learning curve: the cost should fall as the iterations progress
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(num_iters), cost_history, color='blue')
ax.set_title('Cost Reduction over Iterations (Learning Curve)')
ax.set_xlabel('Number of Iterations')
ax.set_ylabel('Cost (MSE)')
ax.grid(True)
plt.show()

# Final parameters together with the cost after the last iteration
theta_optimized, cost_history[-1]


**Random Forest model**

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
import numpy as np

In [16]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [17]:
# Random forest of 100 trees, each limited to depth 10, with a fixed seed
model_fit= RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
# Train on the 80% training split
model_fit.fit(x_train, y_train)

In [18]:
# Predict quality for the held-out rows
test_pred = model_fit.predict(x_test)

In [24]:
# Score the random forest on the held-out split
error_score = r2_score(y_test, test_pred)        # coefficient of determination (R²)
rf_mse = mean_squared_error(y_test, test_pred)   # mean squared error, reused when models are compared
rmse = np.sqrt(rf_mse)                           # square root of the MSE

# Report the three metrics
print("R squared error: ", error_score)
print("Mean Squared Error of rf (MSE): ", rf_mse)
print("Root Mean Squared Error (RMSE): ", rmse)


R squared error:  0.5194630213099143
Mean Squared Error of rf (MSE):  0.3140337312203306
Root Mean Squared Error (RMSE):  0.5603871262085975


In [25]:
%%capture
# Predicted vs. actual quality for the held-out rows.
# The points come from y_test and the test-set predictions, so the title names
# the test set (it previously said "for training").
plt.scatter(y_test, test_pred)
plt.xlabel("Actual Quality")
plt.ylabel("Predicted Quality")
plt.title("Actual vs Predicted Quality (Random Forest, test set)")
plt.show()

**Linear Regression model**

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

In [27]:
# 80/20 split with the same seed as the random forest section; the feature splits
# are bound to capitalized names here, while y_train / y_test are rebound
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [28]:
# Ordinary least-squares linear regression with scikit-learn's default settings
model = LinearRegression()
# Train on the 80% training split
model.fit(X_train, y_train)

In [29]:
# Predict quality for the held-out rows (rebinds test_pred)
test_pred = model.predict(X_test)

In [31]:
# Score the linear regression on the held-out split
error_score = metrics.r2_score(y_test, test_pred)          # coefficient of determination (R²)
lr_mse = metrics.mean_squared_error(y_test, test_pred)     # mean squared error, reused when models are compared
rmse = np.sqrt(lr_mse)                                     # square root of the MSE

# Report the three metrics
print("R squared error:", error_score)
print("Mean Squared Error of lr(MSE):", lr_mse)
print("Root Mean Squared Error (RMSE):", rmse)

R squared error: 0.403180341279622
Mean Squared Error of lr(MSE): 0.39002514396395493
Root Mean Squared Error (RMSE): 0.624519930798013


In [32]:
%%capture
# Predicted vs. actual quality for the held-out rows
fig, ax = plt.subplots()
ax.scatter(y_test, test_pred)
ax.set_xlabel("Actual Quality")
ax.set_ylabel("Predicted Quality")
ax.set_title("Actual vs Predicted Quality")
plt.show()

**Decision Tree model**

In [34]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [35]:
# Same 80/20 split and seed as in the linear regression section
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [36]:
# DecisionTreeRegressor is used because quality is predicted as a numeric (regression) target.
# Note: this rebinds `model`, which previously held the linear regression.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [37]:
# Predict quality for the held-out rows (rebinds test_pred)
test_pred = model.predict(X_test)

In [39]:
# Score the decision tree on the held-out split
r2 = r2_score(y_test, test_pred)                 # coefficient of determination (R²)
dt_mse = mean_squared_error(y_test, test_pred)   # mean squared error, reused when models are compared
rmse = np.sqrt(dt_mse)                           # square root of the MSE

# Report the three metrics
print("R squared error (R²): ", r2)
print("Mean Squared Error (MSE): ", dt_mse)
print("Root Mean Squared Error (RMSE): ", rmse)

R squared error (R²):  0.08187510273614373
Mean Squared Error (MSE):  0.6
Root Mean Squared Error (RMSE):  0.7745966692414834


In [40]:
%%capture
# Predicted vs. actual quality for the held-out rows
fig, ax = plt.subplots()
ax.scatter(y_test, test_pred)
ax.set_xlabel("Actual Quality")
ax.set_ylabel("Predicted Quality")
ax.set_title("Actual vs Predicted Quality")
plt.show()

**Saving the best model using the joblib library**

In [42]:
import joblib  # Import joblib to save the model

# Test-set MSE recorded for each model in the sections above
model_performance = {
    'Linear Regression': lr_mse,
    'Decision Tree': dt_mse,
    'Random Forest': rf_mse,
}

# Lower MSE is better, so the best model is the key with the smallest value
best_model_name = min(model_performance, key=model_performance.get)
best_model_mse = model_performance[best_model_name]

print(f"Best Performing Model: {best_model_name} with MSE: {best_model_mse}")

# Pick the fitted estimator that belongs to the winning name. The notebook binds
# the random forest to `model_fit` and reuses `model` for the linear regression
# and then the decision tree, so at this point `model` is the decision tree.
# Run this cell before the user-input section, which rebinds `model_fit`.
if best_model_name == 'Random Forest':
    best_model = model_fit
elif best_model_name == 'Decision Tree':
    best_model = model
elif best_model_name == 'Linear Regression':
    # The linear estimator was overwritten by the decision tree; refit it on the
    # same training split (the fit is deterministic, so the model is the same).
    best_model = LinearRegression().fit(X_train, y_train)
else:
    raise KeyError(f"No fitted estimator is known for model '{best_model_name}'")

# Persist the winner; the confirmation is printed only after a successful dump
joblib.dump(best_model, 'best_model.pkl')
print(f"{best_model_name} model saved as 'best_model.pkl'.")


Best Performing Model: Random Forest with MSE: 0.3140337312203306
Random Forest model saved as 'best_model.pkl'.


**Predicting Red Wine Quality from User Input Using the Random Forest Model**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore', message='X does not have valid feature names')

# Restrict the predictors to the six measurements the user is asked for
selected_features = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'residual sugar', 'pH', 'alcohol',
]
Y = red['quality']
X = red[selected_features]

# Fit a new scaler on just these columns. This rebinds `scaler`; it is not the
# scaler that was fitted on the full feature set earlier in the notebook.
scaler = StandardScaler()
X_scaled_selected = scaler.fit_transform(X)

# 80/20 split with the same seed used in the model sections
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled_selected, Y, test_size=0.2, random_state=42
)

# Train a random forest on the reduced feature set. This rebinds `model_fit`,
# replacing the forest trained on the full feature matrix.
model_fit = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_fit.fit(x_train, y_train)

def _prompt_float(prompt):
    """Ask `prompt` until the answer parses as a finite float, then return it."""
    while True:
        raw = input(prompt)
        try:
            value = float(raw)
        except ValueError:
            print(f"'{raw}' is not a number, please try again.")
            continue
        if np.isfinite(value):
            return value
        print(f"'{raw}' is not a finite number, please try again.")


# Function to predict wine quality based on user input
def predict_wine_quality():
    """Prompt for six wine measurements and print the predicted quality.

    The answers are collected in the order fixed acidity, volatile acidity,
    citric acid, residual sugar, pH, alcohol, scaled with the module-level
    `scaler`, and passed to the module-level `model_fit`. An answer that is not
    a finite number is asked for again instead of aborting the whole sequence.
    The prediction is clipped to the range 0-10 and rounded to two decimals
    before it is printed.
    """
    # Prompts listed in the column order that the feature row must have
    prompts = (
        "Enter Fixed Acidity: ",
        "Enter Volatile Acidity: ",
        "Enter Citric Acid: ",
        "Enter Residual Sugar: ",
        "Enter pH level: ",
        "Enter Alcohol Content: ",
    )

    # Single-row 2-D array, as expected by transform() and predict()
    features = np.array([[_prompt_float(prompt) for prompt in prompts]])

    # Scale the input with the scaler currently bound at module level
    features_scaled = scaler.transform(features)

    # predict() returns one value per row; take the only one
    prediction = model_fit.predict(features_scaled)[0]

    # Keep the reported quality within 0-10 and round to two decimals
    predicted_quality = round(np.clip(prediction, 0, 10), 2)

    print(f"Predicted Wine Quality (out of 10): {predicted_quality}")

# Start the interactive prompt sequence and print the prediction
predict_wine_quality()