# Wine Quality Prediction


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Load the dataset; the relative path means WineQT.csv is looked up in the
# notebook's current working directory.
df = pd.read_csv('WineQT.csv')


In [None]:
# Inspect column names, dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 107.3 KB


In [None]:
# Number of rows per quality label (the prediction target).
df["quality"].value_counts()

Unnamed: 0_level_0,count
quality,Unnamed: 1_level_1
5,483
6,462
7,143
4,33
8,16
3,6


In [None]:
# Step 3: Define Features and Target
# The target is the `quality` column; the features are every remaining column.
y = df['quality']
X = df.drop(columns='quality')



In [None]:
# y is df['quality'], so this shows the per-label row counts of the target.
y.value_counts()

Unnamed: 0_level_0,count
quality,Unnamed: 1_level_1
5,483
6,462
7,143
4,33
8,16
3,6


In [None]:
# Display features and target together; the cell value is a tuple, so the
# output is the tuple's plain-text repr rather than a rendered table.
X,y

(      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
 0               7.4             0.700         0.00             1.9      0.076   
 1               7.8             0.880         0.00             2.6      0.098   
 2               7.8             0.760         0.04             2.3      0.092   
 3              11.2             0.280         0.56             1.9      0.075   
 4               7.4             0.700         0.00             1.9      0.076   
 ...             ...               ...          ...             ...        ...   
 1138            6.3             0.510         0.13             2.3      0.076   
 1139            6.8             0.620         0.08             1.9      0.068   
 1140            6.2             0.600         0.08             2.0      0.090   
 1141            5.9             0.550         0.10             2.2      0.062   
 1142            5.9             0.645         0.12             2.0      0.075   
 
       free su

In [None]:
# Train-Test Split
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions similar in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Per-label row counts in the training partition (after the stratified split).
y_train.value_counts()

Unnamed: 0_level_0,count
quality,Unnamed: 1_level_1
5,386
6,370
7,114
4,26
8,13
3,5


In [None]:
# Define Preprocessor
# Standardise every feature column. The transformer is given all of X's
# columns, so `remainder='passthrough'` has nothing left over to pass through.
scaler = ColumnTransformer(
    [('scaler', StandardScaler(), X.columns)],
    remainder='passthrough',
)


In [None]:
# Display the configured (not yet fitted) preprocessing transformer.
scaler

In [None]:
def objective(trial):
    """Optuna objective: cross-validated weighted F1 of a tree-based pipeline.

    NOTE(review): a later cell in this notebook defines another ``objective``
    (which also searches KNN); when the notebook is run top to bottom that
    later definition replaces this one.

    Parameters
    ----------
    trial
        Optuna trial used to sample the algorithm ("rf" or "dt") and its
        hyperparameters.

    Returns
    -------
    float
        Mean ``f1_weighted`` validation score over 3 folds of
        ``X_train`` / ``y_train``.
    """
    algo = trial.suggest_categorical("algo", ["rf", "dt"])

    # Keyword arguments are evaluated left to right, so hyperparameters are
    # sampled from the trial in the order they are listed.
    if algo == "rf":
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 500),
            max_depth=trial.suggest_int("max_depth", 5, 50),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
            max_features=trial.suggest_categorical("max_features", ['sqrt', 'log2', None]),
            random_state=42,
        )
    elif algo == "dt":
        model = DecisionTreeClassifier(
            max_depth=trial.suggest_int("max_depth", 5, 50),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
            random_state=42,
        )

    # Scaling and SMOTE live inside the pipeline, so they are applied within
    # each cross-validation fit rather than to the full training set up front.
    steps = [
        ('scaler', scaler),
        ('smote', SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)),
        ('classifier', model),
    ]
    scores = cross_validate(
        estimator=Pipeline(steps),
        X=X_train,
        y=y_train,
        cv=3,
        scoring="f1_weighted",
        return_train_score=True,
    )

    # Stored under the key "training error", although the value is the mean
    # training F1 score (higher is better), not an error.
    trial.set_user_attr("training error", scores["train_score"].mean())

    return scores["test_score"].mean()


In [None]:
from sklearn.neighbors import KNeighborsClassifier

def objective(trial):
    """Optuna objective: cross-validated weighted F1 for RF, decision tree or KNN.

    The trial first picks the algorithm, then only that algorithm's
    hyperparameters are sampled. The chosen classifier is wrapped in a
    StandardScaler -> SMOTE -> classifier pipeline and scored with 3-fold
    cross-validation on ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial
        Optuna trial used to sample "algo" and the matching hyperparameters.

    Returns
    -------
    float
        Mean ``f1_weighted`` validation score across the folds.
    """
    algo = trial.suggest_categorical("algo", ["rf", "dt", "knn"])

    # Keyword arguments are evaluated left to right, so hyperparameters are
    # sampled from the trial in the order they are listed.
    if algo == "rf":
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 500),
            max_depth=trial.suggest_int("max_depth", 5, 50),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
            max_features=trial.suggest_categorical("max_features", ['sqrt', 'log2', None]),
            random_state=42,
        )
    elif algo == "dt":
        model = DecisionTreeClassifier(
            max_depth=trial.suggest_int("max_depth", 5, 50),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
            random_state=42,
        )
    elif algo == "knn":
        model = KNeighborsClassifier(
            n_neighbors=trial.suggest_int("n_neighbors", 3, 15),
            weights=trial.suggest_categorical("weights", ["uniform", "distance"]),
            metric=trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        )

    # Scaling and SMOTE live inside the pipeline, so they are applied within
    # each cross-validation fit rather than to the full training set up front.
    steps = [
        ('scaler', StandardScaler()),
        ('smote', SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)),
        ('classifier', model),
    ]
    scores = cross_validate(
        estimator=Pipeline(steps),
        X=X_train,
        y=y_train,
        cv=3,
        scoring="f1_weighted",
        return_train_score=True,
    )

    # Stored under the key "training error", although the value is the mean
    # training F1 score (higher is better), not an error.
    trial.set_user_attr("training error", scores["train_score"].mean())

    return scores["test_score"].mean()


In [None]:
!pip install optuna



In [None]:
import optuna

# Seed the sampler so that re-running the notebook explores the same trials
# and reports the same best parameters; an unseeded study gives different
# results on every run.
sampler = optuna.samplers.TPESampler(seed=42)

# Maximise the value returned by objective() (mean cross-validated weighted F1).
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=20)

print("Best Hyperparameters:", study.best_params)
print("Best Score:", study.best_value)


[I 2025-03-18 15:38:27,904] A new study created in memory with name: no-name-e5fb4e72-39aa-4f6d-ab0e-30d5061f3570
[I 2025-03-18 15:38:29,681] Trial 0 finished with value: 0.5657712764758315 and parameters: {'algo': 'rf', 'n_estimators': 113, 'max_depth': 49, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 0.5657712764758315.
[I 2025-03-18 15:38:29,996] Trial 1 finished with value: 0.4507430972361095 and parameters: {'algo': 'knn', 'n_neighbors': 8, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.5657712764758315.
[I 2025-03-18 15:38:30,136] Trial 2 finished with value: 0.5055906475323573 and parameters: {'algo': 'dt', 'max_depth': 49, 'min_samples_split': 11, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.5657712764758315.
[I 2025-03-18 15:38:30,274] Trial 3 finished with value: 0.515225498280048 and parameters: {'algo': 'dt', 'max_depth': 18, 'min_samples_split': 20, 'min_samples_leaf': 2}. Best is 

Best Hyperparameters: {'algo': 'rf', 'n_estimators': 478, 'max_depth': 28, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best Score: 0.597175809632894


In [None]:
# Extract Best Parameters
# The dict holds the values sampled in objective() for the best trial,
# including the "algo" key that says which classifier was selected.
best_params = study.best_params
best_params

{'algo': 'rf',
 'n_estimators': 277,
 'max_depth': 22,
 'min_samples_split': 8,
 'min_samples_leaf': 1,
 'max_features': 'sqrt'}

In [None]:
# Choose Final Model
# Rebuild the winning estimator from the tuned hyperparameters. The branch is
# selected by the "algo" entry stored in best_params; each branch reads only
# the keys that objective() samples for that algorithm.
# NOTE(review): class_weight='balanced' is set here but was not used while
# tuning in objective(); confirm it is wanted given that SMOTE already
# oversamples the minority classes inside the pipeline.
algo = best_params['algo']
if algo == "rf":
    model = RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        class_weight='balanced',
        random_state=42
    )
elif algo == "dt":
    model = DecisionTreeClassifier(
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        class_weight='balanced',
        random_state=42
    )
elif algo == "knn":
    # Use the tuned values from the study; there is no `trial` object outside
    # objective(), so nothing can be sampled here.
    model = KNeighborsClassifier(
        n_neighbors=best_params['n_neighbors'],
        weights=best_params['weights'],
        metric=best_params['metric']
    )
else:
    # Fail loudly instead of silently reusing a stale `model` from an earlier cell.
    raise ValueError(f"Unexpected algorithm in best_params: {algo!r}")

# Final Pipeline
# Same scale -> SMOTE -> classifier layout that was cross-validated during tuning.
final_pipeline = Pipeline([
    ('scaler', scaler),
    ('smote', SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)),
    ('classifier', model)
])

# Train
final_pipeline.fit(X_train, y_train)

In [None]:
# Predict on test data
y_pred = final_pipeline.predict(X_test)

# Evaluate model
# Accuracy alone is dominated by the frequent quality labels; the per-class
# report below shows how the rare labels fare.
print("Accuracy Score:", accuracy_score(y_test, y_pred))
# zero_division=0: a label that is never predicted has undefined precision.
# Reporting it explicitly as 0 keeps the same numbers without relying on the
# warn-and-substitute default.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Accuracy Score: 0.6375545851528385

Classification Report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         7
           5       0.71      0.77      0.74        97
           6       0.66      0.59      0.62        92
           7       0.47      0.55      0.51        29
           8       0.33      0.33      0.33         3

    accuracy                           0.64       229
   macro avg       0.36      0.37      0.37       229
weighted avg       0.63      0.64      0.63       229



In [None]:
import pickle

# Save the model
# Keep the filename in one place so the confirmation message always names the
# file that was actually written (the message used to say 'final_model.pkl'
# while the file on disk was 'final_model1.pkl').
model_path = 'final_model1.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(final_pipeline, file)
print(f"Model saved as '{model_path}'")


Model saved as 'final_model.pkl'


In [None]:
# Load the saved model
# NOTE: unpickling can execute code embedded in the file, so only load
# pickles that come from a trusted source such as this notebook.
with open('final_model1.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Take the first ten held-out rows and their true labels as a quick sanity check
sample_data = X_test.head(10)
actual_data = y_test.head(10)
print("Sample Data:", sample_data, sep="\n")
print("\nActual Data:", actual_data, sep="\n")

# Make predictions using the loaded model
predictions = loaded_model.predict(sample_data)

print("Predictions on sample data:", predictions)


Sample Data:
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
96              5.2              0.34         0.00             1.8      0.050   
442             8.8              0.60         0.29             2.2      0.098   
872             7.6              0.43         0.29             2.1      0.075   
764             6.8              0.48         0.08             1.8      0.074   
570             7.7              0.60         0.00             2.6      0.055   
1003            6.4              0.47         0.40             2.4      0.071   
1035            6.5              0.90         0.00             1.6      0.052   
264             9.1              0.28         0.48             1.8      0.067   
391            10.6              0.31         0.49             2.5      0.067   
876             8.1              0.73         0.00             2.5      0.081   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
96                

In [None]:
# Show the input feature names exposed by the fitted pipeline
# (it was fitted on X_train above).
final_pipeline.feature_names_in_