In [1]:
pip install textblob




In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('WineQT.csv')

In [4]:

print("Dataset shape:", data.shape)
print("\nFirst few rows:")
print(data.head())
print("\nColumn names:")
print(data.columns.tolist())
print("\nQuality distribution:")
print(data['quality'].value_counts().sort_index())

Dataset shape: (1143, 13)

First few rows:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  Id  
0      9.4       

In [5]:
X = data.drop(['quality', 'Id'], axis=1)
y = data['quality']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)

In [9]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.70

Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.72      0.80      0.76        96
           6       0.68      0.67      0.67        99
           7       0.72      0.69      0.71        26
           8       0.00      0.00      0.00         2

    accuracy                           0.70       229
   macro avg       0.42      0.43      0.43       229
weighted avg       0.68      0.70      0.69       229



In [10]:
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
                 feature  importance
10               alcohol    0.146276
9              sulphates    0.119159
1       volatile acidity    0.110288
6   total sulfur dioxide    0.101212
7                density    0.090431
4              chlorides    0.082482
0          fixed acidity    0.073160
2            citric acid    0.072021
8                     pH    0.070924
5    free sulfur dioxide    0.068906
3         residual sugar    0.065142


In [11]:
def predict_wine_quality(features):
    """
    Predict wine quality based on input features
    
    Parameters:
    features: list or array with values for:
    [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides,
     free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol]
    """
    features_array = np.array(features).reshape(1, -1)
    prediction = model.predict(features_array)
    return prediction[0]

In [12]:
example_wine = [7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4]
predicted_quality = predict_wine_quality(example_wine)
print(f"\nExample prediction - Predicted quality: {predicted_quality}")


Example prediction - Predicted quality: 5
