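# Wine classification model

Predicts a wine review's `points` score from its text `description`, using TF-IDF features and a Multinomial Naive Bayes classifier.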

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

## Load and read data

In [2]:
# Load the dataset. If the default parse fails on malformed rows, retry and
# skip the rows that cannot be parsed.
try:
    data = pd.read_csv("../winemag-data.csv")
except pd.errors.ParserError:
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines="skip"` is the supported replacement.
    data = pd.read_csv("../winemag-data.csv", on_bad_lines="skip")

## Drop rows with missing values in the 'description' and 'points' columns

In [4]:

# Keep only rows that have both columns the model relies on.
required_columns = ['description', 'points']
data = data.dropna(subset=required_columns)

## Split the data into features (X) and target variable (y)

In [5]:

# Model input is the 'description' column; the label is the 'points' column.
X, y = data['description'], data['points']

## Split the data into training and testing sets

In [6]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Convert text data into numerical features using TF-IDF vectorization

In [7]:
# Upper bound on the vocabulary size kept by the vectorizer.
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
# Learn the vocabulary and IDF weights from the training text only, then
# encode the test text with that same fitted vectorizer.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

## Train a Multinomial Naive Bayes classifier

In [8]:

# Fit a Multinomial Naive Bayes model on the TF-IDF training matrix.
clf = MultinomialNB()
clf.fit(X=X_train_vectorized, y=y_train)


## Evaluate model

In [10]:
# Predict a label for every held-out row.
y_pred = clf.predict(X_test_vectorized)

# Overall exact-match accuracy across all 'points' classes.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision/recall/F1. Some classes are never predicted, which makes
# their precision undefined; zero_division=0 reports those as 0.0 explicitly
# instead of emitting an UndefinedMetricWarning for each affected metric.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.21834968263127524

Classification Report:
              precision    recall  f1-score   support

          80       0.00      0.00      0.00        76
          81       0.00      0.00      0.00       134
          82       0.25      0.01      0.01       397
          83       0.26      0.03      0.05       615
          84       0.22      0.11      0.15      1306
          85       0.21      0.11      0.14      1908
          86       0.20      0.14      0.17      2466
          87       0.23      0.44      0.30      3407
          88       0.22      0.41      0.29      3388
          89       0.26      0.08      0.12      2504
          90       0.21      0.40      0.27      3022
          91       0.21      0.15      0.18      2229
          92       0.21      0.13      0.16      1932
          93       0.26      0.05      0.08      1314
          94       0.40      0.01      0.01       795
          95       0.00      0.00      0.00       311
          96       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
