# Predict wine quality from the description with TF-IDF


## Load data

In [1]:
import pandas as pd

# Location of the wine-review dataset, relative to the working directory.
DATA_PATH = 'data/winemag-data-130k-v2.csv'

# Read the whole CSV into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [2]:
# Derive a binary target from the score: points above 89 are 'good',
# everything else (including missing scores) is 'bad'.
is_good = df['points'] > 89
df['label'] = is_good.map({True: 'good', False: 'bad'})

## Modelling

In [3]:
from sklearn.model_selection import train_test_split

# Features are the description text; the target is the label column.
X, y = df['description'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Chain TF-IDF vectorisation with a multinomial naive Bayes classifier,
# so the pipeline takes raw text and outputs labels.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('model', MultinomialNB()),
]
pipe = Pipeline(pipeline_steps)

# fit() returns the pipeline itself, so `predictor` and `pipe` are the same fitted object.
pipe.fit(X_train, y_train)
predictor = pipe

In [5]:
from sklearn import metrics

# Predict once on the held-out set. The original ran pipe.predict(X_test) twice
# and never used the first result; `predictions` is kept as an alias of `y_pred`
# in case a later cell refers to it.
y_pred = pipe.predict(X_test)
predictions = y_pred

# Per-class precision/recall/F1, followed by overall accuracy.
print(metrics.classification_report(y_test, y_pred))
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

              precision    recall  f1-score   support

         bad       0.80      0.92      0.85     16201
        good       0.82      0.62      0.71      9794

    accuracy                           0.81     25995
   macro avg       0.81      0.77      0.78     25995
weighted avg       0.81      0.81      0.80     25995

Accuracy: 0.8061165608770917


An accuracy of about 0.81 looks quite good, although recall for the `good` class is only 0.62. Let's see if we can do better with a word embedding.