# Predict the reviewer of a wine review from its description using TF-IDF

## Load and prepare data

In [1]:
import pandas as pd

# Load the wine-review CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/winemag-data-130k-v2.csv')

In [2]:
# Derive a binary label from the score: more than 89 points -> 'good', anything else -> 'bad'.
df['label'] = df['points'].gt(89).map({True: 'good', False: 'bad'})

In [3]:
# Count reviews per taster (largest count first) to identify the two most frequent reviewers.
df['taster_name'].value_counts()

Roger Voss            25514
Michael Schachner     15134
Kerin O’Keefe         10776
Virginie Boone         9537
Paul Gregutt           9532
Matt Kettmann          6332
Joe Czerwinski         5147
Sean P. Sullivan       4966
Anna Lee C. Iijima     4415
Jim Gordon             4177
Anne Krebiehl MW       3685
Lauren Buzzeo          1835
Susan Kostrzewa        1085
Mike DeSimone           514
Jeff Jenssen            491
Alexander Peartree      415
Carrie Dykes            139
Fiona Adams              27
Christina Pickard         6
Name: taster_name, dtype: int64

In [4]:
# Keep only the reviews whose taster is Roger Voss or Michael Schachner.
filtered_df = df[df['taster_name'].isin(["Roger Voss", "Michael Schachner"])]

# Show the remaining taster column as a quick check of the filter.
filtered_df['taster_name']

1                Roger Voss
5         Michael Schachner
7                Roger Voss
9                Roger Voss
11               Roger Voss
                ...        
129964           Roger Voss
129965           Roger Voss
129968           Roger Voss
129969           Roger Voss
129970           Roger Voss
Name: taster_name, Length: 40648, dtype: object

In [5]:
from sklearn.model_selection import train_test_split

# Features are the review descriptions; the target is the taster name of each review.
X, y = filtered_df["description"], filtered_df["taster_name"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Two-step pipeline: TF-IDF vectorization of the text, then a multinomial naive Bayes classifier.
pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("model", MultinomialNB()),
    ]
)
# Pipeline.fit returns the pipeline itself, so `predictor` is the same fitted object as `pipe`.
predictor = pipe.fit(X_train, y_train)

In [7]:
from sklearn import metrics

# Run inference on the held-out set once and reuse the result. `predictions`
# is kept as an alias of `y_pred` so any cell that refers to it still works.
y_pred = pipe.predict(X_test)
predictions = y_pred

# Per-class precision/recall/F1, followed by the overall accuracy printed
# without the rounding applied in the report table.
print(metrics.classification_report(y_test, y_pred))
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

                   precision    recall  f1-score   support

Michael Schachner       1.00      1.00      1.00      3017
       Roger Voss       1.00      1.00      1.00      5113

         accuracy                           1.00      8130
        macro avg       1.00      1.00      1.00      8130
     weighted avg       1.00      1.00      1.00      8130

Accuracy: 0.9986469864698647
