In [1]:
!pip install --force-reinstall --no-cache-dir numpy==1.23.5 scipy==1.10.1

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting scipy==1.10.1
  Downloading scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m146.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.1/34.1 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scipy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstall

In [1]:
!pip install gensim==4.3.1
!pip install datasets

Collecting gensim==4.3.1
  Downloading gensim-4.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.3.1
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-a

In [2]:
import pandas as pd

balanced_wine_reviews = pd.read_csv('balanced_wine_reviews.csv')
balanced_wine_reviews.head()

Unnamed: 0,variety,description
0,Bordeaux-style Red Blend,This structured wine has 25-year old vines as ...
1,Bordeaux-style Red Blend,This is a great wine. It has all the elements ...
2,Bordeaux-style Red Blend,"This is a fruity, Cabernet Sauvignon-dominated..."
3,Bordeaux-style Red Blend,Merlot makes up three quarters of this wine wi...
4,Bordeaux-style Red Blend,"This wine shows extraction, new wood and firm ..."


## Word2Vec

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

balanced_wine_reviews["tokens"] = balanced_wine_reviews["description"].apply(tokenize_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
from gensim.models import Word2Vec
SEED=42
# Train Word2Vec on your tokenized corpus
w2v_model = Word2Vec(
    sentences=balanced_wine_reviews["tokens"].tolist(),
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    seed=SEED
)

In [5]:
import numpy as np

def get_avg_embedding(tokens, model, vector_size=100):
    valid_words = [word for word in tokens if word in model.wv]
    if not valid_words:
        return np.zeros(vector_size)
    return np.mean([model.wv[word] for word in valid_words], axis=0)

# Create embedding matrix
X_vectors = np.vstack([
    get_avg_embedding(tokens, w2v_model, vector_size=100)
    for tokens in balanced_wine_reviews["tokens"]
])

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    balanced_wine_reviews["variety"],
    test_size=0.2,
    random_state=SEED,
    stratify=balanced_wine_reviews["variety"]
)

clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.52      0.58      0.55       200
      Cabernet Sauvignon       0.49      0.36      0.42       200
              Chardonnay       0.52      0.54      0.53       200
                  Merlot       0.40      0.30      0.34       200
              Pinot Noir       0.38      0.34      0.36       200
               Red Blend       0.48      0.50      0.49       200
                Riesling       0.53      0.67      0.59       200
                    Rosé       0.52      0.55      0.53       200
         Sauvignon Blanc       0.48      0.42      0.45       200
                   Syrah       0.39      0.47      0.42       200

                accuracy                           0.47      2000
               macro avg       0.47      0.47      0.47      2000
            weighted avg       0.47      0.47      0.47      2000

