# Word Embeddings
    More details in the official documentation: https://radimrehurek.com/gensim/auto_examples/index.html#documentation

In [1]:
# !pip install gensim --upgrade
# !pip install numpy --upgrade

In [2]:
import gensim
print(gensim.__version__)

4.1.2


In [3]:
from gensim import downloader
import numpy as np

## Loading The Pretrained Weights
Supported options are at https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

In [4]:
WORD_2_VEC_PATH = 'word2vec-google-news-300'
GLOVE_PATH = 'glove-twitter-200'

In [5]:
glove = downloader.load(GLOVE_PATH)

## Using The Pre-Trained Vecotors

In [6]:
sen = "i am a student at the technion"
representation = []
for word in sen.split():
    if word not in glove.key_to_index:
        print(f"{word} not an existing word in the model")
        continue
    vec = glove[word]
    representation.append(vec)
representation = np.asarray(representation)
print(representation.shape)

technion not an existing word in the model
(6, 200)


# Training A Model

In [7]:
import re

In [8]:
TEXT_PATH = 'Alice_book'
with open(TEXT_PATH, 'r', encoding='utf-8') as f:
    sentences = f.readlines()
sentences = [sen.strip().lower() for sen in sentences]
sentences = [sen.split() for sen in sentences if sen]
sentences = [[re.sub(r'\W+', '', w) for w in sen] for sen in sentences]
sentences[0:2]

[['chapter', 'i', 'down', 'the', 'rabbithole'],
 ['alice',
  'was',
  'beginning',
  'to',
  'get',
  'very',
  'tired',
  'of',
  'sitting',
  'by',
  'her',
  'sister',
  'on',
  'the']]

In [9]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=sentences, vector_size=10, window=2, min_count=1, workers=4, epochs=100, seed=42)
model.save("word2vec.model")

In [10]:
sims = model.wv.most_similar('alice', topn=5)
sims

[('so', 0.8331175446510315),
 ('very', 0.8064254522323608),
 ('late', 0.789730966091156),
 ('mouse', 0.7821938395500183),
 ('yet', 0.7783822417259216)]

In [11]:
sims = model.wv.most_similar('annoyed', topn=5)
sims

[('meanwhile', 0.9836915135383606),
 ('leaders', 0.9790331125259399),
 ('expected', 0.8927791714668274),
 ('nothing', 0.8861377835273743),
 ('hasnt', 0.8848412036895752)]

In [12]:
sims = glove.most_similar('annoyed', topn=5)
sims

[('pissed', 0.8485370874404907),
 ('irritated', 0.8377761840820312),
 ('frustrated', 0.7810536026954651),
 ('annoying', 0.757415771484375),
 ('upset', 0.7419010996818542)]

# Some Nice Properties

In [13]:
glove.most_similar('program', topn=5)

[('programs', 0.6853476762771606),
 ('seminar', 0.6410128474235535),
 ('training', 0.6214897036552429),
 ('workshop', 0.591772735118866),
 ('system', 0.5909943580627441)]

In [14]:
glove.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.6820898056030273),
 ('prince', 0.5875527262687683),
 ('princess', 0.5620488524436951),
 ('royal', 0.5522865653038025),
 ('mother', 0.5362966656684875)]

In [15]:
glove.most_similar(positive=['paris','germany'], negative=['berlin'], topn = 5)

[('france', 0.7369073033332825),
 ('spain', 0.6768407225608826),
 ('portugal', 0.6567486524581909),
 ('italy', 0.6421884298324585),
 ('denmark', 0.6146384477615356)]

In [16]:
glove.most_similar(positive=['walking','swam'], negative=['swimming'], topn = 5)

[('walked', 0.5864155888557434),
 ('drove', 0.5215498805046082),
 ('ran', 0.5134605169296265),
 ('sprinted', 0.4759795069694519),
 ('stood', 0.47308677434921265)]

## Sloving NLP Problems - Sentiment Analysis

In [17]:
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
class SentimentDataSet(Dataset):

    def __init__(self, file_path, vector_type, tokenizer=None):
        self.file_path = file_path
        data = pd.read_csv(self.file_path)
        data['label'] = data['label'].replace({'Positive': 1, 'Negative': 0})
        self.sentences = data['reviewText'].tolist()
        self.labels = data['label'].tolist()
        self.tags_to_idx = {tag: idx for idx, tag in enumerate(sorted(list(set(self.labels))))}
        self.idx_to_tag = {idx: tag for tag, idx in self.tags_to_idx.items()}
        self.vector_type = vector_type
        if vector_type == 'tf-idf':
            if tokenizer is None:
                self.tokenizer = TfidfVectorizer(lowercase=True, stop_words=None)
                self.tokenized_sen = self.tokenizer.fit_transform(self.sentences)
            else:
                self.tokenizer = tokenizer
                self.tokenized_sen = self.tokenizer.transform(self.sentences)
            self.vector_dim = len(self.tokenizer.vocabulary_)
        else:
            if vector_type == 'w2v':
                model = downloader.load(WORD_2_VEC_PATH)
            elif vector_type == 'glove':
                model = downloader.load(GLOVE_PATH)
            else:
                raise KeyError(f"{vector_type} is not a supported vector type")
            representation, labels = [], []
            for sen, cur_labels in zip(self.sentences, self.labels):
                cur_rep = []
                for word in sen.split():
                    word = re.sub(r'\W+', '', word.lower())
                    if word not in model.key_to_index:
                        continue
                    vec = model[word]
                    cur_rep.append(vec)
                if len(cur_rep) == 0:
                    print(f'Sentence {sen} cannot be represented!')
                    continue
                cur_rep = np.stack(cur_rep).mean(axis=0)  # HW TODO: change to token level classification
                representation.append(cur_rep)
                labels.append(cur_labels)
            self.labels = labels
            representation = np.stack(representation)
            self.tokenized_sen = representation
            self.vector_dim = representation.shape[-1]

    def __getitem__(self, item):
        cur_sen = self.tokenized_sen[item]
        if self.vector_type == 'tf-idf':
            cur_sen = torch.FloatTensor(cur_sen.toarray()).squeeze()
        else:
            cur_sen = torch.FloatTensor(cur_sen).squeeze()
        label = self.labels[item]
        label = self.tags_to_idx[label]
        data = {"input_ids": cur_sen, "labels": label}
        return data

    def __len__(self):
        return len(self.labels)

In [19]:
from torch import nn


class SentimentNN(nn.Module):

    def __init__(self, vec_dim, num_classes, hidden_dim=100):
        super(SentimentNN, self).__init__()
        self.first_layer = nn.Linear(vec_dim, hidden_dim)
        self.second_layer = nn.Linear(hidden_dim, num_classes)
        self.activation = nn.ReLU()
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        x = self.first_layer(input_ids)
        x = self.activation(x)
        x = self.second_layer(x)
        if labels is None:
            return x, None
        loss = self.loss(x, labels)
        return x, loss

In [20]:
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader


def train(model, data_sets, optimizer, num_epochs: int, batch_size=16):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_loaders = {"train": DataLoader(data_sets["train"], batch_size=batch_size, shuffle=True),
                    "test": DataLoader(data_sets["test"], batch_size=batch_size, shuffle=False)}
    model.to(device)

    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print('-' * 10)

        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            labels, preds = [], []

            for batch in data_loaders[phase]:
                batch_size = 0
                for k, v in batch.items():
                    batch[k] = v.to(device)
                    batch_size = v.shape[0]

                optimizer.zero_grad()
                if phase == 'train':
                    outputs, loss = model(**batch)
                    loss.backward()
                    optimizer.step()
                else:
                    with torch.no_grad():
                        outputs, loss = model(**batch)
                pred = outputs.argmax(dim=-1).clone().detach().cpu()
                labels += batch['labels'].cpu().view(-1).tolist()
                preds += pred.view(-1).tolist()
                running_loss += loss.item() * batch_size

            epoch_loss = running_loss / len(data_sets[phase])
            epoch_acc = accuracy_score(labels, preds)

            epoch_acc = round(epoch_acc, 5)

            if phase.title() == "test":
                print(f'{phase.title()} Loss: {epoch_loss:.4e} Accuracy: {epoch_acc}')
            else:
                print(f'{phase.title()} Loss: {epoch_loss:.4e} Accuracy: {epoch_acc}')
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                with open('model.pkl', 'wb') as f:
                    torch.save(model, f)
        print()

    print(f'Best Validation Accuracy: {best_acc:4f}')


In [21]:
from torch.optim import Adam

### TF-IDF Model

In [22]:
from sklearn.model_selection import train_test_split

train_ds = SentimentDataSet('amazon_sa/train.csv', vector_type='tf-idf')
print('created train')
test_ds = SentimentDataSet('amazon_sa/test.csv', vector_type='tf-idf', tokenizer=train_ds.tokenizer)
datasets = {"train": train_ds, "test": test_ds}
nn_model = SentimentNN(num_classes=2, vec_dim=train_ds.vector_dim)
optimizer = Adam(params=nn_model.parameters())
train(model=nn_model, data_sets=datasets, optimizer=optimizer, num_epochs=5)

created train
Epoch 1/5
----------
Train Loss: 3.4617e-01 Accuracy: 0.84957
Test Loss: 3.0488e-01 Accuracy: 0.86907

Epoch 2/5
----------
Train Loss: 2.0458e-01 Accuracy: 0.92113
Test Loss: 3.4951e-01 Accuracy: 0.85601

Epoch 3/5
----------
Train Loss: 1.3632e-01 Accuracy: 0.95003
Test Loss: 4.0333e-01 Accuracy: 0.84577

Epoch 4/5
----------
Train Loss: 8.8921e-02 Accuracy: 0.97016
Test Loss: 5.2375e-01 Accuracy: 0.84227

Epoch 5/5
----------
Train Loss: 5.4734e-02 Accuracy: 0.98208
Test Loss: 6.1827e-01 Accuracy: 0.83796

Best Validation Accuracy: 0.869070


### Word to Vec Model

In [23]:
train_ds = SentimentDataSet('amazon_sa/train.csv', vector_type='w2v')
print('created train')
test_ds = SentimentDataSet('amazon_sa/test.csv', vector_type='w2v')
datasets = {"train": train_ds, "test": test_ds}
nn_model = SentimentNN(num_classes=2, vec_dim=train_ds.vector_dim)
optimizer = Adam(params=nn_model.parameters())
train(model=nn_model, data_sets=datasets, optimizer=optimizer, num_epochs=5)

Sentence ngtkl;jrgsh'tldfk lsdfmkhgk lknfgh lkjfgnh'lk kdfngh'lkn sfdngylekn lkjtdhlk lkthtk lksdhyklt klfntuhl;rkj lskdfhlk;j lkrftghlk lk'srthlk sfkhlk lkngtrhlk fghklnlk sgkjhbkj lkrlkhk rlkhjlk sdflghkj cannot be represented!
Sentence vjnmhrg;h lndsfg;okhi d;jfnhgtoeruihj lkjsdlfio jajghoshi khsdtrhjo'i jlakhdgrkjh klagtrjkl'j lkdsnrtylorhjo ijlektj blkjtrl'ykhje'kj lknsfhytrlkhn ljknsghjytr'lkj lkjstl'rkyjl'kj lklgtkjhytrlj lkjtrylkrjlkj ltkjyrkjd lkftrylkrjl jlkjtrylrkgj fhj cannot be represented!
created train
Sentence Zzzzzzzzzzz! cannot be represented!
Epoch 1/5
----------
Train Loss: 3.9117e-01 Accuracy: 0.81602
Test Loss: 3.6184e-01 Accuracy: 0.83268

Epoch 2/5
----------
Train Loss: 3.4927e-01 Accuracy: 0.84188
Test Loss: 3.5507e-01 Accuracy: 0.84036

Epoch 3/5
----------
Train Loss: 3.4362e-01 Accuracy: 0.8437
Test Loss: 3.5076e-01 Accuracy: 0.84184

Epoch 4/5
----------
Train Loss: 3.4036e-01 Accuracy: 0.84646
Test Loss: 3.4979e-01 Accuracy: 0.84373

Epoch 5/5
----------


### GloVe Model


In [24]:
train_ds = SentimentDataSet('amazon_sa/train.csv', vector_type='glove')
print('created train')
test_ds = SentimentDataSet('amazon_sa/test.csv', vector_type='glove')
datasets = {"train": train_ds, "test": test_ds}
nn_model = SentimentNN(num_classes=2, vec_dim=train_ds.vector_dim)
optimizer = Adam(params=nn_model.parameters())
train(model=nn_model, data_sets=datasets, optimizer=optimizer, num_epochs=5)

Sentence ngtkl;jrgsh'tldfk lsdfmkhgk lknfgh lkjfgnh'lk kdfngh'lkn sfdngylekn lkjtdhlk lkthtk lksdhyklt klfntuhl;rkj lskdfhlk;j lkrftghlk lk'srthlk sfkhlk lkngtrhlk fghklnlk sgkjhbkj lkrlkhk rlkhjlk sdflghkj cannot be represented!
created train
Epoch 1/5
----------
Train Loss: 3.8406e-01 Accuracy: 0.8236
Test Loss: 3.7604e-01 Accuracy: 0.82907

Epoch 2/5
----------
Train Loss: 3.5778e-01 Accuracy: 0.83647
Test Loss: 3.6485e-01 Accuracy: 0.83432

Epoch 3/5
----------
Train Loss: 3.5267e-01 Accuracy: 0.84047
Test Loss: 3.6361e-01 Accuracy: 0.83688

Epoch 4/5
----------
Train Loss: 3.4851e-01 Accuracy: 0.84094
Test Loss: 3.6156e-01 Accuracy: 0.83621

Epoch 5/5
----------
Train Loss: 3.4593e-01 Accuracy: 0.84347
Test Loss: 3.6611e-01 Accuracy: 0.83446

Best Validation Accuracy: 0.836880
