# Training word vectors using Skip Gram with negative sampling

In [2]:
import torch
import torch.nn as nn
import numpy as np
import csv
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re
from torchtext.vocab import build_vocab_from_iterator
# import scipy sparse matrix to use scipy.sparse.linalg.svds
import scipy.sparse as sp
import scipy.sparse.linalg as linalg
from tqdm import tqdm
from preprocess import Preprocess

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home2/sanika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home2/sanika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# load textual data from csv file as a list of strings
def load_data(file_path, encoding=None):
    """Return the second column of a CSV file as a list of strings.

    The first row is treated as a header and skipped. Completely blank
    lines (which ``csv.reader`` yields as empty rows) are ignored, and an
    empty file yields an empty list instead of raising.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what this function used before the
            parameter existed.

    Returns:
        One string per data row, taken from column index 1.

    Raises:
        IndexError: If a non-empty data row has fewer than two columns.
    """
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(file_path, 'r', newline='', encoding=encoding) as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        return [row[1] for row in reader if row]

In [5]:
# Read the second column of the training CSV (header row dropped) into a list of strings.
data = load_data('./ANLP-2/train.csv')

In [6]:
# Run the project's preprocessing pipeline on the raw strings.
# NOTE(review): judging by the names, this yields index-encoded sentences, the
# vocabulary and the tokenized sentences; the meaning of the `True` and `1`
# arguments is defined in preprocess.Preprocess - confirm there.
indexed_data, vocab, tokenized_data = Preprocess(data, True, 1)()

In [7]:
# make pairs of positive and negative samples
def make_pairs(data, window_size):
    """Build (center, context) pairs for every token of every sentence.

    For each position in a sentence, every other position at most
    ``window_size`` steps away (and inside the sentence) contributes one
    pair. Pairs are emitted in sentence order, then center order, then
    left-to-right context order; duplicates are kept.

    Args:
        data: Iterable of indexable sentences (sequences of token ids).
        window_size: Number of neighbours considered on each side.

    Returns:
        List of ``(center_token, context_token)`` tuples.
    """
    pairs = []
    for sentence in tqdm(data):
        length = len(sentence)
        for center, token in enumerate(sentence):
            # Clamp the window to the sentence instead of testing each index.
            first = max(center - window_size, 0)
            stop = min(center + window_size + 1, length)
            pairs.extend(
                (token, sentence[pos])
                for pos in range(first, stop)
                if pos != center
            )
    return pairs

def make_negative_pairs(data, vocab, num_negative_samples):
    """Pair every token with uniformly random vocabulary indices.

    Each token of each sentence is paired with ``num_negative_samples``
    indices drawn uniformly from ``range(len(vocab))``. The draws are not
    checked against true context words or against the token itself; callers
    are expected to filter out pairs that are actually positive.

    Args:
        data: Iterable of sentences (sequences of token ids).
        vocab: Sized vocabulary; only ``len(vocab)`` is used.
        num_negative_samples: Number of random draws per token.

    Returns:
        List of ``(token, random_index)`` tuples, ``num_negative_samples``
        consecutive entries per token, in sentence/token order. The random
        indices are plain Python ints.
    """
    pairs = []
    if num_negative_samples <= 0:
        return pairs

    vocab_size = len(vocab)
    for sentence in tqdm(data):
        if len(sentence) == 0:
            continue
        # One vectorised draw per sentence instead of one NumPy call per pair;
        # .tolist() converts the values back to Python ints.
        draws = np.random.randint(
            vocab_size, size=(len(sentence), num_negative_samples)
        ).tolist()
        for token, negatives in zip(sentence, draws):
            pairs.extend((token, negative) for negative in negatives)
    return pairs

# concatenate and add output labels
def make_dataset(data, vocab, window_size, num_negative_samples, negative_ratio=3):
    """Build de-duplicated positive pairs and a random sample of negative pairs.

    Positive pairs come from ``make_pairs``; negative candidates come from
    ``make_negative_pairs``. Both are de-duplicated, and any candidate that
    also occurs as a positive pair is discarded. From the remaining
    candidates, ``negative_ratio`` negatives per positive pair are sampled
    without replacement.

    Args:
        data: Iterable of sentences (sequences of token ids).
        vocab: Sized vocabulary used for negative sampling.
        window_size: Context window size forwarded to ``make_pairs``.
        num_negative_samples: Random draws per token forwarded to
            ``make_negative_pairs``.
        negative_ratio: Desired number of negative pairs per positive pair.
            Defaults to 3, the factor that was previously hard-coded.

    Returns:
        Tuple ``(positive_pairs, negative_pairs)`` of lists of 2-tuples.
    """
    import warnings

    positive_set = set(make_pairs(data, window_size))
    positive_pairs = list(positive_set)

    # Unique candidates that never occur as a real (center, context) pair.
    candidate_set = set(make_negative_pairs(data, vocab, num_negative_samples))
    negative_candidates = list(candidate_set - positive_set)

    wanted = int(negative_ratio * len(positive_pairs))
    # Sampling without replacement cannot return more items than exist, so
    # clamp instead of letting np.random.choice raise ValueError.
    sample_size = min(wanted, len(negative_candidates))
    if sample_size < wanted:
        warnings.warn(
            f'Only {len(negative_candidates)} negative candidates available; '
            f'requested {wanted}. Increase num_negative_samples to get more.'
        )
    if sample_size <= 0:
        return positive_pairs, []

    indices = np.random.choice(len(negative_candidates), sample_size, replace=False)
    negative_pairs = [negative_candidates[i] for i in indices]

    return positive_pairs, negative_pairs



In [8]:
# Context window of 2 tokens on each side; 5 random negative draws per token.
positive_pairs, negative_pairs = make_dataset(indexed_data, vocab, 2, 5)

  3%|▎         | 3892/120000 [00:00<00:06, 19140.32it/s]

100%|██████████| 120000/120000 [00:05<00:00, 20862.52it/s]
100%|██████████| 120000/120000 [01:09<00:00, 1736.21it/s]
100%|██████████| 13027893/13027893 [00:09<00:00, 1405263.01it/s]


In [9]:
# Number of positive pairs returned by make_dataset.
len(positive_pairs)

3235938

In [10]:
# create a dataset and dataloader
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of labelled (target, context) pairs.

    Positive pairs come first and carry label 1; negative pairs follow and
    carry label 0. Items are returned as plain Python objects and left for
    the DataLoader's collate function to batch.
    """

    def __init__(self, positive_pairs, negative_pairs):
        """Store both pair lists and build the combined pair/label lists."""
        self.positive_pairs = positive_pairs
        self.negative_pairs = negative_pairs
        n_positive = len(positive_pairs)
        n_negative = len(negative_pairs)
        self.pairs = positive_pairs + negative_pairs
        self.labels = [1] * n_positive + [0] * n_negative

    def __len__(self):
        """Return the total number of pairs (positive plus negative)."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return ``(pair, label)`` for the given position."""
        pair = self.pairs[index]
        label = self.labels[index]
        return pair, label

In [11]:
# Combine both pair lists into one labelled dataset (1 = positive, 0 = negative).
dataset = Dataset(positive_pairs, negative_pairs)

In [12]:
# Batch the dataset in groups of 256 with shuffling enabled. The loader is stored
# under the 'train' key because Skip_Gram.fit reads loaders['train'].
loaders = {
    'train': torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True)
}

In [13]:
# At this point the indexed data, the vocabulary and the positive/negative
# pairs are available. The model below learns embeddings by classifying each
# (target, context) pair as a real co-occurrence (1) or a sampled negative (0).

class Skip_Gram(nn.Module):
    """Binary classifier over (target, context) index pairs.

    Target and context words have separate embedding tables. The two
    embeddings of a pair are concatenated and passed through a small
    feed-forward network (Linear -> ReLU -> Linear -> Sigmoid) that outputs
    the probability that the pair is a positive example.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding tables and the classifier head.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_dim: Size of each embedding vector.
        """
        super().__init__()
        self.target_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim * 2, embedding_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, target, context):
        """Return the positive-pair probability for each (target, context) pair.

        Args:
            target: 1-D tensor of target word indices.
            context: 1-D tensor of context word indices, same length.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        target_embed = self.target_embedding(target)
        context_embed = self.context_embedding(context)
        # Concatenate along the feature dimension: (batch, 2 * embedding_dim).
        embed = torch.cat((target_embed, context_embed), 1)
        out = self.fc(embed)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

    def fit(self, loaders, optimizer, criterion, n_epochs, device):
        """Train the model and print loss and accuracy after every epoch.

        Args:
            loaders: Mapping with a ``'train'`` DataLoader whose batches are
                ``((targets, contexts), labels)``.
            optimizer: Optimizer built over this model's parameters.
            criterion: Loss taking ``(probabilities, float_labels)``.
            n_epochs: Number of passes over the training loader.
            device: Device the model and batches are moved to.
        """
        self.to(device)

        for epoch in range(n_epochs):
            self.train()
            total_loss = 0
            correct = 0
            for data, labels in loaders['train']:
                optimizer.zero_grad()
                target = data[0].to(device)
                context = data[1].to(device)
                # Drop only the trailing singleton dimension. A bare squeeze()
                # would also collapse a batch of size 1 to a 0-d tensor, which
                # no longer matches the label shape and makes the loss raise.
                output = self(target, context).squeeze(-1)
                labels = labels.to(device, dtype=torch.float32)
                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                # Threshold probabilities at 0.5 and count matches with the labels.
                correct += torch.sum((output > 0.5) == labels).item()
            print(f'Epoch: {epoch+1}/{n_epochs}, Loss: {total_loss/len(loaders["train"])}')
            print(f'Accuracy: {correct/len(loaders["train"].dataset)}')
                

In [16]:
# 300-dimensional embeddings with one row per vocabulary entry.
model = Skip_Gram(len(vocab), 300)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# BCELoss takes probabilities, which matches the sigmoid output of Skip_Gram.forward.
criterion = nn.BCELoss()
n_epochs = 5

# Train on loaders['train'] using the device selected earlier in the notebook.
model.fit(loaders, optimizer, criterion, n_epochs, device)

Epoch: 1/5, Loss: 0.317218183780918
Accuracy: 0.8692236995888055
Epoch: 2/5, Loss: 0.30238448707185345
Accuracy: 0.8745051666626493
Epoch: 3/5, Loss: 0.296065924534819
Accuracy: 0.8776670010364847
Epoch: 4/5, Loss: 0.2891443773657926
Accuracy: 0.8811608875077335
Epoch: 5/5, Loss: 0.2832462140417006
Accuracy: 0.8838778740507389


In [17]:
# Collect the trained target embeddings as {vocab index: NumPy vector on the CPU}.
embeddings_dict = {
    idx: row.detach().cpu().numpy()
    for idx, row in enumerate(model.target_embedding.weight)
}


In [18]:
# Save the index -> embedding mapping with torch.save. The vocabulary is written
# to 'skip-gram-vocab.pt', so the embeddings need their own file name; sharing
# that path would let the vocabulary save overwrite the embeddings.
torch.save(embeddings_dict, 'skip-gram-embeddings.pt')

In [19]:
# Persist the vocabulary object to disk with torch.save.
torch.save(vocab, 'skip-gram-vocab.pt')