In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd drive/MyDrive/NLP

/content/drive/MyDrive/NLP


In [3]:
import numpy as np
import pandas as pd
import torch

import string
import os
import pickle
import torch.nn as nn
import torch.nn.functional as F
import pickle
import gc
import warnings
import time

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from glob import glob
from tqdm import tqdm, trange
from torch import optim
from collections import Counter, defaultdict
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings('ignore')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
lem = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.update(list(string.punctuation)) # punctutation
stop_words.update(["''", "``"]) # extra based on corpus

In [9]:
txt_dir='/content/drive/MyDrive/NLP/text'

In [None]:
txt_files = glob(os.path.join(f"{txt_dir}/", "*.txt"))
print(txt_files)

In [None]:
for text in txt_files:
    with open(text,'r+') as f :
        lines,res = f.readlines(),[]
        for line in tqdm(lines, total = len(lines)):
            s_line = sent_tokenize(line)
            for l in s_line : 
                res.append(" ".join([lem.lemmatize(w.lower(), 'v') for w in word_tokenize(l) if w not in stop_words]))
        final = '\n'.join(res)
        f.seek(0)
        f.write(final)

In [10]:
# remove words from the vocabulary with less than a particular frequency 
# create a <unk> token

def gather_word_freqs(collection, sampling_freq):

    vocab, word_idx, idx_word = Counter(),{},{}
    new_vocab = Counter()
    sampling_rate = 0.001
    word_counter = 0
    
    for sent in tqdm(collection, total = len(collection)):
        vocab.update(sent)
      
    for k,v in vocab.items():
        if v >= sampling_freq:
            new_vocab[k] = v
      
    new_vocab["<unk>"] = 0

    for i,(k,v) in enumerate(new_vocab.items()):
        word_idx[k] = i
        idx_word[i] = k
   
    return collection, new_vocab, word_idx, idx_word

def gather_training_data(collection, word_idx, vocab, cs, embed_type):
    training_data = []
    coo_counts = Counter()
    
    for sent in tqdm(collection, total = len(collection)):

        indices = []
        for w in sent:
          if w not in word_idx:
            indices.append(0)
          else:
            indices.append(word_idx[w])
        
        if embed_type == 'skipgram' or embed_type == "glove":
            for i in range(len(indices)):
                for j in range(max(-cs, 0), min(cs + 1, len(indices))):
                    if i == j : 
                        continue 
                    training_data.append((indices[i], indices[j]))
        else : 
            if len(indices) < (2*cs + 1): 
                continue

            for i in range(cs, len(indices) - cs) : 
                context = []
                for j in range(i-cs, i+cs):
                    if i == j : 
                        continue 
                    context.append(indices[j])
                training_data.append((context, indices[i]))

        if embed_type == "glove":
            coo_counts.update(training_data)
            tokens = defaultdict(lambda: -1)
            for word, count in vocab.most_common(len(vocab)):
                if count >= 5 : 
                    tokens[word_idx[word]] = count
            
            training_data = [(w[0],w[1],count) for w, count in coo_counts.items() if tokens[w[0]]> 0 and tokens[w[1]] > 0]
    
    return training_data

def load_data(txt_dir,cs,embed_type):
        
    collection = []
    for txt in txt_dir:
        with open(txt,'r') as f : 
            lines = f.readlines()
            collection += [line.split() for line in tqdm(lines, total = len(lines))]  

    (collection, vocab, word_idx, idx_word) = gather_word_freqs(collection, 100)

    training_data = gather_training_data(collection, word_idx, vocab, cs,embed_type)
    
    return vocab, word_idx, idx_word, training_data

In [11]:
class Skipgram(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super(Skipgram, self).__init__()
        # we first start with the lookup(Embedding layer) layer for skipgram
        self.embeddings_input = nn.Embedding(vocab_size,embedding_size)
        self.linear = nn.Linear(embedding_size,vocab_size) #Ax+B layer
        
    def forward(self, input_word):
        embeds = self.embeddings_input(input_word)
        out = self.linear(embeds)
        log_prob = F.log_softmax(out, dim = 1)
        return log_prob

In [12]:
class CBOWModel(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super(CBOWModel, self).__init__()
        self.embeddings_input = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
        
    def forward(self, input_word):
        embeds = self.embeddings_input(input_word)
        embeds = torch.sum(embeds, dim = 1)
        out = self.linear(embeds)
        log_prob = F.log_softmax(out, dim = 1)
        return log_prob

In [13]:
class GloVeModel(nn.Module):
    def __init__(self, embedding_size, vocab_size, x_max):
        super(GloVeModel, self).__init__()
        self.x_max = x_max
        self._focal_embeddings = nn.Embedding(vocab_size, embedding_size)
        self._context_embeddings = nn.Embedding(vocab_size, embedding_size)
        self._focal_biases = nn.Embedding(vocab_size,1).type(torch.float64)
        self._context_biases = nn.Embedding(vocab_size,1).type(torch.float64)
        
    def forward(self, focal_input, context_input, coocurrence_count):
        x_max = max(self.x_max, 1)
        focal_embed = self._focal_embeddings(focal_input)
        context_embed = self._context_embeddings(context_input)
        focal_bias = self._focal_biases(focal_input)
        context_bias = self._context_biases(context_input)
        
        weight_factor = torch.pow(coocurrence_count/x_max, 0.75)
        weight_factor[weight_factor>1] = 1
        
        embedding_products = torch.sum(focal_embed*context_embed, dim = 1)
        log_cooccurrences = torch.log(coocurrence_count)
        
        distance_expr = (embedding_products + focal_bias + context_bias + log_cooccurrences)**2
        
        single_losses = weight_factor*distance_expr
        mean_loss = torch.mean(single_losses)
        
        return mean_loss

In [14]:
class NEGLoss(nn.Module):
    def __init__(self, ix_to_word, word_freqs, num_negative_samples=5,):
        super(NEGLoss, self).__init__()
        self.num_negative_samples = num_negative_samples
        self.num_words = len(ix_to_word)
        self.distr = F.normalize(torch.Tensor(
            [word_freqs[ix_to_word[i]] for i in range(len(word_freqs))]).pow(0.75), dim=0
        )

    def sample(self, num_samples, positives=[]):
        weights = torch.zeros((self.num_words, 1))
        for w in positives: weights[w] += 1.0
        for _ in range(num_samples):
            w = torch.multinomial(self.distr, 1)[0]
            while (w in positives):
                w = torch.multinomial(self.distr, 1)[0]
            weights[w] += 1.0
        return weights

    def forward(self, input, target):
        return F.nll_loss(input, target,
            self.sample(self.num_negative_samples, positives=target.cpu().data.numpy()).squeeze().to(device))

In [15]:
def train(txt_dir, epochs = 5, embed_dim = 300, cs = 2, model_type = "skipgram", negative_sampling = False, device = 'cpu'):

  if model_type is "skipgram":
    vocab, word_idx,idx_word,training_data = load_data(txt_dir,cs,"skipgram")
    model = Skipgram(embed_dim, len(vocab))
  elif model_type is "cbow":
    vocab, word_idx,idx_word,training_data = load_data(txt_dir,cs,"cbow")
    model = CBOWModel(embed_dim, len(vocab))
  else:
    vocab, word_idx,idx_word,training_data = load_data(txt_dir,cs,"glove")
    model = GloVeModel(embed_dim, len(vocab), negative_sampling)

  if negative_sampling:
      loss_function = NEGLoss(idx_word, vocab)
  else:
      loss_function = nn.NLLLoss()

  optimizer = optim.SGD(model.parameters(), lr=0.001)

  losses = []

  st = time.time()

  for epoch in range(epochs):
      total_loss = 0
      print("Beginning epoch %d" % epoch)
      print("Length Training Data", len(training_data))

      if model_type == 'glove' : 
        for (focal, context, counts) in training_data:
            context_var = torch.LongTensor([context]).to(device)
            focal = torch.LongTensor([focal]).to(device)
            counts = torch.LongTensor([counts]).to(device)
            model.train()
            optimizer.zero_grad()
            loss = model(focal, context_var, counts)
            loss.backward()
            optimizer.step()
        losses.append(loss.item())
      else:
                
        for context, target in tqdm(training_data, total=len(training_data)):
            context_var = torch.LongTensor([context]).to(device)
            target =  torch.LongTensor([target]).to(device)
            model.to(device)
            model.train()
            optimizer.zero_grad()
            log_probs = model(context_var)
            loss = loss_function(log_probs, target)
            loss.backward()
            optimizer.step()
            total_loss += loss
        losses.append(total_loss)

  torch.save(
        {
            "model_state_dict": model.state_dict(),
            "losses": losses,
            "word_idx": word_idx,
            "idx_word": idx_word,
        },
        f"models/model_{model_type}_neg{negative_sampling}.pth",
  )
  print("MODEL SAVED")

  et = time.time()
  # print("time taken", (et - st))
  return losses

In [20]:
txt_dir = ["opaa"]

In [None]:
train(txt_dir, epochs = 5, embed_dim = 100, cs = 2, model_type = "glove", negative_sampling = False, device = device)

In [None]:
train(txt_dir, epochs = 5, embed_dim = 100, cs = 2, model_type = "glove", negative_sampling = 5, device = device)

In [None]:
train(txt_dir, epochs = 5, embed_dim = 100, cs = 2, model_type = "cbow", negative_sampling = True)

In [21]:
train(txt_dir, epochs = 5, embed_dim = 100, cs = 2, model_type = "skipgram", negative_sampling = True, device = device)

100%|██████████| 16786/16786 [00:00<00:00, 426695.35it/s]
100%|██████████| 16786/16786 [00:00<00:00, 230934.15it/s]
100%|██████████| 16786/16786 [00:00<00:00, 33800.06it/s]


Beginning epoch 0
Length Training Data 965509


100%|██████████| 100/100 [00:06<00:00, 14.33it/s]


Beginning epoch 1
Length Training Data 100


100%|██████████| 100/100 [00:00<00:00, 451.11it/s]


Beginning epoch 2
Length Training Data 100


100%|██████████| 100/100 [00:00<00:00, 453.06it/s]


Beginning epoch 3
Length Training Data 100


100%|██████████| 100/100 [00:00<00:00, 449.83it/s]


Beginning epoch 4
Length Training Data 100


100%|██████████| 100/100 [00:00<00:00, 445.19it/s]

MODEL SAVED





[tensor(554.6580, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(463.8362, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(396.1562, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(360.1760, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(342.4267, device='cuda:0', grad_fn=<AddBackward0>)]

In [None]:
train(txt_dir, epochs = 5, embed_dim = 100, cs = 2, model_type = "cbow", negative_sampling = True, device = device)

100%|██████████| 16786/16786 [00:00<00:00, 411386.96it/s]


87


100%|██████████| 16786/16786 [00:00<00:00, 242344.17it/s]


27777777777777777777777777777777777777777777 246


100%|██████████| 16786/16786 [00:00<00:00, 16903.23it/s]


MODEL DONE
NEGATIVE LOSS DONE
Beginning epoch 0
Length Training Data 276130


100%|██████████| 276130/276130 [10:09<00:00, 453.21it/s]


Beginning epoch 1
Length Training Data 276130


100%|██████████| 276130/276130 [10:11<00:00, 451.36it/s]


Beginning epoch 2
Length Training Data 276130


100%|██████████| 276130/276130 [10:05<00:00, 456.29it/s]


Beginning epoch 3
Length Training Data 276130


100%|██████████| 276130/276130 [10:09<00:00, 453.23it/s]


Beginning epoch 4
Length Training Data 276130


100%|██████████| 276130/276130 [10:12<00:00, 451.06it/s]


MODEL SAVED


[tensor(679761.7500, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(648273.1250, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(641574.6875, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(638099.1875, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(635637.6250, device='cuda:0', grad_fn=<AddBackward0>)]

In [None]:
train(txt_dir, epochs = 5, embed_dim = 100, cs = 2, model_type = "cbow", negative_sampling = False, device = device)

100%|██████████| 16786/16786 [00:00<00:00, 330510.08it/s]


87


100%|██████████| 16786/16786 [00:00<00:00, 185955.13it/s]


27777777777777777777777777777777777777777777 246


100%|██████████| 16786/16786 [00:00<00:00, 18608.66it/s]


MODEL DONE
NEGATIVE LOSS DONE
Beginning epoch 0
Length Training Data 276130


100%|██████████| 276130/276130 [04:18<00:00, 1070.10it/s]


Beginning epoch 1
Length Training Data 276130


100%|██████████| 276130/276130 [04:08<00:00, 1111.96it/s]


Beginning epoch 2
Length Training Data 276130


100%|██████████| 276130/276130 [04:06<00:00, 1122.04it/s]


Beginning epoch 3
Length Training Data 276130


100%|██████████| 276130/276130 [04:06<00:00, 1118.59it/s]


Beginning epoch 4
Length Training Data 276130


100%|██████████| 276130/276130 [04:06<00:00, 1121.45it/s]


MODEL SAVED


[tensor(681882., device='cuda:0', grad_fn=<AddBackward0>),
 tensor(649145., device='cuda:0', grad_fn=<AddBackward0>),
 tensor(642236.3750, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(638825.5000, device='cuda:0', grad_fn=<AddBackward0>),
 tensor(636526.8125, device='cuda:0', grad_fn=<AddBackward0>)]

In [None]:
r = torch.load('models/model_cbow_negTrue.pth')
print(r)

{'model_state_dict': OrderedDict([('embeddings_input.weight', tensor([[ 0.1146,  1.0594, -0.2355,  ...,  0.2623,  0.0352, -0.3516],
        [ 0.6221, -0.5655, -1.8811,  ...,  1.6409,  0.7659, -0.0264],
        [-0.9397, -0.7928,  0.1202,  ..., -0.8976, -1.5699, -0.4129],
        ...,
        [-0.4142, -0.8826, -0.6462,  ...,  1.2085, -0.3424,  0.2064],
        [ 1.8511,  0.0413,  1.3784,  ..., -1.6358,  0.6703,  1.6557],
        [ 1.8665, -0.8989,  0.7596,  ...,  1.0907, -0.2864, -1.4045]],
       device='cuda:0')), ('linear.weight', tensor([[ 0.0196,  0.1580, -0.0417,  ..., -0.0100, -0.0564,  0.0432],
        [ 0.0560,  0.0138,  0.0629,  ..., -0.0948, -0.0449,  0.0696],
        [ 0.0305,  0.0905,  0.0163,  ..., -0.0414, -0.0616, -0.0765],
        ...,
        [-0.0931,  0.0216,  0.0804,  ...,  0.0765, -0.0265, -0.0260],
        [-0.0793, -0.0077,  0.0575,  ...,  0.0188,  0.0252, -0.0913],
        [ 0.0016, -0.0434, -0.0680,  ..., -0.0712,  0.0537, -0.0935]],
       device='cuda:0')), 