In [2]:
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
import fasttext
import fasttext.util
from numpy.linalg import norm
from tqdm import tqdm
from scipy import stats
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the pickled training set from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a trusted, locally produced pickle.
with open('data/train_data.pkl', 'rb') as train_file:
    train_data = pickle.load(train_file)

In [4]:
# Split the loaded dict into inputs ('x') and targets ('y').
# Later cells unpack every element of X as a pair of sentences.
X, y = train_data['x'], train_data['y']

In [None]:
# Load the fastText model file and display its vector dimensionality.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
FASTTEXT_MODEL_PATH = '/scratch/arihanth.srikar/models/cc.en.300.bin'
ft = fasttext.load_model(FASTTEXT_MODEL_PATH)
ft.get_dimension()

In [None]:
# fasttext.util.reduce_model(ft, 100)
# ft.get_dimension()

In [5]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
# Build fastText embeddings for every sentence pair in X.
# NOTE(review): this cell needs `ft` from the fastText-loading cell, which shows
# no execution count; the recorded output of this cell is
# "NameError: name 'ft' is not defined". `final_embeddings` is rebuilt from
# GloVe vectors in a later cell, so nothing produced here reaches the model.
final_embeddings = []
for (eng_sent, esp_sent) in X:
    # One vector per item yielded by iterating the sentence; the vectors are
    # concatenated along dim 0 and then given an extra axis at position 1
    # (presumably (n, 1, d), assuming get_sentence_vector returns a 1-D vector).
    # NOTE(review): if the sentences are plain strings, `for word in eng_sent`
    # walks characters rather than words - confirm the element type of X.
    eng_vect = torch.cat([torch.tensor([ft.get_sentence_vector(word)]) for word in eng_sent], dim=0).unsqueeze(dim=1)
    esp_vect = torch.cat([torch.tensor([ft.get_sentence_vector(word)]) for word in esp_sent], dim=0).unsqueeze(dim=1)
    final_embeddings.append((eng_vect, esp_vect))

NameError: name 'ft' is not defined

In [7]:
from flair.embeddings import WordEmbeddings
from flair.data import Sentence

In [8]:
# Word-level embedder built from flair's 'glove' embedding identifier.
GLOVE_EMBEDDING_NAME = 'glove'
glove_embedder = WordEmbeddings(GLOVE_EMBEDDING_NAME)

In [9]:
def _glove_token_vectors(text):
    """Return one CPU embedding tensor per token of `text`.

    The text is wrapped in a flair `Sentence`, embedded with the module-level
    `glove_embedder`, and each token's embedding is moved to the CPU.
    """
    sentence = Sentence(text)
    embedded_sentence = glove_embedder.embed(sentence)[0]
    return [token.embedding.cpu() for token in embedded_sentence]


# For every sentence pair, keep a (token_vectors_a, token_vectors_b) tuple.
final_embeddings = [
    (_glove_token_vectors(first_text), _glove_token_vectors(second_text))
    for (first_text, second_text) in X
]

In [24]:
class Dataset(torch.utils.data.Dataset):
    """Sentence-pair dataset backed by pre-computed token embeddings.

    Args:
        sentences: indexable collection of ``(tokens_a, tokens_b)`` pairs,
            where each side is a sequence of tensors that ``torch.vstack``
            can stack into one matrix.
        scores: indexable collection of labels, one per pair; each label is
            converted with ``torch.tensor`` when a sample is fetched.
    """

    def __init__(self, sentences, scores):
        self.sentences = sentences
        self.labels = scores

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the raw (unconverted) label stored at position `idx`."""
        return self.labels[idx]

    def get_batch_embeddings(self, idx):
        """Stack the token vectors of both sentences at `idx` into two matrices."""
        first_tokens, second_tokens = self.sentences[idx]
        return torch.vstack(first_tokens), torch.vstack(second_tokens)

    def __getitem__(self, idx):
        """Return ``((matrix_a, matrix_b), label_tensor)`` for sample `idx`."""
        pair = self.get_batch_embeddings(idx)
        score = torch.tensor(self.get_batch_labels(idx))
        return pair, score

In [25]:
# Pair every embedded sentence pair with its target score.
full_dataset = Dataset(sentences=final_embeddings, scores=y)

In [26]:
# Pull the first sample to sanity-check what the dataset yields.
first_sample = next(iter(full_dataset))
(e1, e2), lbl = first_sample

tensor([[-0.2709,  0.0440, -0.0203,  ..., -0.4923,  0.6369,  0.2364],
        [ 0.5937,  0.4482,  0.5932,  ..., -0.5465,  0.1516, -0.3075],
        [ 0.2616,  0.4472, -0.0968,  ..., -0.4503,  0.4952, -0.2030],
        ...,
        [ 0.0523,  0.4112, -0.5290,  ..., -0.9391,  0.9866, -0.0713],
        [-0.1440,  0.3255,  0.1426,  ...,  0.2540,  1.1078, -0.0731],
        [ 0.6848, -0.3764, -0.0787,  ...,  0.4923,  0.1823, -0.0202]])

In [27]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM that encodes one sentence into a vector.

    Args:
        embedding_dim: size of each input token vector.
        hidden_dim: hidden size of each LSTM direction; also the length of
            the vector returned by ``forward``.
        output_size: accepted for signature compatibility; not used here.
    """

    def __init__(self, embedding_dim=300, hidden_dim=256, output_size=300):
        super().__init__()
        self.num_layers = 1
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=self.num_layers,
            bidirectional=True,
        )

    def forward(self, v1):
        """Encode one sentence.

        `v1` is reshaped to ``(v1.shape[0], 1, -1)``, i.e. a sequence with a
        batch of one. The final hidden states (one row per layer/direction)
        are averaged into a single 1-D tensor of length ``hidden_dim``.
        """
        seq_len = v1.shape[0]
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (final_hidden, _) = self.lstm(v1.view(seq_len, 1, -1))
        per_direction = final_hidden.view(final_hidden.shape[0], -1)
        return torch.mean(per_direction, dim=0)

In [28]:
# Hold out 20% of the sentence pairs for evaluation.
# The split is seeded so the same samples land in the test set on every run;
# an unseeded split changes after each kernel restart, which would let a
# previously saved model be "tested" on pairs it was trained on.
SPLIT_SEED = 0
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
# NOTE: despite their names these are the subsets returned by random_split,
# not DataLoader objects; iterating them yields one un-batched sample at a time.
trainloader, testloader = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [32]:
# Encoder over 100-dimensional token vectors, trained with MSE against the
# gold similarity score.
model = LSTM(embedding_dim=100, hidden_dim=256, output_size=300).to(device)
loss_function = F.mse_loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [33]:
# model = torch.load('/scratch/arihanth.srikar/model_saves/model.pt')

In [34]:
NUM_EPOCHS = 15


def _similarity_score(model, cos, e1, e2):
    """Encode both sentences and map their cosine similarity from [-1, 1] onto [0, 5]."""
    v1 = model(e1)
    v2 = model(e2)
    return ((cos(v1, v2) + 1) / 2) * 5


def _run_epoch(model, samples, optimizer=None):
    """Run one pass over `samples` and return ``(pearson_percent, mean_loss)``.

    When `optimizer` is given the model is put in training mode and updated
    after every sample; otherwise the model is put in eval mode and the pass
    runs with gradients disabled. Relies on the notebook-level `device` and
    `loss_function`.
    """
    training = optimizer is not None
    model.train(training)
    # The similarity module has no state, so build it once per pass instead
    # of once per sample.
    cos = nn.CosineSimilarity(dim=0)

    pred, lbls = [], []
    running_loss = 0.0
    with torch.set_grad_enabled(training):
        for (e1, e2), lbl in tqdm(samples):
            if training:
                model.zero_grad()

            e1 = e1.to(device)
            e2 = e2.to(device)
            lbl = lbl.to(device)

            output = _similarity_score(model, cos, e1, e2)
            loss = loss_function(output.float(), lbl.float())
            running_loss += loss.item()

            if training:
                loss.backward()
                optimizer.step()

            pred.append(output.float().item())
            lbls.append(lbl.float().item())

    pearson_percent = stats.pearsonr(pred, lbls)[0] * 100
    return pearson_percent, running_loss / len(samples)


# NOTE(review): samples are processed one at a time, in whatever order
# iterating `trainloader` yields them; consider a shuffling DataLoader.
for epoch in range(NUM_EPOCHS):
    train_pearson, train_loss = _run_epoch(model, trainloader, optimizer)
    print(f'{epoch}-Train')
    print('Pearson:', train_pearson)
    print('Loss:', train_loss)

    test_pearson, test_loss = _run_epoch(model, testloader)
    print(f'{epoch}-Test')
    print('Pearson:', test_pearson)
    print('Loss', test_loss)

100%|██████████| 3060/3060 [00:12<00:00, 241.72it/s]


0-Train
Pearson: 57.3004237612856
Loss: 0.8799223235978819


100%|██████████| 765/765 [00:01<00:00, 744.67it/s]


0-Test
Pearson: 64.72333552303313
Loss 0.7236596801012652


100%|██████████| 3060/3060 [00:12<00:00, 240.91it/s]


1-Train
Pearson: 73.50296079079219
Loss: 0.5597016836982143


100%|██████████| 765/765 [00:00<00:00, 767.12it/s]


1-Test
Pearson: 69.30907354028201
Loss 0.6097766486230555


100%|██████████| 3060/3060 [00:12<00:00, 246.60it/s]


2-Train
Pearson: 79.5148208526783
Loss: 0.43208297021701747


100%|██████████| 765/765 [00:01<00:00, 755.61it/s]


2-Test
Pearson: 70.91427925315386
Loss 0.5701366403293131


100%|██████████| 3060/3060 [00:12<00:00, 241.49it/s]


3-Train
Pearson: 83.10494104023749
Loss: 0.35593900537114115


100%|██████████| 765/765 [00:00<00:00, 775.63it/s]


3-Test
Pearson: 71.80356265045805
Loss 0.5495452548515029


100%|██████████| 3060/3060 [00:12<00:00, 240.16it/s]


4-Train
Pearson: 85.73029421344643
Loss: 0.3005864744952244


100%|██████████| 765/765 [00:00<00:00, 785.88it/s]


4-Test
Pearson: 72.40109407124002
Loss 0.5351954344893138


100%|██████████| 3060/3060 [00:12<00:00, 240.32it/s]


5-Train
Pearson: 87.73543612282991
Loss: 0.2583135855423007


100%|██████████| 765/765 [00:00<00:00, 785.33it/s]


5-Test
Pearson: 72.84222122325959
Loss 0.523507411055253


100%|██████████| 3060/3060 [00:12<00:00, 240.12it/s]


6-Train
Pearson: 89.36736221093604
Loss: 0.22387354234919038


100%|██████████| 765/765 [00:01<00:00, 750.46it/s]


6-Test
Pearson: 73.19307412325571
Loss 0.5145102047839213


100%|██████████| 3060/3060 [00:12<00:00, 241.54it/s]


7-Train
Pearson: 90.75505086261583
Loss: 0.19475192078920087


100%|██████████| 765/765 [00:01<00:00, 761.37it/s]


7-Test
Pearson: 73.45218644730316
Loss 0.5084282413685858


100%|██████████| 3060/3060 [00:12<00:00, 244.80it/s]


8-Train
Pearson: 91.92567330305688
Loss: 0.17028243664253412


100%|██████████| 765/765 [00:00<00:00, 777.74it/s]


8-Test
Pearson: 73.69412323863949
Loss 0.5035641071931165


100%|██████████| 3060/3060 [00:12<00:00, 243.16it/s]


9-Train
Pearson: 92.93813281245919
Loss: 0.14924011097580495


100%|██████████| 765/765 [00:01<00:00, 751.75it/s]


9-Test
Pearson: 73.93399840060728
Loss 0.499171069495039


100%|██████████| 3060/3060 [00:12<00:00, 240.40it/s]


10-Train
Pearson: 93.83855193569826
Loss: 0.13060496538788646


100%|██████████| 765/765 [00:01<00:00, 762.10it/s]


10-Test
Pearson: 74.08658115365121
Loss 0.49630634886570574


100%|██████████| 3060/3060 [00:12<00:00, 243.23it/s]


11-Train
Pearson: 94.6248899452187
Loss: 0.11423322921105047


100%|██████████| 765/765 [00:01<00:00, 764.09it/s]


11-Test
Pearson: 74.49344917696538
Loss 0.488829005669335


100%|██████████| 3060/3060 [00:12<00:00, 244.89it/s]


12-Train
Pearson: 95.35846714271952
Loss: 0.09896007900938229


100%|██████████| 765/765 [00:00<00:00, 794.55it/s]


12-Test
Pearson: 74.79927159584503
Loss 0.4819885695894497


100%|██████████| 3060/3060 [00:12<00:00, 248.05it/s]


13-Train
Pearson: 95.94042490432591
Loss: 0.08682540422429277


100%|██████████| 765/765 [00:00<00:00, 767.09it/s]


13-Test
Pearson: 74.43574095333216
Loss 0.4903849377704773


100%|██████████| 3060/3060 [00:12<00:00, 240.47it/s]


14-Train
Pearson: 96.31879091073412
Loss: 0.07870243201078223


100%|██████████| 765/765 [00:00<00:00, 770.77it/s]

14-Test
Pearson: 75.16635626470858
Loss 0.47466464452382495





In [None]:
# Persist the whole trained module object (not just its state_dict).
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
MODEL_SAVE_PATH = '/scratch/arihanth.srikar/model_saves/monolingual.pt'
torch.save(model, MODEL_SAVE_PATH)

In [36]:
def _mean_pool(token_matrix):
    """Average the rows (one per token) of a sentence matrix into one vector."""
    return torch.mean(token_matrix.view(token_matrix.shape[0], -1), dim=0)


# Untrained baseline: cosine similarity of mean-pooled token embeddings.
# The score is cos * 5 here, whereas the trained model uses ((cos + 1) / 2) * 5;
# Pearson correlation is unaffected by that difference in scaling.
cos = nn.CosineSimilarity(dim=0)
pred, lbls = [], []
for (first_matrix, second_matrix), score in full_dataset:
    similarity = cos(_mean_pool(first_matrix), _mean_pool(second_matrix)) * 5
    pred.append(similarity.float().item())
    lbls.append(score.float().item())

print('Pearson:', stats.pearsonr(pred, lbls)[0]*100)

Pearson: 60.73496895047812
