In [1]:
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
import fasttext
import fasttext.util
from numpy.linalg import norm
from tqdm import tqdm
from scipy import stats

In [2]:
input_data = []

with open('./sts-2017-en-es/En_Es_STS/STS.input.en-es.train.txt', 'r') as inFile:
    input_data = inFile.readlines()

X = []
for data in input_data:
    X.append(data.split('\t')[:2])
input_data.clear()

scores = []
with open('./sts-2017-en-es/En_Es_STS/STS.input.en-es.train_scores.txt', 'r') as inFile:
    # scores = inFile.readlines()
    for line in inFile:
        scores.append(int(line.strip()))

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

def sentence_preprocessor(data, language):
    cleaned_data = []
    stop_words = set(stopwords.words(language))
    for x in data:
        for c in string.punctuation:
            if c in x:
                x = x.replace(c,'')
        tokenized_x = word_tokenize(x)
        tokenized_x = [word for word in tokenized_x if word not in stop_words]

        cleaned_data.append(tokenized_x)
    return cleaned_data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home2/arihanth.srikar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home2/arihanth.srikar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
eng_data, esp_data = tuple(zip(*X))

cleaned_eng = sentence_preprocessor(eng_data, 'english')
cleaned_esp = sentence_preprocessor(esp_data, 'spanish')

In [5]:
MAX_ENG_SENT = 0
for sent in cleaned_eng:
    if MAX_ENG_SENT < len(sent):
        MAX_ENG_SENT = len(sent)

MAX_ESP_SENT = 0
for sent in cleaned_esp:
    if MAX_ESP_SENT < len(sent):
        MAX_ESP_SENT = len(sent)

print(MAX_ENG_SENT, MAX_ESP_SENT)

custom_data = zip(cleaned_eng, cleaned_esp)
custom_data = sorted(custom_data, key=lambda x: len(x[0]), reverse=False)
cleaned_eng, cleaned_esp = list(zip(*custom_data))

41 40


In [6]:
len(cleaned_eng)

1000

In [7]:
# fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('/scratch/arihanth.srikar/models/cc.en.300.bin')
ft.get_dimension()



300

In [8]:
ft2 = fasttext.load_model('/scratch/arihanth.srikar/models/cc.es.300.bin')
ft2.get_dimension()



300

In [9]:
# fasttext.util.reduce_model(ft, 100)
# ft.get_dimension()

In [10]:
# fasttext.util.reduce_model(ft2, 100)
# ft2.get_dimension()

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [13]:
final_embeddings = []
for (eng_sent, esp_sent) in X:
    eng_vect = torch.cat([torch.tensor([ft.get_sentence_vector(word)]) for word in eng_sent], dim=0).unsqueeze(dim=1)
    esp_vect = torch.cat([torch.tensor([ft2.get_sentence_vector(word)]) for word in esp_sent], dim=0).unsqueeze(dim=1)
    final_embeddings.append((eng_vect, esp_vect))

  eng_vect = torch.cat([torch.tensor([ft.get_sentence_vector(word)]) for word in eng_sent], dim=0).unsqueeze(dim=1)


In [14]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, sentences, scores):
        self.labels = scores
        self.sentences = sentences

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return self.labels[idx]

    def get_batch_embeddings(self, idx):
        eng_sent, esp_sent = self.sentences[idx]
        return eng_sent, esp_sent

    def __getitem__(self, idx):

        embeddings = self.get_batch_embeddings(idx)
        label = self.get_batch_labels(idx)

        return embeddings, torch.tensor(label)

In [15]:
full_dataset = Dataset(final_embeddings, scores)

In [16]:
class LSTM(nn.Module):

    def __init__(self, embedding_dim=300, hidden_dim=512, output_size=300):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = 2

        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=self.num_layers, bidirectional=True)
        self.lstm2 = nn.LSTM(embedding_dim, hidden_dim, num_layers=self.num_layers, bidirectional=True)

    def forward(self, v1, v2):
        _, (h, _) = self.lstm1(v1.view(v1.shape[0], 1, -1))
        out1 = torch.mean(h.view(h.shape[0], -1), dim=0)
        
        _, (h, _) = self.lstm2(v2.view(v2.shape[0], 1, -1))
        out2 = torch.mean(h.view(h.shape[0], -1), dim=0)
        
        return out1, out2

In [17]:
model = LSTM(300, 512, 300).to(device)
loss_function = F.mse_loss
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

In [18]:
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
trainloader, testloader = torch.utils.data.random_split(full_dataset, [train_size, test_size])

In [19]:
for epoch in range(15):

    model.train()
    pred, lbls = [], []
    running_loss = 0.0
    for (e1, e2), lbl in tqdm(trainloader):
        model.zero_grad()

        e1 = e1.to(device)
        e2 = e2.to(device)
        lbl = lbl.to(device)

        v1, v2 = model(e1, e2)

        cos = nn.CosineSimilarity(dim=0)
        output = cos(v1, v2)*5

        loss = loss_function(output.float(), lbl.float())
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
        pred.append(output.float().item())
        lbls.append(lbl.float().item())
    print(f'{epoch}-Train')
    print('Pearson:', stats.pearsonr(pred, lbls)[0]*100)
    print('Loss:', running_loss/len(trainloader))

    model.eval()
    pred, lbls = [], []
    with torch.no_grad():
        running_loss = 0.0
        for (e1, e2), lbl in tqdm(testloader):
            e1 = e1.to(device)
            e2 = e2.to(device)
            lbl = lbl.to(device)

            tag_scores_eng, tag_scores_esp = model(e1, e2)

            cos = nn.CosineSimilarity(dim=0)
            output = cos(tag_scores_eng, tag_scores_esp)*5

            loss = loss_function(output.float(), lbl.float())
            running_loss += loss.item()

            pred.append(output.float().item())
            lbls.append(lbl.float().item())
        print(f'{epoch}-Test')
        print('Pearson:', stats.pearsonr(pred, lbls)[0]*100)
        print('Loss', running_loss/len(testloader))

100%|██████████| 800/800 [00:27<00:00, 29.19it/s]


0-Train
Pearson: -1.6041250064503827
Loss: 1.7300979354920536


100%|██████████| 200/200 [00:02<00:00, 71.98it/s]


0-Test
Pearson: 9.049861987827622
Loss 1.3731650733630523


100%|██████████| 800/800 [00:27<00:00, 29.21it/s]


1-Train
Pearson: -0.8354525962123258
Loss: 1.6150989110100642


100%|██████████| 200/200 [00:02<00:00, 72.59it/s]


1-Test
Pearson: 9.055542332732502
Loss 1.3619173192800371


100%|██████████| 800/800 [00:27<00:00, 29.02it/s]


2-Train
Pearson: 3.5759542724025137
Loss: 1.5882426767485232


100%|██████████| 200/200 [00:02<00:00, 71.27it/s]


2-Test
Pearson: 8.027496633609116
Loss 1.363239804833138


100%|█████████▉| 797/800 [00:27<00:00, 31.17it/s]

In [None]:
torch.save(model, '/scratch/arihanth.srikar/model_saves/bilingual.pt')