In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string 
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize

In [2]:
input_data = []

with open('./sts-2017-en-es/En_Es_STS/STS.input.en-es.train.txt', 'r') as inFile:
    input_data = inFile.readlines()

X = []
for data in input_data:
    X.append(data.split('\t')[:2])

In [3]:
lables = pd.read_csv('./sts-2017-en-es/En_Es_STS/STS.input.en-es.train_scores.txt', encoding='utf-8')

In [4]:
def sentence_preprocessor(data):
    cleaned_data = []
    for x in data:
        for c in string.punctuation:
            if c in x:
                x = x.replace(c,'')
        tokenized_x = word_tokenize(x)
        cleaned_data.append(tokenized_x)
    
    return cleaned_data

def vocab_maker(data):
    vocab = {}
    index=0
    for sentence in data:
        for word in sentence:
            if word in vocab:
                continue
            else:
                vocab[word] = index
                index+=1
    
    return vocab

def vectorizer(data, vocab):
    x = np.zeros((len(data),len(vocab.keys())))
    for index in range(len(data)):
        sentence = data[index]
        for word in sentence:
            x[index][vocab[word]] += 1
    return x

In [5]:
eng_data, esp_data = tuple(zip(*X))

cleaned_eng = sentence_preprocessor(eng_data)
cleaned_esp = sentence_preprocessor(esp_data)

In [6]:
# make vocab
eng_vocab = vocab_maker(cleaned_eng)
esp_vocab = vocab_maker(cleaned_esp)

In [7]:
# generating embeddings
eng_vectors = vectorizer(cleaned_eng, eng_vocab)
esp_vectors = vectorizer(cleaned_esp, esp_vocab)

In [8]:
my_array = zip(eng_vectors, esp_vectors)
text = pd.DataFrame(my_array, columns = ['english','spanish'])

In [9]:
eng_vectors.shape
esp_vectors.shape

(1000, 5274)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [19]:
class LSTMTagger(torch.nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size1, vocab_size2, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings1 = torch.nn.Embedding(vocab_size1, embedding_dim)
        self.word_embeddings2 = torch.nn.Embedding(vocab_size2, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = torch.nn.Linear(hidden_dim, tagset_size)

        # self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    def forward(self, sentence1, sentence2):
        embeds1 = self.word_embeddings1(sentence1)
        lstm_out1, _ = self.lstm(embeds1.view(len(sentence1), 1, -1))
        tag_space1 = self.hidden2tag(lstm_out1.view(len(sentence1), -1))
        
        embeds2 = self.word_embeddings2(sentence2)
        lstm_out2, _ = self.lstm(embeds2.view(len(sentence2), 1, -1))
        tag_space2 = self.hidden2tag(lstm_out2.view(len(sentence2), -1))

        print(tag_space1.shape)
        print(tag_space2.shape)
        
        tag_scores = nn.CosineSimilarity(tag_space1, tag_space2)
        
        return tag_scores

In [12]:
lstm = LSTMTagger(512, 256, eng_vectors.shape[1], esp_vectors.shape[1], 128)

In [13]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(lstm.parameters(), lr=0.001, momentum=0.9)

In [14]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, text, lables):
        self.labels = [label for label in lables['1']]
        self.texts = [[text['english'][i], text['spanish'][i]] for i in range(len(text))]

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return self.labels[idx]

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [15]:
trainloader = Dataset(text, lables)

In [20]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for texts, lables in trainloader:
        # get the texts; data is a list of [texts, labels]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = lstm(torch.tensor(texts[0], dtype=int), torch.tensor(texts[1], dtype=int))
        loss = criterion(outputs, lables)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        # if i % 2000 == 1999:    # print every 2000 mini-batches
        #     print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
        #     running_loss = 0.0

print('Finished Training')

[1. 1. 1. ... 0. 0. 0.]
[1. 1. 1. ... 0. 0. 0.]


RuntimeError: The size of tensor a (4896) must match the size of tensor b (5274) at non-singleton dimension 0