In [1]:
import torch
import torch.nn as nn
import string
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
import numpy as np
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
import csv
import spacy
from tqdm import tqdm

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
spacy_en = spacy.load("en_core_web_sm")

In [3]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, datalocation):
        self.datalocation = datalocation
        self.data = []
        self.scores = []
        self.vocab = set({'<unk>'})
        self.word2Index = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.index2Word = {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.wordFrequency = dict()
        print("Reading data...")
        self.read_data()
        print("Preprocessing data...")
        self.preprocess()
        print("Handling unknown tokens...")
        self.handle_unkown_token()
        

    def read_data(self):
        csvreader = csv.reader(open(self.datalocation, 'r'), delimiter='\t')
        for row in csvreader:
            try:
                self.data.append((row[5],row[6]))
                self.scores.append(row[4])
            except:
                continue
        
    def clean_data(self, tokenized_sentence):
        cleaned_sentence = []
        for token in tokenized_sentence:
            if token not in string.punctuation:
                cleaned_sentence.append(token)
    
        return cleaned_sentence
    
    def preprocess(self):
        for idx in tqdm(range(len(self.data))):
            s1 = [tok.text for tok in spacy_en.tokenizer(self.data[idx][0].lower())]
            s2 = [tok.text for tok in spacy_en.tokenizer(self.data[idx][1].lower())]
            s1 = self.clean_data(s1)
            s2 = self.clean_data(s2)
            for token in s1:
                if token not in self.wordFrequency:
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
        
            for token in s2:
                if token not in self.wordFrequency:
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
            
            self.data[idx] = (s1, s2)

        
    def handle_unkown_token(self):
        self.unkown_token = '<unk>'
        self.word2Index[self.unkown_token] = len(self.word2Index)
        self.index2Word[len(self.index2Word)] = self.unkown_token
        print(self.data[0])

        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]

            for i in range(len(s1)):
                word = s1[i]
                if self.wordFrequency[word] < 2:
                    s1[i] = self.unkown_token
            
            for i in range(len(s2)):
                word = s2[i]
                if self.wordFrequency[word] < 2:
                    s2[i] = self.unkown_token
            
            self.data[idx] = (s1, s2)
        
        print(self.data[0])
        
        self.wordFrequency = dict()
        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]

            for token in s1:
                self.vocab.add(token)
                if token not in self.wordFrequency:
                    self.word2Index[token] = len(self.word2Index)
                    self.index2Word[len(self.index2Word)] = token
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
        
            for token in s2:
                self.vocab.add(token)
                if token not in self.wordFrequency:
                    self.word2Index[token] = len(self.word2Index)
                    self.index2Word[len(self.index2Word)] = token
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
            
            s1 = ['<sos>'] + s1 + ['<eos>']
            s2 = ['<sos>'] + s2 + ['<eos>']

            if len(s1) > len(s2):
                s2 = s2 + ['<pad>'] * (len(s1) - len(s2))
            elif len(s1) < len(s2):
                s1 = s1 + ['<pad>'] * (len(s2) - len(s1))
            
            self.data[idx] = (s1, s2)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return (
            torch.tensor([self.word2Index[word] for word in self.data[idx][0]]),
            torch.tensor([self.word2Index[word] for word in self.data[idx][1]]),
            torch.tensor([float(self.scores[idx])])
        )

In [4]:
data = MyDataset('../stsbenchmark/sts-train.csv')

Reading data...
Preprocessing data...


100%|██████████| 5708/5708 [00:00<00:00, 7383.77it/s] 


Handling unknown tokens...
(['a', 'plane', 'is', 'taking', 'off'], ['an', 'air', 'plane', 'is', 'taking', 'off'])


100%|██████████| 5708/5708 [00:00<00:00, 353400.06it/s]


(['a', 'plane', 'is', 'taking', 'off'], ['an', 'air', 'plane', 'is', 'taking', 'off'])


100%|██████████| 5708/5708 [00:00<00:00, 143321.18it/s]


In [5]:
data[1]
print(len(data.vocab))

7734


In [6]:
class LSTMEncoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, batch_size, dropout):
        super(LSTMEncoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.embedding = nn.Embedding(self.input_size, self.embed_size)
        self.lstm = nn.LSTM(self.embed_size, self.hidden_size, self.num_layers, dropout=dropout)
    
    def init_states(self):
        h0 = Variable(torch.randn(self.num_layers, self.batch_size, self.hidden_size))
        c0 = Variable(torch.randn(self.num_layers, self.batch_size, self.hidden_size))
        return h0, c0

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, self.batch_size, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

In [7]:
class LSTMSiamese(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, batch_size, fc_size, dropout):
        super(LSTMSiamese, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.embed_size = embed_size
        self.fc_size = 5 * hidden_size
        self.dropout = dropout
        self.encoder = LSTMEncoder(input_size, embed_size, hidden_size, num_layers, batch_size, dropout)
        self.classifier = nn.Sequential(
            nn.Linear(self.fc_size, int(self.fc_size/2)),
            nn.Linear(int(self.fc_size/2), 5)
        )

    def forward(self, input1, input2):
        h1, c1 = self.encoder.init_states()
        h2, c2 = self.encoder.init_states()

        for i in range(len(input1)):
            output1, (h1, c1) = self.encoder(input1[i], (h1, c1))
        
        for i in range(len(input2)):
            output2, (h2, c2) = self.encoder(input2[i], (h2, c2))

        features = torch.cat((output1,torch.abs(output1 - output2),output2,output1*output2, (output1+output2)/2), 2)

        output = self.classifier(features)
        return output
        

In [8]:
EPOCHS = 10
BATCH_SIZE = 32
model = LSTMSiamese(len(data.vocab), 300, 300, 2, 1, 300, 0.2)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [9]:
for epoch in range(EPOCHS):
    loss_sum = 0

    train_loss = []
    train_loss_sum = []
    # dataloader
    train_dataloader = DataLoader(dataset=data)
    
    for idx, data in enumerate(train_dataloader):
        input1, input2, score = data
        print(input1.shape, input2.shape)
        output = model(input1, input2)
        loss = criterion(output, score)
        loss_sum += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        if idx % 100 == 0:
            print(f'Epoch: {epoch}/{EPOCHS}, Batch: {idx}/{len(train_dataloader)}, Loss: {loss.item()}')
    train_loss_sum.append(loss_sum)
    print(f'Epoch: {epoch}/{EPOCHS}, Loss: {loss_sum}')

torch.save(model.state_dict(), 'model_.pth')

torch.Size([1, 8]) torch.Size([1, 8])


RuntimeError: input.size(-1) must be equal to input_size. Expected 300, got 2400