In [1]:
import enum
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import math

torch.manual_seed(1)

<torch._C.Generator at 0x7f828372df30>

In [2]:
BATCH_SIZE = 8
EPOCHS = 10
SEQUENCE_LENGTH = 4

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, file_location, sequence_length):
        self.file_location = file_location
        self.sequence_length = sequence_length
        self.initialize_data()
        self.modify()

    def initialize_data(self):
        with open(self.file_location, "r") as inFile:
            data = inFile.readlines()

        tokenized_data = self.tokenizer(data)
        self.dataset = self.cleaner(tokenized_data)
        (
            self.word2Index,
            self.index2Word,
            self.vocab_size,
            self.vocab,
            self.wordFrequency
        ) = self.vocabBuilder(self.dataset)
        self.words = list()
        for sentence in self.dataset:
            for word in sentence:
                self.words.append(word)

        self.words_indexes = [self.word2Index[word] for word in self.words]

    def tokenizer(self,corpus):
        """
            tokenizes the corpus
            
            Arguments:
                corpus (list)

            Returns:
                tokenized corpus (list)
        """
        hashtag_regex = "#[a-zA-Z0-9]+"
        url_regex = "((http|https)://)(www.)?[a-zA-Z0-9@:%._\\+~#?&//=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%._\\+~#?&//=]*)"
        mention_regex = "@\w+"

        processed_corpus = list()

        for tweet in corpus:
            normalized_tweet = tweet.lower()
            hashtag_removed_tweet = re.sub(hashtag_regex, "<HASHTAG>", normalized_tweet)
            website_removed_tweet = re.sub(url_regex, "<URL>", hashtag_removed_tweet)
            mention_removed_tweet = re.sub(
                mention_regex, "<MENTION>", website_removed_tweet
            )
            punctuation_repeat_removed = re.sub(
                r"(\W)(?=\1)", "", mention_removed_tweet
            )
            tokenized_tweet = punctuation_repeat_removed.split()

            cleaned_tokenized_tweet = list()
            for token in tokenized_tweet:
                if token not in ["<HASHTAG>", "<URL>", "<MENTION>", "<OOV>"]:
                    split_tokens = "".join(
                        (char if char.isalpha() or char.isnumeric() else f" {char} ")
                        for char in token
                    ).split()
                    for cleaned_token in split_tokens:
                        cleaned_tokenized_tweet.append(cleaned_token)

                else:
                    cleaned_tokenized_tweet.append(token)

            processed_corpus.append(cleaned_tokenized_tweet)

        return processed_corpus

    def cleaner(self,corpus):
        """
            replacing !,?,. with . and removing other punctuations
            
            Arguments:
                tokenized corpuse (list)

            Returns:
                cleaned corpus (list)
        """
        import string

        cleaned_corpus = list()

        for sentence in corpus:
            new_sentence = list()
            for token in sentence:
                if token in ["!", ".", "?"]:
                    new_sentence.append(".")
                elif token in string.punctuation:
                    continue
                else:
                    new_sentence.append(token)

            cleaned_corpus.append(new_sentence)

        return cleaned_corpus

    def vocabBuilder(self,corpus):
        """
            Builds the vocabulary of the input dataset.

            Arguments:
                The cleaned tokenized the dataset
            
            Returns:
                Word to Index dict, Index to Word list, Number of Unique Words, Set of Vocab
        """
        word2Index = dict()
        index2Word = list()
        vocab = set()
        wordFrequency = dict()

        n_unique_words = 0

        for sentence in corpus:
            for word in sentence:
                vocab.add(word)
                if word not in word2Index:
                    word2Index[word] = n_unique_words
                    index2Word.append(word)
                    n_unique_words += 1
                    wordFrequency[word] = 1
                else:
                    wordFrequency[word] += 1

        return word2Index, index2Word, n_unique_words, vocab, wordFrequency
    
    def modify(self):
        for i in range(len(self.dataset)):
            for j in range(len(self.dataset[i])):
                if self.wordFrequency[self.dataset[i][j]] < 2:
                    self.dataset[i][j] = '<OOV>'
                elif any(character.isdigit() for character in self.dataset[i][j]):
                    self.dataset[i][j] = '<OOV>'
        
        self.dataset = self.cleaner(self.dataset)
        (
            self.word2Index,
            self.index2Word,
            self.vocab_size,
            self.vocab,
            self.wordFrequency
        ) = self.vocabBuilder(self.dataset)
        self.words = list()
        for sentence in self.dataset:
            for word in sentence:
                self.words.append(word)

        self.words_indexes = [self.word2Index[word] for word in self.words]

    def __len__(self):
        return self.vocab_size - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index : index + self.sequence_length]).to(device),
            torch.tensor(
                self.words_indexes[index + 1 : index + self.sequence_length + 1]
            ).to(device),
        )


In [5]:
class RNN(nn.Module):
    def __init__(
        self,
        dataset,
        lstm_size=128,
        n_layers=3,
        embedding_dim=128,
    ):
        super(RNN, self).__init__()
        self.vocab_size = dataset.vocab_size
        self.input_dim = lstm_size
        self.embedding_dim = embedding_dim
        self.lstm_hidden_dim = lstm_size
        self.n_layers = n_layers
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size, embedding_dim=self.embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=self.input_dim,
            hidden_size=self.lstm_hidden_dim,
            num_layers=self.n_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_hidden_dim, self.vocab_size)

    def forward(self, x, prev_state):
        embed = self.embedding_layer(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (
            torch.zeros(self.n_layers, sequence_length, self.lstm_hidden_dim).to(device),
            torch.zeros(self.n_layers, sequence_length, self.lstm_hidden_dim).to(device),
        )

In [6]:
dataset = Dataset("./data/europarl-corpus/train.europarl", SEQUENCE_LENGTH)

In [7]:
model = RNN(dataset).to(device)

model.train().to(device)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    state_h, state_c = model.init_state(SEQUENCE_LENGTH)
    net_loss = 0.0

    for (x,y) in tqdm(dataloader):
        optimizer.zero_grad()
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        loss = criterion(y_pred.transpose(1,2),y)
        net_loss += loss.item()

        state_h = state_h.detach()
        state_c = state_c.detach()

        loss.backward()
        optimizer.step()

    print({ 'epoch': epoch, 'loss': net_loss/len(dataloader) })

100%|██████████| 1138/1138 [00:11<00:00, 98.31it/s]


{'epoch': 0, 'loss': 6.449970442088082}


100%|██████████| 1138/1138 [00:11<00:00, 98.53it/s]


{'epoch': 1, 'loss': 5.451280767762598}


100%|██████████| 1138/1138 [00:11<00:00, 97.77it/s]


{'epoch': 2, 'loss': 4.896287033763208}


100%|██████████| 1138/1138 [00:11<00:00, 97.49it/s]


{'epoch': 3, 'loss': 4.309450437084024}


100%|██████████| 1138/1138 [00:11<00:00, 96.91it/s]


{'epoch': 4, 'loss': 3.811718226736166}


100%|██████████| 1138/1138 [00:11<00:00, 96.96it/s]


{'epoch': 5, 'loss': 3.273213569539712}


100%|██████████| 1138/1138 [00:11<00:00, 97.21it/s]


{'epoch': 6, 'loss': 2.7680134095762234}


100%|██████████| 1138/1138 [00:11<00:00, 96.97it/s]


{'epoch': 7, 'loss': 2.302195867644672}


100%|██████████| 1138/1138 [00:11<00:00, 97.22it/s]


{'epoch': 8, 'loss': 1.8665471038861308}


100%|██████████| 1138/1138 [00:11<00:00, 97.10it/s]

{'epoch': 9, 'loss': 1.4455520478330932}





In [12]:
def predict(dataset, model, text, next_words=10):
    model.eval()

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word2Index[w] for w in words[i:]]]).to(device)
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        word_index = torch.argmax(last_word_logits).item()
        words.append(dataset.index2Word[word_index])

    return words

In [12]:
print(predict(dataset, model, text='although as you will have seen'))

['although', 'as', 'you', 'will', 'have', 'seen', 'the', 'commission', 'has', 'now', 'effect', 'in', 'the', 'creation', 'of', 'jobs']


In [8]:
torch.save(model, './models/LM1.pth')

In [7]:
model = torch.load("./models/LM1.pth").to(device)

In [8]:
def perplexity(dataset, model, sequence_length):
    model.eval()
    total_probability = 0.0
    sentence_number = 0
    for sentence in tqdm(dataset.dataset):
        words = sentence
        state_h, state_c = model.init_state(sequence_length)
        net_probability = 1.0
        for i in range(len(words)-sequence_length):
            x = torch.tensor([[dataset.word2Index[w] for w in words[i:i+sequence_length]]], device=device)
            y = [dataset.word2Index[w] for w in words[i+1:i+sequence_length+1]]
            # print(x)
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))

            last_word_logits = y_pred[0][-1]
            # print(last_word_logits.shape)
            last_word_logits = nn.functional.softmax(last_word_logits)
            probability = last_word_logits[y[-1]]
            word_index = torch.argmax(last_word_logits).item()
            words.append(dataset.index2Word[word_index])
            net_probability *= probability

        
        if len(sentence) == 0 or len(sentence) < sequence_length:
            continue

        N = len(sentence)
        net_probability = (1/net_probability)**(1/N)
        # print({'sentence':sentence, 'perplexity':net_probability})
        total_probability += net_probability
        sentence_number += 1

    return total_probability / sentence_number

In [9]:
print(perplexity(dataset, model, SEQUENCE_LENGTH))

  last_word_logits = nn.functional.softmax(last_word_logits)
  2%|▏         | 302/20000 [00:04<04:31, 72.55it/s]


RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

In [None]:
with open("./models/lm_en_vocab.txt","w") as outFile:
    outFile.write(str(dataset.vocab))

In [None]:
import ast
with open("./models/lm_en_vocab.txt","r") as inFile:
    vocab = ast.literal_eval(inFile.read())

In [None]:
dataset = Dataset("./data/europarl-corpus/test.europarl", 3)

In [None]:
print(perplexity(dataset, model, 3))

  last_word_logits = nn.functional.softmax(last_word_logits)
 90%|█████████ | 905/1000 [00:15<00:01, 56.85it/s] 


RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED