In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy
import spacy
import numpy as np
import random
import math
import time
from tqdm import tqdm
import re

In [2]:
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed(1)

In [3]:
en_spacy = spacy.load('en_core_web_sm')
fr_spacy = spacy.load('fr_core_news_sm')

In [4]:
BATCH_SIZE = 16
MAX_EPOCHS = 10
EMBEDDING_SIZE = 128
HIDDEN_DIMENSION = 128
LAYERS = 3

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cuda')

In [6]:
class Data(Dataset):
    def __init__(self, en_location, fr_location):
        self.en_location = en_location
        self.fr_location = fr_location
        self.corpusSize = 0
        self.processed_en_dataset = list()
        self.processed_fr_dataset = list()
        self.ENvocab = set()
        self.ENword2Index = dict()
        self.ENindex2Word = list()
        self.ENwordFrequency = dict()
        self.ENvocabSize = 1
        self.FRvocab = set()
        self.FRword2Index = dict()
        self.FRindex2Word = list()
        self.FRwordFrequency = dict()
        self.FRvocabSize = 1
        self.load_data()
        self.preprocessor()
        self.vocabBuilder()
        self.modifier()
        self.combined_data = list()
        self.combineData()
    
    def load_data(self):
        with open(self.en_location, 'r') as inFile:
            self.en_dataset = inFile.readlines()
            self.en_dataset = self.en_dataset[:200000]
        
        with open(self.fr_location, 'r') as inFile:
            self.fr_dataset = inFile.readlines()
            self.fr_dataset = self.fr_dataset[:200000]
        self.corpusSize = len(self.en_dataset)
    
    def english_tokenizer(self, text):
        return [tok.text for tok in en_spacy.tokenizer(text)]

    def french_tokenizer(self, text):
        return [tok.text for tok in fr_spacy.tokenizer(text)]

    def cleaner(self,sentence):
        """
            replacing !,?,. with . and removing other punctuations
            
            Arguments:
                tokenized corpuse (list)

            Returns:
                cleaned corpus (list)
        """
        import string

        cleaned_corpus = list()

        new_sentence = list()
        for token in sentence:
            if token in string.punctuation or token == '\n':
                continue
            else:
                new_sentence.append(token)


        return new_sentence
    
    def vocabBuilder(self):
        for sentence in self.processed_en_dataset:
            for word in sentence:
                self.ENvocab.add(word)
                if word not in self.ENword2Index:
                    self.ENword2Index[word] = self.ENvocabSize
                    self.ENindex2Word.append(word)
                    self.ENwordFrequency[word] = 1
                    self.ENvocabSize += 1
                
                else:
                    self.ENwordFrequency[word] += 1
        
        for sentence in self.processed_fr_dataset:
            for word in sentence:
                self.FRvocab.add(word)
                if word not in self.FRword2Index:
                    self.FRword2Index[word] = self.FRvocabSize
                    self.FRindex2Word.append(word)
                    self.FRwordFrequency[word] = 1
                    self.FRvocabSize += 1
                
                else:
                    self.FRwordFrequency[word] += 1
    
    def preprocessor(self):
        for sentence in self.en_dataset:
            tokenized_sentence = self.english_tokenizer(sentence)
            cleaned_sentence = self.cleaner(tokenized_sentence)
            normalized_sentence = ['<SOS>']
            for token in cleaned_sentence:
                normalized_sentence.append(token.lower())
            normalized_sentence = normalized_sentence + ['<EOS>']

            self.processed_en_dataset.append(normalized_sentence)

        for sentence in self.fr_dataset:
            tokenized_sentence = self.french_tokenizer(sentence)
            cleaned_sentence = self.cleaner(tokenized_sentence)
            normalized_sentence = ['<SOS>']
            for token in cleaned_sentence:
                normalized_sentence.append(token.lower())
            normalized_sentence = normalized_sentence + ['<EOS>']

            self.processed_fr_dataset.append(normalized_sentence)
    
    def modifier(self):
        for i in range(self.corpusSize):
            for j in range(1, len(self.processed_en_dataset[i]) - 1):
                if self.ENwordFrequency[self.processed_en_dataset[i][j]] < 2:
                    self.processed_en_dataset[i][j] = '<OOV>'
        
        for i in range(self.corpusSize):
            for j in range(1, len(self.processed_fr_dataset[i]) - 1):
                if self.FRwordFrequency[self.processed_fr_dataset[i][j]] < 2:
                    self.processed_fr_dataset[i][j] = '<OOV>'

        self.ENvocab = set()
        self.ENword2Index = dict()
        self.ENindex2Word = list()
        self.ENwordFrequency = dict()
        self.ENvocabSize = 1
        self.FRvocab = set()
        self.FRword2Index = dict()
        self.FRindex2Word = list()
        self.FRwordFrequency = dict()
        self.FRvocabSize = 1

        for sentence in self.processed_en_dataset:
            for word in sentence:
                self.ENvocab.add(word)
                if word not in self.ENword2Index:
                    self.ENword2Index[word] = self.ENvocabSize
                    self.ENindex2Word.append(word)
                    self.ENwordFrequency[word] = 1
                    self.ENvocabSize += 1
                
                else:
                    self.ENwordFrequency[word] += 1
        
        for sentence in self.processed_fr_dataset:
            for word in sentence:
                self.FRvocab.add(word)
                if word not in self.FRword2Index:
                    self.FRword2Index[word] = self.FRvocabSize
                    self.FRindex2Word.append(word)
                    self.FRwordFrequency[word] = 1
                    self.FRvocabSize += 1
                
                else:
                    self.FRwordFrequency[word] += 1
    
    def combineData(self):
        for idx in range(self.corpusSize):
            self.combined_data.append((self.processed_en_dataset[idx], self.processed_fr_dataset[idx]))

    def __len__(self):
        return self.corpusSize
    
    def __getitem__(self, index):
        return (
            np.array([self.ENword2Index[word] for word in self.combined_data[index][0]]),
            np.array([self.FRword2Index[word] for word in self.combined_data[index][1]])
        )
        

In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, en_file_location, fr_file_location, sequence_length):
        self.en_file_location = en_file_location
        self.fr_file_location = fr_file_location
        self.sequence_length = sequence_length
        self.initialize_data()
        self.modify()
        self.combined_data = list()
        self.combineData()

    def initialize_data(self):
        with open(self.en_file_location, "r") as inFile:
            enData = inFile.readlines()
        
        with open(self.fr_file_location, "r") as inFile:
            frData = inFile.readlines()

        en_tokenized_data = self.tokenizer(enData)
        self.ENdataset = self.cleaner(en_tokenized_data)
        (
            self.ENword2Index,
            self.ENindex2Word,
            self.ENvocab_size,
            self.ENvocab,
            self.ENwordFrequency
        ) = self.vocabBuilder(self.ENdataset)
        self.ENwords = list()
        for sentence in self.ENdataset:
            for word in sentence:
                self.ENwords.append(word)

        self.ENwords_indexes = [self.ENword2Index[word] for word in self.ENwords]

        fr_tokenized_data = self.tokenizer(frData)
        self.FRdataset = self.cleaner(fr_tokenized_data)
        (
            self.FRword2Index,
            self.FRindex2Word,
            self.FRvocab_size,
            self.FRvocab,
            self.FRwordFrequency
        ) = self.vocabBuilder(self.FRdataset)
        self.FRwords = list()
        for sentence in self.FRdataset:
            for word in sentence:
                self.FRwords.append(word)

        self.FRwords_indexes = [self.FRword2Index[word] for word in self.FRwords]

    def tokenizer(self,corpus):
        """
            tokenizes the corpus
            
            Arguments:
                corpus (list)

            Returns:
                tokenized corpus (list)
        """
        hashtag_regex = "#[a-zA-Z0-9]+"
        url_regex = "((http|https)://)(www.)?[a-zA-Z0-9@:%._\\+~#?&//=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%._\\+~#?&//=]*)"
        mention_regex = "@\w+"

        processed_corpus = list()

        for tweet in corpus:
            normalized_tweet = tweet.lower()
            hashtag_removed_tweet = re.sub(hashtag_regex, "<HASHTAG>", normalized_tweet)
            website_removed_tweet = re.sub(url_regex, "<URL>", hashtag_removed_tweet)
            mention_removed_tweet = re.sub(
                mention_regex, "<MENTION>", website_removed_tweet
            )
            punctuation_repeat_removed = re.sub(
                r"(\W)(?=\1)", "", mention_removed_tweet
            )
            tokenized_tweet = punctuation_repeat_removed.split()

            cleaned_tokenized_tweet = list()
            for token in tokenized_tweet:
                if token not in ["<HASHTAG>", "<URL>", "<MENTION>", "<OOV>"]:
                    split_tokens = "".join(
                        (char if char.isalpha() or char.isnumeric() else f" {char} ")
                        for char in token
                    ).split()
                    for cleaned_token in split_tokens:
                        cleaned_tokenized_tweet.append(cleaned_token)

                else:
                    cleaned_tokenized_tweet.append(token)
            cleaned_tokenized_tweet = ['<SOS>'] + cleaned_tokenized_tweet + ['<EOS>']
            processed_corpus.append(cleaned_tokenized_tweet)

        return processed_corpus

    def cleaner(self,corpus):
        """
            replacing !,?,. with . and removing other punctuations
            
            Arguments:
                tokenized corpuse (list)

            Returns:
                cleaned corpus (list)
        """
        import string

        cleaned_corpus = list()

        for sentence in corpus:
            new_sentence = list()
            for token in sentence:
                if token in ["!", ".", "?"]:
                    new_sentence.append(".")
                elif token in string.punctuation:
                    continue
                else:
                    new_sentence.append(token)

            cleaned_corpus.append(new_sentence)

        return cleaned_corpus

    def vocabBuilder(self,corpus):
        """
            Builds the vocabulary of the input dataset.

            Arguments:
                The cleaned tokenized the dataset
            
            Returns:
                Word to Index dict, Index to Word list, Number of Unique Words, Set of Vocab
        """
        word2Index = dict()
        index2Word = list()
        vocab = set()
        wordFrequency = dict()

        n_unique_words = 0

        for sentence in corpus:
            for word in sentence:
                vocab.add(word)
                if word not in word2Index:
                    word2Index[word] = n_unique_words
                    index2Word.append(word)
                    n_unique_words += 1
                    wordFrequency[word] = 1
                else:
                    wordFrequency[word] += 1

        return word2Index, index2Word, n_unique_words, vocab, wordFrequency
    
    def modify(self):
        for i in range(len(self.ENdataset)):
            for j in range(len(self.ENdataset[i])):
                if self.ENwordFrequency[self.ENdataset[i][j]] < 2:
                    self.ENdataset[i][j] = '<OOV>'
                elif any(character.isdigit() for character in self.ENdataset[i][j]):
                    self.ENdataset[i][j] = '<OOV>'

        print(self.ENvocab_size)
        
        self.ENdataset = self.cleaner(self.ENdataset)
        (
            self.ENword2Index,
            self.ENindex2Word,
            self.ENvocab_size,
            self.ENvocab,
            self.ENwordFrequency
        ) = self.vocabBuilder(self.ENdataset)
        self.ENwords = list()
        for sentence in self.ENdataset:
            for word in sentence:
                self.ENwords.append(word)

        self.ENwords_indexes = [self.ENword2Index[word] for word in self.ENwords]

        for i in range(len(self.FRdataset)):
            for j in range(len(self.FRdataset[i])):
                if self.FRwordFrequency[self.FRdataset[i][j]] < 2:
                    self.FRdataset[i][j] = '<OOV>'
                elif any(character.isdigit() for character in self.FRdataset[i][j]):
                    self.FRdataset[i][j] = '<OOV>'

        self.FRdataset = self.cleaner(self.FRdataset)
        (
            self.FRword2Index,
            self.FRindex2Word,
            self.FRvocab_size,
            self.FRvocab,
            self.FRwordFrequency
        ) = self.vocabBuilder(self.FRdataset)
        self.FRwords = list()
        for sentence in self.FRdataset:
            for word in sentence:
                self.FRwords.append(word)

        self.FRwords_indexes = [self.FRword2Index[word] for word in self.FRwords]

        print(self.FRvocab_size)

    def combineData(self):
        for idx in range(len(self.ENdataset)):
            self.combined_data.append((self.ENdataset[idx], self.FRdataset[idx]))

    def __len__(self):
        return len(self.FRdataset)

    def __getitem__(self, index):
        return (
            np.array(self.ENwords_indexes[index : index + self.sequence_length]),
            np.array(self.FRwords_indexes[index : index + self.sequence_length])
        )

In [8]:
data = Dataset("./data/ted-talks-corpus/train.en", "./data/ted-talks-corpus/train.fr", 3)

21589
15823


In [9]:
data.combined_data = sorted(data.combined_data, key=lambda x:len(x[0]))

In [10]:
def collate(data):
    X = [x[0] for x in data]
    Y = [y[1] for y in data]

    x_len = max([len(x) for x in X])
    y_len = max([len(y) for y in Y])

    padded_x = np.zeros((BATCH_SIZE, x_len))
    padded_y = np.zeros((BATCH_SIZE, y_len))

    for idx, (x, y) in enumerate(zip(X,Y)):
        padded_x[idx] = numpy.pad(x, (0,x_len - len(x)))
        padded_y[idx] = numpy.pad(y, (0,y_len - len(y))) 
    
    return (
        torch.tensor(padded_x, dtype=torch.long).t().to(device),
        torch.tensor(padded_y, dtype=torch.long).t().to(device)
    )

In [11]:
dataloader = DataLoader(data, shuffle=False, collate_fn=collate, batch_size=BATCH_SIZE, drop_last=True)

In [12]:
next(iter(data))

(array([0, 1, 2]), array([0, 1, 2]))

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout ,output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = nn.Dropout(dropout)
        self.embedding_layer = nn.Embedding(self.input_dim, self.embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout = dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, source):
        embedding = self.dropout(self.embedding_layer(source))
        output, (state_h, state_c) = self.lstm(embedding)
        return state_h, state_c

In [14]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding_layer = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout = dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, source, state_h, state_c):
        source = source.unsqueeze(0)
        embedding = self.dropout(self.embedding_layer(source))
        output, (state_h, state_c) = self.lstm(embedding, (state_h, state_c))
        pred = self.fc(output.squeeze(0))
        return pred, state_h, state_c

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, ground_truth, force_teaching_ratio=0.5):
        # ground_truth.shape[0] = lenght of the sentence
        # ground_truth.shape[1] = batch_size
        state_h, state_c = self.encoder(source)
        outputs = torch.zeros(ground_truth.shape[0], ground_truth.shape[1], self.decoder.output_dim).to(device)
        decoder_input = ground_truth[0,:]

        for idx in range(1,ground_truth.shape[0]):
            output, state_h, state_c = self.decoder(decoder_input, state_h, state_c)
            outputs[idx] = output
            force = random.random() < force_teaching_ratio
            predicted = output.argmax(1)
            decoder_input = ground_truth[idx] if force else predicted 
        
        return outputs

In [16]:
def train(model, optimizer, criterion, dataloader):
    
    for epoch in range(MAX_EPOCHS):
        model.train().to(device)
        epoch_loss = 0.0
        for x,y in tqdm(dataloader):
            optimizer.zero_grad()
            pred = model(x, y)
            pred = pred[1:].reshape(-1,pred.shape[-1])
            y = y[1:].reshape(-1)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        print({ 'epoch': epoch, 'loss':epoch_loss/len(dataloader) })        

In [17]:
class RNN(nn.Module):
    def __init__(
        self,
        dataset,
        lstm_size=128,
        n_layers=3,
        embedding_dim=128,
    ):
        super(RNN, self).__init__()
        self.vocab_size = dataset.vocab_size
        self.input_dim = lstm_size
        self.embedding_dim = embedding_dim
        self.lstm_hidden_dim = lstm_size
        self.n_layers = n_layers
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size, embedding_dim=self.embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=self.input_dim,
            hidden_size=self.lstm_hidden_dim,
            num_layers=self.n_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_hidden_dim, self.vocab_size)
        self.output_dim = data.FRvocabSize

    def forward(self, x, prev_state=None):
        if prev_state == None:
            prev_state = self.init_state(1)
        embed = self.embedding_layer(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (
            torch.zeros(self.n_layers, sequence_length, self.lstm_hidden_dim).to(device),
            torch.zeros(self.n_layers, sequence_length, self.lstm_hidden_dim).to(device),
        )

In [18]:
enc = Encoder(12824,EMBEDDING_SIZE, HIDDEN_DIMENSION, LAYERS, 0.5, 12824)
dec = Decoder(15821,EMBEDDING_SIZE, HIDDEN_DIMENSION, LAYERS, 0.5)
enc.load_state_dict(torch.load('./models/encoder_weights.pth'))
dec.load_state_dict(torch.load('./models/decoder_weights.pth'))
model = Seq2Seq(enc, dec)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [19]:
train(model, optimizer, criterion, dataloader)

100%|██████████| 1875/1875 [00:15<00:00, 117.71it/s]


{'epoch': 0, 'loss': 7.575274179077148}


100%|██████████| 1875/1875 [00:16<00:00, 111.39it/s]


{'epoch': 1, 'loss': 6.118623185602824}


100%|██████████| 1875/1875 [00:17<00:00, 106.59it/s]


{'epoch': 2, 'loss': 5.848414396158854}


100%|██████████| 1875/1875 [00:17<00:00, 106.30it/s]


{'epoch': 3, 'loss': 5.734406038792928}


100%|██████████| 1875/1875 [00:18<00:00, 103.78it/s]


{'epoch': 4, 'loss': 5.667627642186483}


100%|██████████| 1875/1875 [00:18<00:00, 101.98it/s]


{'epoch': 5, 'loss': 5.620791695912679}


100%|██████████| 1875/1875 [00:18<00:00, 101.51it/s]


{'epoch': 6, 'loss': 5.574699334462483}


100%|██████████| 1875/1875 [00:18<00:00, 99.77it/s] 


{'epoch': 7, 'loss': 5.531569848378499}


100%|██████████| 1875/1875 [00:18<00:00, 100.50it/s]


{'epoch': 8, 'loss': 5.484510994593302}


100%|██████████| 1875/1875 [00:19<00:00, 97.61it/s] 

{'epoch': 9, 'loss': 5.438163017654419}





In [20]:
torch.save(model, './models/MT2.pth')

In [21]:
print(model)

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding_layer): Embedding(12824, 128)
    (lstm): LSTM(128, 128, num_layers=3, dropout=0.5)
    (fc): Linear(in_features=128, out_features=12824, bias=True)
  )
  (decoder): Decoder(
    (embedding_layer): Embedding(15821, 128)
    (lstm): LSTM(128, 128, num_layers=3, dropout=0.5)
    (fc): Linear(in_features=128, out_features=15821, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [22]:
def translate(text):
  model.eval()
  with torch.no_grad():
    tokens = text.split(' ')
    print(tokens)
    for idx in range(len(tokens)):
      if tokens[idx] not in data.ENvocab:
        tokens[idx] = '<OOV>'
    
    tokens = ['<SOS>'] + tokens + ['<EOS>']
    src_indexes = [data.ENword2Index[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_tensor = src_tensor.reshape(-1,1)

    output = model(src_tensor, src_tensor)
    output_dim = output.shape[-1]
    output = output.view(-1, output_dim)
    indices = torch.argmax(output,dim=1).tolist()
    return [data.FRindex2Word[x] for x in indices]

In [23]:
print(translate("and we're going to tell you some stories from the sea"))

['and', "we're", 'going', 'to', 'tell', 'you', 'some', 'stories', 'from', 'the', 'sea']
['<SOS>', 'la', 'est', '.', '<EOS>', '<SOS>', 'la', 'musique', '.', 'la', 'la', 'ce', 'ce']


In [24]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

def calculate_bleu_score(source, target):
    translated = translate(source)[1:]
    target_tokenized = data.french_tokenizer(target)
    score = sentence_bleu([target_tokenized], translated, weights=(0.75,0.25,0,0))
    return score, translated

In [30]:
total_score = 0
candidates, references = list(), list()
scores = []
for source, target in zip(data.ENdataset, data.FRdataset):
    candidates.append(translate(" ".join(source))[1:])
    references.append([target.split(" ")])
    score, translated = calculate_bleu_score(source, target)
    total_score += score
    scores.append(f'{" ".join(translated)}\t{score}\n')

['<SOS>', 'david', 'gallo', 'this', 'is', 'bill', 'lange', '.', 'i', 'm', 'dave', 'gallo', '.', '<EOS>']


AttributeError: 'Dataset' object has no attribute 'french_tokenizer'

In [28]:
print(corpus_bleu(references, candidates, weights=(1,0,0,0)))


ZeroDivisionError: Fraction(0, 0)

In [None]:
print(total_score/20000)

0.0008333777127709976


In [None]:
outFile = open('./bleu_scores/2019115002_MT_train_scores.txt', 'a')

for line in scores:
    outFile.write(line)

In [None]:
with open('./data/ted-talks-corpus/test.en', 'r') as inFile:
    en_dataset = inFile.readlines()
        
with open('./data/ted-talks-corpus/test.fr', 'r') as inFile:
    fr_dataset = inFile.readlines()

In [None]:
total_score = 0
candidates, references = list(), list()
scores = []
for source, target in zip(en_dataset, fr_dataset):
    try:
        candidates.append(translate(source)[1:])
        references.append([data.french_tokenizer(target)])
        score, translated = calculate_bleu_score(source, target)
        total_score += score
        scores.append(f'{" ".join(translated)}\t{score}\n')
    except:
        continue

In [None]:
print(corpus_bleu(references, candidates, weights=(1,0,0,0)))

0.03098522378317546


In [None]:
outFile = open('./bleu_scores/2019115002_MT_test_scores.txt', 'a')

for line in scores:
    outFile.write(line)