In [None]:
!pip install matplotlib
!pip install pandas

In [2]:
import re
from sklearn.model_selection import train_test_split
import spacy
import fasttext.util
import gensim
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from gensim.models import FastText
import pickle
import os
import gc
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Directory that holds the raw corpus, relative to this notebook.
DATA_DIR = '../../data/'

# Validation / test sizes handed to LanguageModelDataset.get_splits below.
VAL_SPLIT, TEST_SPLIT = 0.2, 0.1

In [4]:
class FastTextEmbeddingGenerator:
    """Thin wrapper that maps words to embedding vectors via an injected model.

    The model is attached with `set_model`, so the caller loads the (large)
    embedding model once and shares it.
    """

    def __init__(self):
        # No model is available until `set_model` is called.
        self.model = None

    def set_model(self, model):
        """Attach the embedding model used by `get_embedding`."""
        self.model = model

    def get_embedding(self, word):
        """Return the embedding vector of `word`.

        Models exposing `get_word_vector` (e.g. fastText) are queried through
        it directly. The previous `word in self.model` pre-check was dropped:
        both of its branches returned the same vector, and on a fastText model
        the membership test scans the whole vocabulary list for every lookup.
        Models without `get_word_vector` are indexed with `model[word]`.

        Args:
            word: the token to embed.

        Returns:
            Whatever vector object the underlying model returns for `word`.

        Raises:
            RuntimeError: if no model has been attached with `set_model`.
        """
        if self.model is None:
            raise RuntimeError("No embedding model set; call set_model() first.")

        lookup = getattr(self.model, "get_word_vector", None)
        if lookup is not None:
            return lookup(word)
        return self.model[word]
        
# Load the pretrained English fastText vectors and wire them into the generator.
# `fasttext.util.download_model` saves the binary into the current working
# directory, while this notebook expects it one directory up. Only download when
# the expected file is missing, and then load the path the download returned.
FASTTEXT_MODEL_PATH = '../cc.en.300.bin'
if not os.path.exists(FASTTEXT_MODEL_PATH):
    FASTTEXT_MODEL_PATH = fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model(FASTTEXT_MODEL_PATH)
embedding_gen = FastTextEmbeddingGenerator()
embedding_gen.set_model(ft)

In [5]:
class LanguageModelDataset:
    """Reads a plain-text corpus into tokenized sentences, splits them and builds a vocabulary.

    The file is read paragraph by paragraph (paragraphs are separated by blank
    lines). Each paragraph is split on ".", stripped of every character that is
    not a letter, digit or whitespace, and tokenized with spaCy. Sentences are
    stored as space-joined token strings.
    """

    def __init__(self, file_path, chunk_size=100000, max_sentences=100):
        """Load and preprocess the corpus.

        Args:
            file_path: path of the UTF-8 text file to read.
            chunk_size: stored on the instance; not used by the loader.
            max_sentences: reading stops at the first line seen after more than
                this many sentences have been collected (the original
                hard-coded cap was 100). Pass None to process the whole file.
        """
        self.file_path = file_path
        self.chunk_size = chunk_size
        self.max_sentences = max_sentences
        self.nlp = spacy.load("en_core_web_sm", disable=["ner"])
        self.sentences = self._process_large_file()
        self.train_sentences = None
        self.val_sentences = None
        self.test_sentences = None
        # Populated by build_vocab().
        self.vocab = None
        self.word2idx = None
        self.idx2word = None
        # Fixed cap, not computed from the data.
        self.max_sentence_length = 80

    def _split_paragraph(self, paragraph):
        """Split one paragraph on "." and return its cleaned, tokenized, non-empty sentences."""
        cleaned = []
        for fragment in paragraph.split("."):
            # Split first, then strip punctuation: removing "." before the
            # split would leave the paragraph as one unsplit sentence.
            fragment = re.sub(r"[^a-zA-Z0-9\s]+", '', fragment.strip()).strip()
            tokenized = self._preprocess_text(fragment)
            if tokenized:
                cleaned.append(tokenized)
        return cleaned

    def _process_large_file(self):
        """Stream the file and return the list of preprocessed sentences."""
        sentences = []
        paragraph_lines = []
        limit = self.max_sentences
        with open(self.file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if limit is not None and len(sentences) > limit:
                    break

                line = line.strip()
                if line:
                    paragraph_lines.append(line)
                elif paragraph_lines:
                    # A blank line closes the current paragraph.
                    sentences.extend(self._split_paragraph(" ".join(paragraph_lines)))
                    paragraph_lines = []

        # The last paragraph has no trailing blank line; it goes through the
        # same split-then-clean path as every other paragraph.
        if paragraph_lines:
            sentences.extend(self._split_paragraph(" ".join(paragraph_lines)))

        return sentences

    def _preprocess_text(self, text):
        """Tokenize `text` with spaCy and return the tokens joined by single spaces."""
        # Only token texts are needed, so run the tokenizer alone instead of
        # the whole pipeline (tagger, parser, ...).
        doc = self.nlp.make_doc(text)
        return " ".join(token.text for token in doc)

    def get_splits(self, val_size=10000, test_size=20000):
        """Split `self.sentences` in order (no shuffling) into train / val / test.

        `val_size + test_size` is forwarded to `train_test_split` as the
        held-out size, and the held-out part is then divided so that the test
        share is `test_size / (val_size + test_size)`. The notebook calls this
        with fractions (0.2 / 0.1); the defaults are counts.

        Returns:
            (train_sentences, val_sentences, test_sentences); the same lists
            are also stored on the instance.
        """
        holdout_size = val_size + test_size
        train_sentences, val_test_sentences = train_test_split(self.sentences, test_size=holdout_size, shuffle=False, random_state=42)
        test_fraction = test_size / holdout_size
        val_sentences, test_sentences = train_test_split(val_test_sentences, test_size=test_fraction, shuffle=False, random_state=42)
        self.train_sentences = train_sentences
        self.val_sentences = val_sentences
        self.test_sentences = test_sentences
        return train_sentences, val_sentences, test_sentences

    def build_vocab(self):
        """Build `vocab`, `word2idx` and `idx2word` from the training sentences.

        The vocabulary is sorted so word indices are identical on every run.
        An unordered set would give a different order per Python process and
        would invalidate the output indices of a previously saved model.

        Raises:
            RuntimeError: if `get_splits` has not been called yet.
        """
        if self.train_sentences is None:
            raise RuntimeError("Call get_splits() before build_vocab().")
        unique_words = {word for sentence in self.train_sentences for word in sentence.split()}
        self.vocab = sorted(unique_words)
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

In [6]:
# Load and tokenize the corpus, then carve out the train / val / test splits.
file_path = f"{DATA_DIR}Auguste_Maquet.txt"
dataset = LanguageModelDataset(file_path)
print(f"Total number of sentences: {len(dataset.sentences)}")
train_sentences, val_sentences, test_sentences = dataset.get_splits(
    val_size=VAL_SPLIT,
    test_size=TEST_SPLIT,
)

Total number of sentences: 199


In [7]:
# Report how many sentences ended up in each split.
for split_name, split_sentences in (
    ("train_sentences", train_sentences),
    ("val_sentences", val_sentences),
    ("test_sentences", test_sentences),
):
    print(f"Length of {split_name}: {len(split_sentences)}")

Length of train_sentences: 139
Length of val_sentences: 40
Length of test_sentences: 20


In [8]:
# Build the vocabulary from the training sentences.
dataset.build_vocab()

word2idx = dataset.word2idx
idx2word = dataset.idx2word

# Append the special tokens at the end of the vocabulary: first the
# unknown-word token, then the padding token.
for special_token in ('<UNK>', '<PAD>'):
    word2idx[special_token] = len(word2idx)
    idx2word[len(idx2word)] = special_token
    dataset.vocab.append(special_token)

vocab_size = len(word2idx)

print(f"Number of training sentences: {len(train_sentences)}")

Number of training sentences: 139


In [15]:
from tqdm import tqdm

def extract_data(sentences, word2idx, embedding_gen, n_gram=5):
    """Turn sentences into (flattened n-gram context embeddings, next-word index) pairs.

    For every position i with a following word, the embeddings of words
    i..i+n_gram-1 are concatenated into one input vector and the index of
    word i+n_gram is the target. Words missing from `word2idx` are embedded
    as '<UNK>' and targeted as `word2idx['<UNK>']`.

    Args:
        sentences: iterable of whitespace-tokenized sentence strings.
        word2idx: mapping word -> index; must contain '<UNK>'.
        embedding_gen: object with `get_embedding(word)` returning a vector.
        n_gram: number of context words per example.

    Returns:
        (inputs, targets): `inputs` is a float32 tensor of shape
        (num_examples, n_gram * embedding_dim) on the notebook-level `device`;
        `targets` is an int64 tensor of shape (num_examples,). Both are empty
        (0 rows) when no sentence is long enough, instead of failing in
        `torch.stack`.

    Raises:
        KeyError: if '<UNK>' is not in `word2idx`.
    """
    inputs, targets = [], []

    unk_idx = word2idx['<UNK>']
    # Every out-of-vocabulary word shares this vector, so compute it once.
    unk_embedding = np.asarray(embedding_gen.get_embedding('<UNK>'), dtype=np.float32)

    for sentence in tqdm(sentences, desc="Processing Sentences"):
        try:
            words = sentence.split()

            # Precompute embeddings for all words in the sentence
            embeddings = np.array(
                [embedding_gen.get_embedding(word) if word in word2idx else unk_embedding for word in words],
                dtype=np.float32,
            )
            embeddings = torch.from_numpy(embeddings).to(device)

            # Slide an n_gram-wide window; the word right after it is the target.
            for i in range(len(words) - n_gram):
                inputs.append(embeddings[i:i+n_gram].reshape(-1))
                targets.append(word2idx.get(words[i + n_gram], unk_idx))

        except Exception as e:
            # Best effort: report the offending sentence and keep going.
            print(f"Error processing sentence: {sentence}")
            print(e)
            continue

    print(f"Inputs shape:{len(inputs)}")
    print(f"Targets shape:{len(targets)}")
    if inputs:
        inputs = torch.stack(inputs)
    else:
        # torch.stack rejects an empty list; return a correctly shaped empty tensor.
        inputs = torch.empty((0, n_gram * unk_embedding.size), dtype=torch.float32, device=device)
    # Explicit dtype: an empty list would otherwise become a float tensor.
    targets = torch.tensor(targets, dtype=torch.long)
    print(f"Inputs shape:{inputs.shape}")
    print(f"Targets shape:{targets.shape}")
    return inputs, targets

In [16]:
# Keep only sentences with more than 5 words, i.e. a full 5-word context plus
# at least one target word.
train_sentences = [
    candidate for candidate in train_sentences
    if len(candidate.split()) > 5
]

train_inputs, train_targets = extract_data(train_sentences, word2idx, embedding_gen)

Processing Sentences: 100%|██████████| 33/33 [00:00<00:00, 56.90it/s]

Inputs shape:245
Targets shape:245
Inputs shape:torch.Size([245, 1500])
Targets shape:torch.Size([245])





In [17]:
class NeuralLanguageModel(nn.Module):
    """Feed-forward n-gram language model over concatenated word embeddings.

    Input is a batch of flattened contexts of size `n_gram * embedding_dim`.
    Output is a batch of unnormalized scores (logits) of size `vocab_size`.
    """

    def __init__(self, n_gram, vocab_size, embedding_dim=300, hidden_dim=300, dropout=0.2):
        super().__init__()
        context_dim = n_gram * embedding_dim
        self.fc1 = nn.Linear(context_dim, hidden_dim)
        self.dropout = nn.Dropout(p=dropout)
        # Projects the hidden representation onto the vocabulary.
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        # Kept as an attribute; forward() returns raw logits and does not apply it.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for flattened contexts `x`."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)

# 5-word contexts of 300-dimensional embeddings, 300 hidden units.
model = NeuralLanguageModel(
    n_gram=5,
    vocab_size=vocab_size,
    embedding_dim=300,
    hidden_dim=300,
)

In [18]:
# Keep only validation sentences with more than 5 words, then build the
# (context, target) tensors.
val_sentences = [
    candidate for candidate in val_sentences
    if len(candidate.split()) > 5
]
val_inputs, val_targets = extract_data(val_sentences, word2idx, embedding_gen)

Processing Sentences:   0%|          | 0/4 [00:00<?, ?it/s]

Processing Sentences: 100%|██████████| 4/4 [00:00<00:00, 10.82it/s]

Inputs shape:7
Targets shape:7
Inputs shape:torch.Size([7, 1500])
Targets shape:torch.Size([7])





In [19]:
# Training hyperparameters.
batch_size, epochs, learning_rate = 64, 10, 0.001

model.to(device)
criterion = nn.CrossEntropyLoss()
# weight_decay adds L2 regularization.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=1e-5,
)

In [None]:
from tqdm import tqdm

def calculate_val_loss(model, val_inputs, val_targets, criterion):
    """Return the mean per-batch loss of `model` on the validation tensors.

    Batches are sliced using the notebook-level `batch_size` and moved to the
    notebook-level `device`. The summed loss is divided by the number of
    batches actually processed. The previous divisor,
    `len(val_inputs) // batch_size`, ignored the partial last batch and so
    over-reported the loss whenever the data size was not a multiple of
    `batch_size`.

    Returns:
        The average batch loss as a float, or 0.0 when `val_inputs` is empty.
    """
    val_loss = 0.0
    num_batches = 0
    model.eval()
    with torch.no_grad():
        pbar = tqdm(range(0, len(val_inputs), batch_size), desc="Validation Batches")
        for i in pbar:
            inputs = val_inputs[i:i+batch_size].to(device)
            targets = val_targets[i:i+batch_size].to(device)
            loss = criterion(model(inputs), targets)
            val_loss += loss.item()
            num_batches += 1
            pbar.set_postfix({"Current Validation Loss": loss.item()})
    print(f"Val Loss: {val_loss}")
    print(f"Number of validation samples: {len(val_inputs)}")
    print(f"Batch size: {batch_size}")

    if num_batches == 0:
        return 0.0
    return val_loss / num_batches

# Training loop: one pass over the training tensors per epoch, in fixed order,
# followed by a validation pass.
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    num_batches = 0
    pbar = tqdm(range(0, len(train_inputs), batch_size), desc="Training Batches")
    for i in pbar:
        inputs = train_inputs[i:i+batch_size].to(device)
        targets = train_targets[i:i+batch_size].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
        pbar.set_postfix({"Current Training Loss": loss.item()})

    # Average over the batches actually run. `len(train_inputs) // batch_size`
    # skipped the partial last batch and was zero for fewer than `batch_size`
    # samples (ZeroDivisionError).
    train_loss = running_loss / max(num_batches, 1)
    val_loss = calculate_val_loss(model, val_inputs, val_targets, criterion)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}")

    torch.cuda.empty_cache()
    gc.collect()

In [14]:
# Persist the trained model object to disk.
torch.save(obj=model, f='language_model_q1.pth')

In [None]:
# load model
# The checkpoint holds the whole pickled module (it was written with
# torch.save(model, ...)), so:
#   * weights_only=False is required on PyTorch versions whose default refuses
#     to unpickle arbitrary objects. Only load checkpoints you created yourself.
#   * map_location=device places the weights on the device in use, so a
#     checkpoint saved on a GPU also loads on a CPU-only machine.
model = torch.load('language_model_q1.pth', map_location=device, weights_only=False)

In [19]:
# Ask PyTorch to empty its CUDA cache, then run Python's garbage collector
# before building the test tensors.
torch.cuda.empty_cache()
gc.collect()

0

In [20]:
# Keep only test sentences with more than 5 words, then build the
# (context, target) tensors.
test_sentences = [
    candidate for candidate in test_sentences
    if len(candidate.split()) > 5
]
test_inputs, test_targets = extract_data(test_sentences, word2idx, embedding_gen)

Processing Sentences:   0%|          | 0/3 [00:00<?, ?it/s]

Processing Sentences: 100%|██████████| 3/3 [00:00<00:00,  6.44it/s]


In [22]:
# Evaluate on test set

def calculate_test_loss(model, test_inputs, test_targets, criterion):
    """Return the mean per-batch loss of `model` on the test tensors.

    Batches are sliced using the notebook-level `batch_size` and moved to the
    notebook-level `device`. The summed loss is divided by the number of
    batches actually processed. Dividing by `len(test_inputs) // batch_size`
    dropped the partial last batch from the count and inflated the result.

    Returns:
        The average batch loss as a float, or 0.0 when `test_inputs` is empty.
    """
    test_loss = 0.0
    num_batches = 0
    model.eval()
    with torch.no_grad():
        pbar = tqdm(range(0, len(test_inputs), batch_size), desc="Test Batches")
        for i in pbar:
            inputs = test_inputs[i:i+batch_size].to(device)
            targets = test_targets[i:i+batch_size].to(device)
            loss = criterion(model(inputs), targets)
            test_loss += loss.item()
            num_batches += 1
            pbar.set_postfix({"Current Test Loss": loss.item()})

    if num_batches == 0:
        return 0.0
    return test_loss / num_batches

test_loss = calculate_test_loss(
    model=model,
    test_inputs=test_inputs,
    test_targets=test_targets,
    criterion=criterion,
)
print(f"Test Loss: {test_loss}")

Test Batches: 100%|██████████| 1/1 [00:00<00:00, 59.80it/s, Current Test Loss=6.75]

Test Loss: 6.74643087387085





In [23]:
def calculate_perplexity(model, sentence, word2idx, embedding_gen, criterion, n_gram=5):
    """Return the perplexity of `model` on one whitespace-tokenized sentence.

    Perplexity is exp(mean loss) over every position that has a full
    `n_gram`-word context. Inputs are built the same way as in `extract_data`:
    words missing from `word2idx` are embedded as '<UNK>' and targeted as
    `word2idx['<UNK>']`, so the score matches the features used for training
    and validation.

    Args:
        model: network mapping a (1, n_gram * embedding_dim) tensor to logits.
        sentence: sentence string; tokens are separated by whitespace.
        word2idx: mapping word -> index; must contain '<UNK>'.
        embedding_gen: object with `get_embedding(word)` returning a vector.
        criterion: loss taking (logits, target indices).
        n_gram: number of context words per prediction.

    Returns:
        The perplexity, or NaN when the sentence has no word with a full
        context (`n_gram` words or fewer). Such sentences previously raised
        ZeroDivisionError or produced a meaningless 1.0.
    """
    model.eval()
    words = sentence.split()
    num_targets = len(words) - n_gram
    if num_targets <= 0:
        return float('nan')

    # Put the inputs wherever the model's weights live.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    unk_idx = word2idx['<UNK>']
    # Build one ndarray first; torch.tensor on a list of ndarrays is slow.
    vectors = np.array(
        [embedding_gen.get_embedding(word if word in word2idx else '<UNK>') for word in words],
        dtype=np.float32,
    )
    embeddings = torch.from_numpy(vectors).to(model_device)

    total_loss = 0.0
    with torch.no_grad():
        for i in range(num_targets):
            context = embeddings[i:i+n_gram].reshape(1, -1)
            target_idx = word2idx.get(words[i + n_gram], unk_idx)
            target = torch.tensor([target_idx], device=model_device)
            total_loss += criterion(model(context), target).item()

    return np.exp(total_loss / num_targets)

def compute_perplexity_average(model, sentences, word2idx, embedding_gen, criterion):
    """Return the mean of the per-sentence perplexities over `sentences`."""
    return np.mean([
        calculate_perplexity(model, one_sentence, word2idx, embedding_gen, criterion)
        for one_sentence in sentences
    ])

In [24]:
# One "<sentence>\t<perplexity>" line per training sentence.
with open('temp_perplexity_sentences_training_q1.txt', 'w') as out_file:
    out_file.writelines(
        f"{one_sentence}\t{calculate_perplexity(model, one_sentence, word2idx, embedding_gen, criterion)}\n"
        for one_sentence in tqdm(train_sentences, desc="Processing Sentences")
    )

Processing Sentences: 100%|██████████| 33/33 [00:00<00:00, 56.89it/s]


In [25]:
# One "<sentence>\t<perplexity>" line per test sentence.
with open('perplexity_sentences_test_q1.txt', 'w') as out_file:
    out_file.writelines(
        f"{one_sentence}\t{calculate_perplexity(model, one_sentence, word2idx, embedding_gen, criterion)}\n"
        for one_sentence in tqdm(test_sentences, desc="Processing Sentences")
    )

Processing Sentences: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


In [None]:
def train_with_hyperparameters(dropout_rates, hidden_dims, optimizers, criterion, train_X, train_y, val_X, val_y, epochs=10, batch_size=64):
    """Grid-search dropout, hidden size and optimizer; report train/val perplexity.

    A fresh `NeuralLanguageModel` is trained for every combination. The
    notebook-level `vocab_size`, `learning_rate`, `device`, `train_sentences`,
    `val_sentences`, `word2idx` and `embedding_gen` are read from the enclosing
    scope.

    Args:
        dropout_rates: dropout probabilities to try. Each is now passed to the
            model; it was previously ignored, so every run used the default.
        hidden_dims: hidden-layer sizes to try.
        optimizers: optimizer classes (e.g. `optim.Adam`, `optim.SGD`). Each is
            instantiated as given and labelled by its class name, rather than
            inferred from its position in the list.
        criterion: loss function used for training and perplexity.
        train_X, train_y: training inputs and target indices.
        val_X, val_y: accepted for interface compatibility; validation
            perplexity is computed from `val_sentences`.
        epochs: training epochs per configuration.
        batch_size: mini-batch size.

    Returns:
        List of (dropout, hidden_dim, optimizer_name, train_perplexity,
        val_perplexity) tuples, one per configuration.
    """
    results = []

    for dropout in dropout_rates:
        for hidden_dim in hidden_dims:
            for opt in optimizers:
                model = NeuralLanguageModel(
                    n_gram=5,
                    vocab_size=vocab_size,
                    embedding_dim=300,
                    hidden_dim=hidden_dim,
                    dropout=dropout,
                ).to(device)
                optimizer = opt(model.parameters(), lr=learning_rate, weight_decay=1e-5)

                for epoch in range(epochs):
                    model.train()
                    for i in tqdm(range(0, len(train_X), batch_size), desc=f"Epoch {epoch+1}/{epochs}"):
                        inputs = train_X[i:i+batch_size].to(device)
                        targets = train_y[i:i+batch_size].to(device)

                        optimizer.zero_grad()
                        outputs = model(inputs)
                        loss = criterion(outputs, targets)
                        loss.backward()
                        optimizer.step()

                train_perplexity = compute_perplexity_average(model, train_sentences, word2idx, embedding_gen, criterion)
                val_perplexity = compute_perplexity_average(model, val_sentences, word2idx, embedding_gen, criterion)
                print(f"Train Perplexity: {train_perplexity}, Val Perplexity: {val_perplexity}")
                results.append((dropout, hidden_dim, opt.__name__, train_perplexity, val_perplexity))

    return results

# Hyperparameter grid: 2 dropout rates x 2 hidden sizes x 2 optimizers.
dropout_rates, hidden_dims = [0.2, 0.3], [300, 400]
optimizers = [optim.Adam, optim.SGD]

results = train_with_hyperparameters(
    dropout_rates, hidden_dims, optimizers, criterion,
    train_inputs, train_targets, val_inputs, val_targets,
    epochs=10, batch_size=64,
)

In [None]:
# Tabulate the grid-search results and persist them.
result_columns = ["Dropout", "Hidden Dim", "Optimizer", "Train Perplexity", "Val Perplexity"]
results_df = pd.DataFrame(results, columns=result_columns)
results_df.to_csv('hyperparameter_results.csv', index=False)

# Lowest validation perplexity is best, highest is worst.
val_perplexity_column = results_df['Val Perplexity']
for heading, pick_row in (
    ("Best Hyperparameters", val_perplexity_column.idxmin),
    ("Worst Hyperparameters", val_perplexity_column.idxmax),
):
    print(heading)
    print(results_df.loc[pick_row()])

: 