In [1]:
import torch
import os
import numpy as np
import preprocess as pp
from config import *

In [2]:
# Split sizes, taken from the fixed `value` entries of the sweep configuration.
train_len, validation_len, test_len = (
    config['parameters'][split_key]['value']
    for split_key in ('train_len', 'validation_len', 'test_len')
)

In [3]:
# Read the corpus file and obtain the train / validation / test sentence collections,
# sized by the three lengths read from the config above.
train_sents, validation_sents, test_sents = pp.get_sents(
    'Auguste_Maquet.txt',
    train_len,
    validation_len,
    test_len,
)

5351663
30000
10000
12590
and what will that signify to me?


In [4]:
import os

# Directory that receives the per-epoch checkpoints and the final best_model.pth.
# NOTE(review): `dir` shadows the built-in of the same name; it is kept because the
# training and perplexity functions in this notebook read this global.
dir = 'RES/NNLM/TEST'

# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(dir, exist_ok=True)

In [5]:
from torch.utils.data import Dataset

class SentencesDataset(Dataset):
    """Fixed-window next-token dataset built from a list of sentences.

    Each sentence is converted with ``pp.get_sentence_index_pad`` and then cut
    into (context, target) pairs: ``context_size`` consecutive entries form the
    input and the entry right after them is the target.

    Args:
        sentences: Sentences to convert; each one is handed unchanged to
            ``pp.get_sentence_index_pad``.
        Emb: Vocabulary/embedding object forwarded to
            ``pp.get_sentence_index_pad``.
        context_size: Number of preceding entries used as input for every
            target. Defaults to 5, the window this notebook was written for.
            NOTE(review): the padding added by ``get_sentence_index_pad`` is
            not visible here — confirm it suits any other window size.

    Attributes:
        X: Stacked context windows, one row per sample.
        Y: Stacked targets, aligned with ``X``.

    Raises:
        ValueError: If ``context_size`` is smaller than 1, or if no sentence
            is long enough to yield a single sample.
    """

    def __init__(self, sentences: list, Emb, context_size: int = 5):
        super().__init__()

        if context_size < 1:
            raise ValueError(f'context_size must be >= 1, got {context_size}')

        contexts = []
        targets = []

        for sentence in sentences:
            # presumably a 1-D tensor of vocabulary indices (torch.stack below
            # requires tensors) — TODO confirm against preprocess.py
            s = pp.get_sentence_index_pad(sentence, Emb)

            # Slide the window one position at a time over the sentence.
            for i in range(context_size, len(s)):
                contexts.append(s[i - context_size:i])
                targets.append(s[i])

        if not contexts:
            # torch.stack([]) would fail with an opaque message; be explicit.
            raise ValueError(
                f'no training samples: every sentence has {context_size} or fewer entries'
            )

        self.X = torch.stack(contexts)
        self.Y = torch.stack(targets)

    def __len__(self):
        """Return the number of (context, target) samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.Y[idx]

In [6]:
from torch.utils.data import DataLoader

def get_dataloader(dataset, batch_size, shuffle, num_workers=4):
    """Wrap ``dataset`` in a ``torch.utils.data.DataLoader``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every pass.
        num_workers: Number of worker processes used for loading. Defaults to
            4 (the value previously hard-coded); pass 0 to load in the main
            process, e.g. when debugging or on machines with few cores.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [7]:
def load_data(Emb, batch_size):
    """Build the train, validation and test dataloaders.

    Reads the module-level ``train_sents``, ``validation_sents`` and
    ``test_sents`` and wraps each in a ``SentencesDataset``.

    Args:
        Emb: Vocabulary/embedding object forwarded to ``SentencesDataset``.
        batch_size: Batch size used for all three loaders.

    Returns:
        Tuple ``(train_dataloader, validation_dataloader, test_dataloader)``.
        Only the training loader is shuffled.
    """
    train_dataset = SentencesDataset(train_sents, Emb)
    validation_dataset = SentencesDataset(validation_sents, Emb)
    test_dataset = SentencesDataset(test_sents, Emb)

    train_dataloader = get_dataloader(train_dataset, batch_size, shuffle=True)
    # Evaluation loaders keep the dataset order: shuffling adds nothing to an
    # evaluation pass, changes which samples land in the last (partial) batch
    # from epoch to epoch, and breaks consumers that rebuild sentences from
    # consecutive samples (run_perplexity).
    validation_dataloader = get_dataloader(validation_dataset, batch_size, shuffle=False)
    test_dataloader = get_dataloader(test_dataset, batch_size, shuffle=False)

    return train_dataloader, validation_dataloader, test_dataloader

In [8]:
import tqdm
import numpy as np

def run(model, dataloader, train, es, device, loss_fn, optimizer, epoch):
    """Run one full pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Network called as ``model(X)`` on every batch.
        dataloader: Iterable yielding ``(X, Y)`` batches.
        train: If true, the model is put in training mode and the optimizer
            is stepped on every batch; otherwise the model is put in eval
            mode and no parameters are updated.
        es: Early-stopping tracker; only ``best_loss`` and ``counter`` are
            read, for the progress-bar text.
        device: Device the targets ``Y`` are moved to before the loss.
        loss_fn: Callable ``loss_fn(Y_pred, Y)`` returning a scalar tensor.
        optimizer: Optimizer stepped when ``train`` is true.
        epoch: Epoch number, shown in the progress bar.

    Returns:
        ``np.mean`` of the per-batch losses (unweighted by batch size).
    """
    if train:
        model.train()
    else:
        model.eval()

    epoch_loss = []
    # Running sum so the displayed average does not re-sum the whole list on
    # every batch (which was quadratic in the number of batches).
    running_total = 0.0

    pbar = tqdm.tqdm(dataloader)

    # Track gradients only when training, regardless of the caller's context,
    # so an evaluation pass never builds an autograd graph.
    with torch.set_grad_enabled(bool(train)):
        for X, Y in pbar:
            # X is passed through untouched; only Y is moved to `device` here.
            # NOTE(review): presumably the model moves X itself — confirm in nnlm.py.
            Y_pred = model(X)

            Y = Y.to(device)
            loss = loss_fn(Y_pred, Y)
            batch_loss = loss.item()
            epoch_loss.append(batch_loss)
            running_total += batch_loss

            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            pbar.set_description(f'{epoch} {"T" if train else "V"} Loss: {batch_loss:7.4f}, Avg Loss: {running_total / len(epoch_loss):7.4f}, Best Loss: {es.best_loss:7.4f}, Counter: {es.counter}')

    return np.mean(epoch_loss)

In [9]:
def train_epoch(model, train_dataloader, validation_dataloader, es, device, loss_fn, optimizer, epoch):
    """Run one training pass followed by one validation pass.

    The validation pass is executed with gradient tracking disabled.

    Returns:
        Tuple ``(train_loss, validation_loss)`` as returned by ``run``.
    """
    # Arguments common to both passes, in the positional order `run` expects.
    shared_args = (es, device, loss_fn, optimizer, epoch)

    train_loss = run(model, train_dataloader, True, *shared_args)

    with torch.no_grad():
        validation_loss = run(model, validation_dataloader, False, *shared_args)

    return train_loss, validation_loss

In [10]:
from nnlm import NNLM
from EarlyStopping import EarlyStopping
import torch.nn as nn

def train(train_dataloader, validation_dataloader, cfg, Emb):
    """Train an NNLM with early stopping and keep the best checkpoint.

    A checkpoint ``nnlm_<epoch>.pth`` is written to the module-level ``dir``
    after every epoch. When training ends, the checkpoint named by
    ``es.best_model_pth`` is moved to ``best_model.pth`` in the same directory.

    Args:
        train_dataloader: Loader used for the training pass of each epoch.
        validation_dataloader: Loader used for the validation pass.
        cfg: Run configuration; reads ``hidden_dim``, ``dropout``,
            ``optimizer`` (name of a class in ``torch.optim``),
            ``learning_rate`` and ``epochs``.
        Emb: Vocabulary/embedding object passed to ``NNLM``.

    Returns:
        ``es.best_loss`` as tracked by the early-stopping helper.
    """
    model = NNLM(Emb, cfg.hidden_dim, cfg.dropout, pp.device).to(pp.device)
    print(model)

    loss_fn = nn.CrossEntropyLoss()
    # The optimizer class is looked up by name (e.g. 'SGD') so a sweep can vary it.
    optimizer = getattr(torch.optim, cfg.optimizer)(model.parameters(), lr=cfg.learning_rate)

    es = EarlyStopping(patience=3)

    for epoch in range(cfg.epochs):
        _, validation_loss = train_epoch(model, train_dataloader, validation_dataloader, es, pp.device, loss_fn, optimizer, epoch)

        # Checkpoint every epoch; the best one is selected after the loop.
        torch.save(model.state_dict(), os.path.join(dir, f'nnlm_{epoch}.pth'))

        # The early-stopping helper returns a truthy value when training should stop.
        if es(validation_loss, epoch):
            break

    # os.replace overwrites an existing best_model.pth on every platform;
    # os.rename raises on Windows when the destination already exists, which
    # happens from the second sweep run onwards.
    # NOTE(review): es.best_model_pth is presumably the epoch index of the best
    # validation loss — confirm in EarlyStopping.py.
    os.replace(
        os.path.join(dir, f'nnlm_{es.best_model_pth}.pth'),
        os.path.join(dir, 'best_model.pth'),
    )

    return es.best_loss

In [11]:
from nnlm import NNLM
import tqdm

def run_perplexity(dataloader, best_model, best_pth, Emb, max_perplexity=10000):
    """Compute the mean per-sentence perplexity of a saved model.

    Sentences are rebuilt from consecutive targets: predictions are collected
    until an ``'eos'`` target is seen, then the cross-entropy over the
    collected tokens is exponentiated to give that sentence's perplexity.
    ``'pad'`` and ``'sos'`` targets are skipped, and the ``'eos'`` prediction
    itself is not part of the sentence loss.

    Because sentences are rebuilt from sample order, ``dataloader`` must
    iterate the dataset unshuffled.

    Args:
        dataloader: Iterable of ``(X, Y)`` batches in dataset order.
        best_model: Model instance that receives the loaded weights.
        best_pth: Path of the state-dict checkpoint to load.
        Emb: Vocabulary object; ``key_to_index`` must contain ``'eos'``,
            ``'pad'`` and ``'sos'``.
        max_perplexity: Sentences whose perplexity is not below this value
            are left out of the average. Defaults to 10000, the cutoff that
            was previously hard-coded.

    Returns:
        Mean of the retained sentence perplexities, or NaN if none was kept.
    """
    # map_location lets a checkpoint saved on one device load onto pp.device.
    best_model.load_state_dict(torch.load(best_pth, map_location=pp.device))
    best_model.eval()

    loss_fn = nn.CrossEntropyLoss()

    # Loop-invariant vocabulary lookups.
    eos_index = Emb.key_to_index['eos']
    skipped_indices = {Emb.key_to_index['pad'], Emb.key_to_index['sos']}

    perplexity = []
    # State of the sentence being assembled; it carries over batch boundaries.
    sentence_logits = []
    sentence_targets = []

    with torch.no_grad():
        for X, Y in tqdm.tqdm(dataloader):
            Y_pred = best_model(X)

            # One host transfer per batch instead of several .item() calls per sample.
            for i, token in enumerate(Y.reshape(-1).tolist()):
                if token == eos_index:
                    if not sentence_logits:
                        continue

                    logits = torch.stack(sentence_logits).to(pp.device)
                    targets = torch.tensor(sentence_targets, device=pp.device)
                    sentence_perplexity = torch.exp(loss_fn(logits, targets)).item()

                    if sentence_perplexity < max_perplexity:
                        perplexity.append(sentence_perplexity)

                    sentence_logits = []
                    sentence_targets = []

                elif token in skipped_indices:
                    continue
                else:
                    sentence_logits.append(Y_pred[i])
                    sentence_targets.append(token)

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_perplexity = np.mean(perplexity) if perplexity else float('nan')
    print(f'Perplexity: {mean_perplexity}')
    return mean_perplexity

def get_all_perplexity_vals(test_dataloader, cfg, Emb):
    """Score ``test_dataloader`` with the saved best checkpoint.

    Builds a fresh ``NNLM`` from ``cfg`` and hands it, together with the path
    of ``best_model.pth`` inside the module-level ``dir``, to
    ``run_perplexity``.

    Returns:
        Whatever ``run_perplexity`` returns for the given loader.
    """
    checkpoint_path = os.path.join(dir, 'best_model.pth')

    network = NNLM(Emb, cfg.hidden_dim, cfg.dropout, pp.device)
    network = network.to(pp.device)

    result = run_perplexity(test_dataloader, network, checkpoint_path, Emb)
    return result

In [12]:
# Weights & Biases sweep: per-run entry point and agent launch
import wandb

def run_everything(config=None):
    """Execute one sweep run: build the vocabulary, train, evaluate and log.

    Args:
        config: Run configuration forwarded to ``wandb.init``; the effective
            hyper-parameters are then read back from ``wandb.config``.

    Returns:
        Tuple ``(val_loss, test_perplexity)``.
    """
    with wandb.init(config=config):
        cfg = wandb.config
        Emb = pp.create_vocab(train_sents, cfg.embedding_dim)
        print(len(Emb.key_to_index))

        train_dataloader, validation_dataloader, test_dataloader = load_data(Emb, cfg.batch_size)

        val_loss = train(train_dataloader, validation_dataloader, cfg, Emb)
        wandb.log({'val_loss': val_loss})

        # run_perplexity rebuilds sentences from consecutive samples, so it
        # needs the training data in dataset order. The training loader is
        # shuffled, hence a second, unshuffled loader over the same dataset.
        ordered_train_dataloader = get_dataloader(train_dataloader.dataset, cfg.batch_size, shuffle=False)

        train_perplexity = get_all_perplexity_vals(ordered_train_dataloader, cfg, Emb)
        test_perplexity = get_all_perplexity_vals(test_dataloader, cfg, Emb)

        wandb.log({'train_perplexity': train_perplexity})
        wandb.log({'test_perplexity': test_perplexity})

        return val_loss, test_perplexity
    
# Register the sweep described by `config` under the "Assignment_1" project, then
# start an agent in this process that calls run_everything once per sweep run
# (count=20 caps the number of runs this agent executes).
sweep_id = wandb.sweep(config, project="Assignment_1")
wandb.agent(sweep_id, run_everything, count=20)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: 2165b26r
Sweep URL: https://wandb.ai/shu7bh/nnlm/sweeps/2165b26r


[34m[1mwandb[0m: Agent Starting Run: tp7k1y9s with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 100
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	hidden_dim: 500
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: SGD
[34m[1mwandb[0m: 	test_len: 14000
[34m[1mwandb[0m: 	train_len: 30000
[34m[1mwandb[0m: 	validation_len: 10000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshu7bh[0m. Use [1m`wandb login --relogin`[0m to force relogin


16204
NNLM(
  (model): Sequential(
    (0): Linear(in_features=500, out_features=500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=500, out_features=16204, bias=True)
  )
  (dropout): Dropout(p=0, inplace=False)
)


T Loss:  7.2156, Avg Loss:  7.9760, Best Loss:     inf, Counter: 0: 100%|██████████| 5049/5049 [00:28<00:00, 178.73it/s]
V Loss:  6.9858, Avg Loss:  6.8834, Best Loss:     inf, Counter: 0: 100%|██████████| 1699/1699 [00:08<00:00, 210.55it/s]
T Loss:  6.3389, Avg Loss:  6.6452, Best Loss:  6.8834, Counter: 0: 100%|██████████| 5049/5049 [00:27<00:00, 182.80it/s]
V Loss:  5.4691, Avg Loss:  6.4146, Best Loss:  6.8834, Counter: 0: 100%|██████████| 1699/1699 [00:08<00:00, 212.00it/s]
T Loss:  6.5523, Avg Loss:  6.3263, Best Loss:  6.4146, Counter: 0: 100%|██████████| 5049/5049 [00:27<00:00, 184.05it/s]
V Loss:  6.0220, Avg Loss:  6.1911, Best Loss:  6.4146, Counter: 0: 100%|██████████| 1699/1699 [00:07<00:00, 213.63it/s]
T Loss:  5.9810, Avg Loss:  6.1500, Best Loss:  6.1911, Counter: 0: 100%|██████████| 5049/5049 [00:27<00:00, 183.29it/s]
V Loss:  6.2944, Avg Loss:  6.0516, Best Loss:  6.1911, Counter: 0: 100%|██████████| 1699/1699 [00:08<00:00, 210.55it/s]
T Loss:  5.9182, Avg Loss:  6.03

T Loss:  4.9016, Avg Loss:  5.1228, Best Loss:  5.1196, Counter: 0:  93%|█████████▎| 4674/5049 [00:25<00:02, 182.73it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x7f42e1d1ab00>
Traceback (most recent call last):
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/tqdm/std.py", line 1193, in __iter__
    self.close()
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/tqdm/std.py", line 1287, in close
zmq.error.ZMQError: Socket operation on non-socket
Exception ignored in sys.unraisablehook: <built-in function unraisablehook>
Traceback (most recent call last):
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/ipykernel/iostream.py", line 559, in flush
    self.pub_thread.schedule(self._flush)
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/ipykernel/iostream.py", line 251, in schedule
    self._event_pipe.send(b"")
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/

In [None]:
# from nnlm import NNLM

# best_model = NNLM(Emb, cfg.hidden_dim, cfg.dropout, pp.device).to(pp.device)
# best_pth = os.path.join(dir, 'best_model.pth')


In [None]:
# import tqdm
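# # Disabled: earlier variant of run_perplexity that also writes each sentence and its perplexity to the open file `f`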
# def run_perplexity(dataloader, f):
#     best_model.load_state_dict(torch.load(best_pth))
#     best_model.eval()

#     loss_fn = nn.CrossEntropyLoss()

#     with torch.no_grad():
#         # epoch_loss = []
#         perplexity = []

#         current_sentence = ''
#         current_pred = []
#         current_truth = []

#         for X, Y in tqdm.tqdm(dataloader):
#             Y_pred = best_model(X)

#             for i in range(Y.shape[0]):
#                 if Y[i].item() == Emb.key_to_index['eos']:
#                     if len(current_pred) == 0:
#                         continue

#                     current_pred = torch.stack(current_pred).to(pp.device)
#                     current_truth = torch.tensor(current_truth).to(pp.device)
#                     loss = loss_fn(current_pred, current_truth)

#                     if torch.exp(loss).item() < 10000:
#                         perplexity.append(torch.exp(loss).item())
#                         print(f'{current_sentence.strip()}: {perplexity[-1]}', file=f)

#                     current_sentence = ''
#                     current_pred = []
#                     current_truth = []

#                 elif Y[i].item() == Emb.key_to_index['pad'] or Y[i].item() == Emb.key_to_index['sos']:
#                     continue
#                 else:
#                     current_sentence += Emb.index_to_key[Y[i].item()] + ' '
#                     current_pred.append(Y_pred[i])
#                     current_truth.append(Y[i])

#         print(f'Average Perplexity: {np.mean(perplexity)}', file=f)
#         print(f'Average Perplexity: {np.mean(perplexity)}', file=f)

In [None]:
# with open(os.path.join(dir, 'train.txt'), 'w') as f:
#     run_perplexity(train_dataloader, f)

# with open(os.path.join(dir, 'val.txt'), 'w') as f:
#     run_perplexity(validation_dataloader, f)

# with open(os.path.join(dir, 'test.txt'), 'w') as f:
#     run_perplexity(test_dataloader, f)

In [None]:
# best_model.load_state_dict(torch.load(best_pth))
# with torch.no_grad():
#     best_model.eval()
#     query = ['money', 'is', 'the', 'root', 'of']
#     print(*query, sep=' ', end=' ')

#     X = []
#     for word in query:
#         X.append(get_vocab_index(word))

#     while query[-1] != 'eos':
#         Y_pred = best_model(X)

#         # multinomial sampling
#         Y_pred = torch.multinomial(torch.softmax(Y_pred, dim=1), num_samples=1)

#         # Y_pred = torch.argmax(Y_pred, dim=1)
#         query = query[1:] + [Emb.index_to_key[Y_pred[-1].item()]]
#         X = X[1:] + [Y_pred[-1].item()]
#         print(Emb.index_to_key[Y_pred[-1].item()], end=' ')