In [1]:
from part_3_config import config as cfg
import preprocess as pp
import numpy as np
import wandb
import torch
import os

In [2]:
# Every size below is read from the 'parameters' section of the sweep config.
_params = cfg['parameters']

TRAIN_LEN = _params['train_len']['value']
VALIDATION_LEN = _params['validation_len']['value']
TEST_LEN = _params['test_len']['value']
MAX_LEN = _params['max_len']['value']

# Smaller splits for quick experiments (uncomment to override the config):
# TRAIN_LEN = 1000
# VALIDATION_LEN = 100
# TEST_LEN = 100

# Directory that receives the per-epoch checkpoints and the best model file.
dir = 'RES/TF/TEST'

In [3]:
# pp.get_sents returns the three sentence collections used below as the
# train / validation / test data, sized by the constants from the config cell.
CORPUS_PATH = 'Auguste_Maquet.txt'
train_sents, validation_sents, test_sents = pp.get_sents(CORPUS_PATH, TRAIN_LEN, VALIDATION_LEN, TEST_LEN)

5351663
30000
10000
12590
oh, i don t say that your excellency will do as you please i should be sorry to advise you in the matter.


In [4]:
import os

# exist_ok=True makes this a no-op when the directory is already present and
# removes the race between checking for the directory and creating it.
os.makedirs(dir, exist_ok=True)

In [5]:
from torch.utils.data import Dataset
from gensim.models import KeyedVectors
import torch

class SentencesDataset(Dataset):
    """Fixed-width rows of token indices, one row per sentence.

    Each sentence is converted with ``pp.get_sentence_index`` and then truncated
    or right-padded with the index of the ``'pad'`` key so that every row has
    the same width.

    The row width is stored on the class, not the instance: the first dataset
    built with ``max_len`` sets it, and datasets built later without ``max_len``
    reuse that width.

    Args:
        sentences: Sentences to encode.
        Emb: Embedding object exposing ``key_to_index`` (must contain ``'pad'``).
        max_len: Optional sequence length. Rows are ``max_len + 1`` wide.

    Raises:
        ValueError: If no ``max_len`` is given and none was set by an earlier
            instance.
    """

    # Shared row width; stays None until a dataset is created with max_len.
    max_len = None

    def __init__(self, sentences: list, Emb: 'KeyedVectors', max_len: int = None):
        super().__init__()

        if max_len is not None:
            # One extra column: the training loop splits each row into
            # row[:-1] (input) and row[1:] (target), each max_len wide.
            SentencesDataset.max_len = max_len + 1

        if SentencesDataset.max_len is None:
            raise ValueError(
                'max_len must be supplied the first time a SentencesDataset is created'
            )

        width = SentencesDataset.max_len
        pad_index = Emb.key_to_index['pad']

        rows = []
        for sentence in sentences:
            s = pp.get_sentence_index(sentence, Emb)
            kept = min(width, len(s))
            padding = torch.full((width - kept,), pad_index, dtype=torch.long)
            rows.append(torch.cat((s[:kept], padding)))

        # torch.stack rejects an empty list, so build the empty case explicitly.
        if rows:
            self.X = torch.stack(rows)
        else:
            self.X = torch.empty((0, width), dtype=torch.long)

        # Kept only for compatibility: targets are derived from X by the caller.
        self.Y = []

    def __len__(self):
        """Return the number of encoded sentences."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the padded index row for sentence ``idx``."""
        return self.X[idx]

In [6]:
from torch.utils.data import DataLoader

def get_dataloader(dataset, batch_size, shuffle, num_workers=4):
    """Wrap ``dataset`` in a ``DataLoader``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        num_workers: Worker processes used for loading. Defaults to 4, the
            value this helper always used; pass 0 to load in the main process
            (useful when worker processes cannot import notebook-defined
            classes).

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [7]:
def load_data(Emb, batch_size, device, max_len):
    """Build the train, validation and test dataloaders.

    Args:
        Emb: Embedding object passed through to ``SentencesDataset``.
        batch_size: Batch size used for all three loaders.
        device: Unused here; kept so existing callers keep working.
        max_len: Sequence length passed to every dataset.

    Returns:
        Tuple ``(train_dataloader, validation_dataloader, test_dataloader)``.
    """
    # max_len is passed to every split explicitly so that the validation and
    # test datasets do not depend on the train dataset having been built first.
    train_dataset = SentencesDataset(train_sents, Emb, max_len)
    validation_dataset = SentencesDataset(validation_sents, Emb, max_len)
    test_dataset = SentencesDataset(test_sents, Emb, max_len)

    # Only the training data is shuffled. Evaluation loaders keep a fixed order
    # so the reported validation loss is reproducible from epoch to epoch.
    train_dataloader = get_dataloader(train_dataset, batch_size, True)
    validation_dataloader = get_dataloader(validation_dataset, batch_size, False)
    test_dataloader = get_dataloader(test_dataset, batch_size, False)

    return train_dataloader, validation_dataloader, test_dataloader

In [8]:
import tqdm
import numpy as np

def run(model, dataloader, train, es, device, loss_fn, optimizer, epoch):
    """Run one pass over ``dataloader`` and return the mean batch loss.

    Each batch is a 2-D tensor of token indices. The model input is every
    column but the last and the target is every column but the first, i.e. the
    model is asked to predict the next token at each position.

    Args:
        model: Module called as ``model(inputs)``; its output is flattened to
            ``(-1, last_dim)`` before the loss.
        dataloader: Iterable yielding 2-D index tensors.
        train: If true, put the model in train mode and step the optimizer;
            otherwise put it in eval mode and only measure the loss.
        es: Object exposing ``best_loss`` and ``counter`` (progress bar only).
        device: Device the targets are moved to.
        loss_fn: Callable ``loss_fn(predictions, targets)``.
        optimizer: Optimizer stepped when ``train`` is true.
        epoch: Epoch number shown in the progress bar.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    if train:
        model.train()
    else:
        model.eval()

    phase = 'T' if train else 'V'
    epoch_loss = []
    running_total = 0.0

    pbar = tqdm.tqdm(dataloader)

    for batch in pbar:
        # The inputs are handed to the model as-is; presumably the model moves
        # them to its own device -- confirm against the Decoder implementation.
        inputs = batch[:, :-1]
        targets = batch[:, 1:].to(device)

        predictions = model(inputs)

        # reshape instead of view: the column slice above is not contiguous,
        # and when no device copy happens (CPU) view(-1) raises on it.
        predictions = predictions.reshape(-1, predictions.shape[-1])
        targets = targets.reshape(-1)

        loss = loss_fn(predictions, targets)
        loss_value = loss.item()
        epoch_loss.append(loss_value)
        running_total += loss_value

        if train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        pbar.set_description(f'{epoch} {phase} Loss: {loss_value:7.4f}, Avg Loss: {running_total / len(epoch_loss):7.4f}, Best Loss: {es.best_loss:7.4f}, Counter: {es.counter}')

    return np.mean(epoch_loss)

In [9]:
import wandb

def train_epoch(model, train_dataloader, validation_dataloader, es, device, loss_fn, optimizer, epoch):
    """Run one training pass followed by one validation pass.

    Both mean losses are logged to wandb (as 'train_loss' and
    'validation_loss', in separate log calls) and printed as a one-line summary.

    Returns:
        Tuple ``(train_loss, validation_loss)``.
    """
    # Everything after the train/eval flag is identical for both passes.
    shared_args = (es, device, loss_fn, optimizer, epoch)

    train_loss = run(model, train_dataloader, True, *shared_args)
    wandb.log({'train_loss': train_loss})

    # The validation pass only measures the loss, so gradients are disabled.
    with torch.no_grad():
        validation_loss = run(model, validation_dataloader, False, *shared_args)
    wandb.log({'validation_loss': validation_loss})

    print(f'Epoch {epoch} Train Loss: {train_loss:7.4f}, Validation Loss: {validation_loss:7.4f}')
    return train_loss, validation_loss

In [10]:
from transformer import Decoder
from EarlyStopping import EarlyStopping
import torch.nn as nn

def train(train_dataloader, validation_dataloader, cfg, Emb):
    """Train a ``Decoder`` with early stopping and keep the best checkpoint.

    A checkpoint is written to ``dir`` after every epoch as ``nnlm_<epoch>.pth``.
    When training ends, the checkpoint named by ``es.best_model_pth`` is moved
    to ``best_model.pth`` in the same directory.

    Args:
        train_dataloader: Loader used for the training pass of each epoch.
        validation_dataloader: Loader used for the validation pass.
        cfg: Config object with attributes ``nhead``, ``dim_feedforward``,
            ``num_layers``, ``dropout``, ``max_len``, ``epochs``,
            ``learning_rate`` and ``optimizer``.
        Emb: Embedding object handed to the ``Decoder`` constructor.

    Returns:
        ``es.best_loss`` as tracked by the early-stopping helper.
    """
    nhead = cfg.nhead
    dim_feedforward = cfg.dim_feedforward
    num_layers = cfg.num_layers
    dropout = cfg.dropout
    max_len = cfg.max_len
    epochs = cfg.epochs
    learning_rate = cfg.learning_rate
    optimizer_name = cfg.optimizer

    model = Decoder(Emb, nhead, dim_feedforward, num_layers, dropout, max_len, pp.device).to(pp.device)

    # NOTE(review): the loss is averaged over every position, including the
    # 'pad' positions the dataset appends; consider ignore_index for padding.
    loss_fn = nn.CrossEntropyLoss()

    # cfg.optimizer is the name of a class in torch.optim (e.g. 'Adam').
    optimizer_cls = getattr(torch.optim, optimizer_name)
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate)

    es = EarlyStopping(patience=3)

    for epoch in range(epochs):
        _, validation_loss = train_epoch(model, train_dataloader, validation_dataloader, es, pp.device, loss_fn, optimizer, epoch)

        # One checkpoint per epoch so the best one can be picked afterwards.
        torch.save(model.state_dict(), os.path.join(dir, f'nnlm_{epoch}.pth'))

        # The early-stopping helper returns a truthy value when training should stop.
        if es(validation_loss, epoch):
            break

    # os.replace overwrites an existing best_model.pth on every platform;
    # os.rename fails on Windows when the destination already exists, which
    # happens from the second sweep run onwards.
    os.replace(
        os.path.join(dir, f'nnlm_{es.best_model_pth}.pth'),
        os.path.join(dir, 'best_model.pth'),
    )

    return es.best_loss

In [11]:
from nnlm import NNLM
import tqdm

def run_perplexity(dataloader, best_model, best_pth, Emb):
    """Load a checkpoint and return the mean per-sentence perplexity.

    For each row of each batch, the cross-entropy is computed over the
    positions up to and including the one whose target is the first ``'eos'``
    token found after column 0 (or over the whole row when no ``'eos'`` is
    found). The sentence perplexity is ``exp`` of that loss.

    Args:
        dataloader: Iterable yielding 2-D tensors of token indices.
        best_model: Module that receives the checkpoint weights; it is called
            as ``best_model(inputs)``.
        best_pth: Path of the state-dict file to load.
        Emb: Embedding object whose ``key_to_index`` contains ``'eos'``.

    Returns:
        ``np.mean`` of the per-sentence perplexities.
    """
    # map_location lets a checkpoint saved on one device be evaluated on another.
    best_model.load_state_dict(torch.load(best_pth, map_location=pp.device))
    best_model.eval()

    loss_fn = nn.CrossEntropyLoss()
    eos_index = Emb.key_to_index['eos']
    perplexity = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader):
            targets = batch[:, 1:].to(pp.device)
            inputs = batch[:, :-1]

            predictions = best_model(inputs)

            for i in range(predictions.shape[0]):
                # Position of the first 'eos' in the input row, ignoring column 0.
                # targets[k - 1] == inputs[k], so an 'eos' at input column k means
                # the first k target positions end with that 'eos'.
                eos_hits = (inputs[i, 1:] == eos_index).nonzero()
                if eos_hits.numel():
                    # At least one position is always scored: a row whose first
                    # token after the start is 'eos' used to yield an empty slice
                    # and a NaN loss, which turned the overall mean into NaN.
                    n_scored = eos_hits[0].item() + 1
                else:
                    n_scored = inputs.shape[1]

                sentence_loss = loss_fn(predictions[i][:n_scored], targets[i][:n_scored]).item()
                perplexity.append(np.exp(sentence_loss))

    return np.mean(perplexity)

def get_all_perplexity_vals(test_dataloader, cfg, Emb):
    """Evaluate the saved best model on ``test_dataloader``.

    A fresh ``Decoder`` is built from the hyperparameters in ``cfg`` and handed
    to ``run_perplexity`` together with the path of ``best_model.pth`` inside
    ``dir``.

    Returns:
        Whatever ``run_perplexity`` returns for the given loader.
    """
    checkpoint_path = os.path.join(dir, 'best_model.pth')

    evaluation_model = Decoder(
        Emb,
        cfg.nhead,
        cfg.dim_feedforward,
        cfg.num_layers,
        cfg.dropout,
        cfg.max_len,
        pp.device,
    ).to(pp.device)

    return run_perplexity(test_dataloader, evaluation_model, checkpoint_path, Emb)

In [12]:
# WANDB init
import wandb

def run_everything(cfg=None):
    """Run one complete experiment inside a wandb run.

    Builds the vocabulary and dataloaders, trains the model, then logs the best
    loss returned by ``train`` and the perplexity on the train and test loaders.

    Args:
        cfg: Configuration forwarded to ``wandb.init``; inside the run it is
            replaced by ``wandb.config``.
    """
    with wandb.init(config=cfg):
        # From here on the hyperparameters are read from the active wandb run.
        cfg = wandb.config

        embedding_dim, batch_size, max_len = cfg.embedding_dim, cfg.batch_size, cfg.max_len

        Emb = pp.create_vocab(train_sents, embedding_dim)
        print(len(Emb.key_to_index))

        loaders = load_data(Emb, batch_size, pp.device, max_len)
        train_dataloader, validation_dataloader, test_dataloader = loaders

        best_validation_loss = train(train_dataloader, validation_dataloader, cfg, Emb)
        wandb.log({'best_loss': best_validation_loss})

        # Both perplexities are computed first and then logged one call each.
        scores = {
            'train_perplexity': get_all_perplexity_vals(train_dataloader, cfg, Emb),
            'test_perplexity': get_all_perplexity_vals(test_dataloader, cfg, Emb),
        }
        for metric_name, metric_value in scores.items():
            wandb.log({metric_name: metric_value})

# Register the sweep described by the module-level cfg, then let an agent
# execute run_everything for a fixed number of runs.
SWEEP_PROJECT = "Transformer"
SWEEP_RUN_COUNT = 50

sweep_id = wandb.sweep(cfg, project=SWEEP_PROJECT)
wandb.agent(sweep_id, run_everything, count=SWEEP_RUN_COUNT)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: 6t5kvtay
Sweep URL: https://wandb.ai/shu7bh/Transformer/sweeps/6t5kvtay


[34m[1mwandb[0m: Agent Starting Run: fd41icek with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dim_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 200
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: CrossEntropyLoss
[34m[1mwandb[0m: 	max_len: 50
[34m[1mwandb[0m: 	nhead: 4
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	test_len: 14000
[34m[1mwandb[0m: 	train_len: 30000
[34m[1mwandb[0m: 	validation_len: 10000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshu7bh[0m. Use [1m`wandb login --relogin`[0m to force relogin


16244
Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=512, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_features=512, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0, inplace=False)
        (dropout2): Dropout(p

0 T Loss:  1.8468, Avg Loss:  2.1582, Best Loss:     inf, Counter: 0: 100%|██████████| 938/938 [00:20<00:00, 45.84it/s]
  return torch._native_multi_head_attention(
0 V Loss:  2.4069, Avg Loss:  1.9215, Best Loss:     inf, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 86.45it/s] 


Epoch 0 Train Loss:  2.1582, Validation Loss:  1.9215


1 T Loss:  1.6935, Avg Loss:  1.8503, Best Loss:  1.9215, Counter: 0: 100%|██████████| 938/938 [00:19<00:00, 48.82it/s]
1 V Loss:  1.6556, Avg Loss:  1.8337, Best Loss:  1.9215, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 91.11it/s] 


Epoch 1 Train Loss:  1.8503, Validation Loss:  1.8337


2 T Loss:  1.9591, Avg Loss:  1.7365, Best Loss:  1.8337, Counter: 0: 100%|██████████| 938/938 [00:18<00:00, 49.47it/s]
2 V Loss:  1.4286, Avg Loss:  1.7962, Best Loss:  1.8337, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 87.02it/s] 


Epoch 2 Train Loss:  1.7365, Validation Loss:  1.7962


3 T Loss:  1.4524, Avg Loss:  1.6504, Best Loss:  1.7962, Counter: 0: 100%|██████████| 938/938 [00:19<00:00, 48.01it/s]
3 V Loss:  2.1360, Avg Loss:  1.7968, Best Loss:  1.7962, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 86.06it/s] 


Epoch 3 Train Loss:  1.6504, Validation Loss:  1.7968


4 T Loss:  1.1378, Avg Loss:  1.5763, Best Loss:  1.7962, Counter: 1: 100%|██████████| 938/938 [00:18<00:00, 50.05it/s]
4 V Loss:  1.4021, Avg Loss:  1.7990, Best Loss:  1.7962, Counter: 1: 100%|██████████| 313/313 [00:03<00:00, 82.37it/s] 


Epoch 4 Train Loss:  1.5763, Validation Loss:  1.7990


5 T Loss:  1.1499, Avg Loss:  1.5119, Best Loss:  1.7962, Counter: 2: 100%|██████████| 938/938 [00:18<00:00, 49.89it/s]
5 V Loss:  1.6905, Avg Loss:  1.8095, Best Loss:  1.7962, Counter: 2: 100%|██████████| 313/313 [00:03<00:00, 89.36it/s] 


Epoch 5 Train Loss:  1.5119, Validation Loss:  1.8095


6 T Loss:  1.3726, Avg Loss:  1.4562, Best Loss:  1.7962, Counter: 3: 100%|██████████| 938/938 [00:19<00:00, 49.29it/s]
6 V Loss:  1.6323, Avg Loss:  1.8323, Best Loss:  1.7962, Counter: 3: 100%|██████████| 313/313 [00:03<00:00, 86.53it/s] 


Epoch 6 Train Loss:  1.4562, Validation Loss:  1.8323
Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=512, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_features=512, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p

100%|██████████| 938/938 [00:21<00:00, 42.72it/s]


Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=512, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_features=512, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0, inplace=False)
        (dropout2): Dropout(p=0, in

100%|██████████| 394/394 [00:09<00:00, 42.66it/s]


0,1
best_loss,▁
test_perplexity,▁
train_loss,█▅▄▃▂▂▁
train_perplexity,▁
validation_loss,█▃▁▁▁▂▃

0,1
best_loss,1.79625
test_perplexity,90.12837
train_loss,1.45622
train_perplexity,52.88513
validation_loss,1.83234


[34m[1mwandb[0m: Agent Starting Run: zrhlds67 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dim_feedforward: 2048
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	embedding_dim: 200
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: CrossEntropyLoss
[34m[1mwandb[0m: 	max_len: 50
[34m[1mwandb[0m: 	nhead: 4
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	test_len: 14000
[34m[1mwandb[0m: 	train_len: 30000
[34m[1mwandb[0m: 	validation_len: 10000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


16244
Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.4, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=2048, bias=True)
        (dropout): Dropout(p=0.4, inplace=False)
        (linear2): Linear(in_features=2048, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.4, inplace=False)
        (dropout2): D

0 T Loss:  1.6567, Avg Loss:  2.2546, Best Loss:     inf, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.99it/s]
  return torch._native_multi_head_attention(
0 V Loss:  2.1350, Avg Loss:  2.0256, Best Loss:     inf, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 110.82it/s]


Epoch 0 Train Loss:  2.2546, Validation Loss:  2.0256


1 T Loss:  2.2766, Avg Loss:  2.0359, Best Loss:  2.0256, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 49.26it/s]
1 V Loss:  1.7964, Avg Loss:  1.9598, Best Loss:  2.0256, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 110.74it/s]


Epoch 1 Train Loss:  2.0359, Validation Loss:  1.9598


2 T Loss:  1.7111, Avg Loss:  1.9797, Best Loss:  1.9598, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.66it/s]
2 V Loss:  2.0192, Avg Loss:  1.9298, Best Loss:  1.9598, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 111.37it/s]


Epoch 2 Train Loss:  1.9797, Validation Loss:  1.9298


3 T Loss:  2.3892, Avg Loss:  1.9457, Best Loss:  1.9298, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.81it/s]
3 V Loss:  2.3045, Avg Loss:  1.9160, Best Loss:  1.9298, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 106.66it/s]


Epoch 3 Train Loss:  1.9457, Validation Loss:  1.9160


4 T Loss:  1.6891, Avg Loss:  1.9219, Best Loss:  1.9160, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 49.22it/s]
4 V Loss:  1.5383, Avg Loss:  1.9079, Best Loss:  1.9160, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 106.85it/s]


Epoch 4 Train Loss:  1.9219, Validation Loss:  1.9079


5 T Loss:  1.8266, Avg Loss:  1.9036, Best Loss:  1.9079, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.57it/s]
5 V Loss:  1.2864, Avg Loss:  1.8956, Best Loss:  1.9079, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 104.68it/s]


Epoch 5 Train Loss:  1.9036, Validation Loss:  1.8956


6 T Loss:  2.1637, Avg Loss:  1.8893, Best Loss:  1.8956, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.22it/s]
6 V Loss:  1.9505, Avg Loss:  1.8940, Best Loss:  1.8956, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 106.97it/s]


Epoch 6 Train Loss:  1.8893, Validation Loss:  1.8940


7 T Loss:  2.1301, Avg Loss:  1.8769, Best Loss:  1.8940, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.45it/s]
7 V Loss:  2.1939, Avg Loss:  1.8923, Best Loss:  1.8940, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 104.87it/s]


Epoch 7 Train Loss:  1.8769, Validation Loss:  1.8923


8 T Loss:  1.8120, Avg Loss:  1.8660, Best Loss:  1.8923, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.62it/s]
8 V Loss:  2.0343, Avg Loss:  1.8870, Best Loss:  1.8923, Counter: 0: 100%|██████████| 625/625 [00:06<00:00, 103.97it/s]


Epoch 8 Train Loss:  1.8660, Validation Loss:  1.8870


9 T Loss:  1.7911, Avg Loss:  1.8573, Best Loss:  1.8870, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.52it/s]
9 V Loss:  1.9339, Avg Loss:  1.8847, Best Loss:  1.8870, Counter: 0: 100%|██████████| 625/625 [00:06<00:00, 101.95it/s]


Epoch 9 Train Loss:  1.8573, Validation Loss:  1.8847


10 T Loss:  2.6289, Avg Loss:  1.8501, Best Loss:  1.8847, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.16it/s]
10 V Loss:  1.6245, Avg Loss:  1.8813, Best Loss:  1.8847, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 107.63it/s]


Epoch 10 Train Loss:  1.8501, Validation Loss:  1.8813


11 T Loss:  2.5782, Avg Loss:  1.8429, Best Loss:  1.8813, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.99it/s]
11 V Loss:  2.1850, Avg Loss:  1.8836, Best Loss:  1.8813, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 108.31it/s]


Epoch 11 Train Loss:  1.8429, Validation Loss:  1.8836


12 T Loss:  2.6148, Avg Loss:  1.8370, Best Loss:  1.8813, Counter: 1: 100%|██████████| 1875/1875 [00:38<00:00, 48.25it/s]
12 V Loss:  1.6484, Avg Loss:  1.8809, Best Loss:  1.8813, Counter: 1: 100%|██████████| 625/625 [00:05<00:00, 108.72it/s]


Epoch 12 Train Loss:  1.8370, Validation Loss:  1.8809


13 T Loss:  1.7679, Avg Loss:  1.8314, Best Loss:  1.8809, Counter: 0: 100%|██████████| 1875/1875 [00:38<00:00, 48.60it/s]
13 V Loss:  1.8018, Avg Loss:  1.8837, Best Loss:  1.8809, Counter: 0: 100%|██████████| 625/625 [00:05<00:00, 105.01it/s]


Epoch 13 Train Loss:  1.8314, Validation Loss:  1.8837


14 T Loss:  1.9653, Avg Loss:  1.8258, Best Loss:  1.8809, Counter: 1: 100%|██████████| 1875/1875 [00:38<00:00, 48.71it/s]
14 V Loss:  2.3570, Avg Loss:  1.8854, Best Loss:  1.8809, Counter: 1: 100%|██████████| 625/625 [00:05<00:00, 106.00it/s]


Epoch 14 Train Loss:  1.8258, Validation Loss:  1.8854


15 T Loss:  1.8534, Avg Loss:  1.8215, Best Loss:  1.8809, Counter: 2: 100%|██████████| 1875/1875 [00:38<00:00, 48.23it/s]
15 V Loss:  1.4732, Avg Loss:  1.8821, Best Loss:  1.8809, Counter: 2: 100%|██████████| 625/625 [00:06<00:00, 101.40it/s]


Epoch 15 Train Loss:  1.8215, Validation Loss:  1.8821


16 T Loss:  2.1245, Avg Loss:  1.8171, Best Loss:  1.8809, Counter: 3: 100%|██████████| 1875/1875 [00:38<00:00, 48.31it/s]
16 V Loss:  2.5289, Avg Loss:  1.8726, Best Loss:  1.8809, Counter: 3: 100%|██████████| 625/625 [00:07<00:00, 84.90it/s] 


Epoch 16 Train Loss:  1.8171, Validation Loss:  1.8726


17 T Loss:  2.4266, Avg Loss:  1.8124, Best Loss:  1.8726, Counter: 0: 100%|██████████| 1875/1875 [00:40<00:00, 46.03it/s]
17 V Loss:  2.4828, Avg Loss:  1.8819, Best Loss:  1.8726, Counter: 0: 100%|██████████| 625/625 [00:06<00:00, 96.63it/s] 


Epoch 17 Train Loss:  1.8124, Validation Loss:  1.8819


18 T Loss:  1.3710, Avg Loss:  1.8095, Best Loss:  1.8726, Counter: 1: 100%|██████████| 1875/1875 [00:41<00:00, 45.52it/s]
18 V Loss:  1.9615, Avg Loss:  1.8817, Best Loss:  1.8726, Counter: 1: 100%|██████████| 625/625 [00:06<00:00, 97.68it/s] 


Epoch 18 Train Loss:  1.8095, Validation Loss:  1.8817


19 T Loss:  1.5485, Avg Loss:  1.8059, Best Loss:  1.8726, Counter: 2: 100%|██████████| 1875/1875 [00:40<00:00, 46.81it/s]
19 V Loss:  1.6692, Avg Loss:  1.8805, Best Loss:  1.8726, Counter: 2: 100%|██████████| 625/625 [00:06<00:00, 103.40it/s]


Epoch 19 Train Loss:  1.8059, Validation Loss:  1.8805


20 T Loss:  2.0064, Avg Loss:  1.8025, Best Loss:  1.8726, Counter: 3: 100%|██████████| 1875/1875 [00:38<00:00, 48.13it/s]
20 V Loss:  1.5193, Avg Loss:  1.8841, Best Loss:  1.8726, Counter: 3: 100%|██████████| 625/625 [00:05<00:00, 105.19it/s]


Epoch 20 Train Loss:  1.8025, Validation Loss:  1.8841
Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.4, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=2048, bias=True)
        (dropout): Dropout(p=0.4, inplace=False)
        (linear2): Linear(in_features=2048, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dr

100%|██████████| 1875/1875 [00:26<00:00, 71.30it/s]


Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.4, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=2048, bias=True)
        (dropout): Dropout(p=0.4, inplace=False)
        (linear2): Linear(in_features=2048, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.4, inplace=False)
        (dropout2): Dropout

100%|██████████| 787/787 [00:11<00:00, 68.25it/s]


0,1
best_loss,▁
test_perplexity,▁
train_loss,█▅▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
train_perplexity,▁
validation_loss,█▅▄▃▃▂▂▂▂▂▁▂▁▂▂▁▁▁▁▁▂

0,1
best_loss,1.87263
test_perplexity,122.70458
train_loss,1.80247
train_perplexity,63.41482
validation_loss,1.88406


[34m[1mwandb[0m: Agent Starting Run: ajz2f6n2 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dim_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 100
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	loss: CrossEntropyLoss
[34m[1mwandb[0m: 	max_len: 50
[34m[1mwandb[0m: 	nhead: 4
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: 	test_len: 14000
[34m[1mwandb[0m: 	train_len: 30000
[34m[1mwandb[0m: 	validation_len: 10000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


16244


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=512, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_features=512, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0, inplace=False)
        (dropout2): Dropout(p=0, in

0 T Loss:  2.1434, Avg Loss:  3.2927, Best Loss:     inf, Counter: 0:   4%|▎         | 34/938 [00:01<00:21, 41.68it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x7f3d54f12c30>
Traceback (most recent call last):
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/tqdm/std.py", line 1193, in __iter__
    self.close()
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/tqdm/std.py", line 1287, in close
    fp_write('')
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/tqdm/std.py", line 1284, in fp_write
    self.fp.write(str(s))
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/tqdm/utils.py", line 127, in inner
    return func(*args, **kwargs)
  zmq.error.ZMQError: Socket operation on non-socket
Exception ignored in sys.unraisablehook: <built-in function unraisablehook>
Traceback (most recent call last):
  File "/home2/shu7bh/miniconda3/envs/main/lib/python3.11/site-packages/ipykernel/iost

In [None]:
# run_everything(cfg)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshu7bh[0m. Use [1m`wandb login --relogin`[0m to force relogin


16244
Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): D

0 T Loss:  1.8565, Avg Loss:  2.2712, Best Loss:     inf, Counter: 0: 100%|██████████| 938/938 [00:21<00:00, 42.79it/s]
  return torch._native_multi_head_attention(
0 V Loss:  2.1339, Avg Loss:  1.9637, Best Loss:     inf, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 84.72it/s] 


Epoch 0 Train Loss:  2.2712, Validation Loss:  1.9637


1 T Loss:  1.6589, Avg Loss:  1.9537, Best Loss:  1.9637, Counter: 0: 100%|██████████| 938/938 [00:20<00:00, 44.82it/s]
1 V Loss:  2.4867, Avg Loss:  1.8854, Best Loss:  1.9637, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 87.93it/s] 


Epoch 1 Train Loss:  1.9537, Validation Loss:  1.8854


2 T Loss:  1.7063, Avg Loss:  1.8783, Best Loss:  1.8854, Counter: 0: 100%|██████████| 938/938 [00:21<00:00, 44.65it/s]
2 V Loss:  2.2040, Avg Loss:  1.8460, Best Loss:  1.8854, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 87.25it/s] 


Epoch 2 Train Loss:  1.8783, Validation Loss:  1.8460


3 T Loss:  1.4080, Avg Loss:  1.8269, Best Loss:  1.8460, Counter: 0: 100%|██████████| 938/938 [00:21<00:00, 44.16it/s]
3 V Loss:  1.6040, Avg Loss:  1.8170, Best Loss:  1.8460, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 83.64it/s] 


Epoch 3 Train Loss:  1.8269, Validation Loss:  1.8170


4 T Loss:  1.5742, Avg Loss:  1.7869, Best Loss:  1.8170, Counter: 0: 100%|██████████| 938/938 [00:21<00:00, 44.66it/s]
4 V Loss:  1.5240, Avg Loss:  1.8022, Best Loss:  1.8170, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 88.82it/s] 


Epoch 4 Train Loss:  1.7869, Validation Loss:  1.8022


5 T Loss:  2.0394, Avg Loss:  1.7555, Best Loss:  1.8022, Counter: 0: 100%|██████████| 938/938 [00:21<00:00, 43.43it/s]
5 V Loss:  1.6512, Avg Loss:  1.7921, Best Loss:  1.8022, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 87.46it/s] 


Epoch 5 Train Loss:  1.7555, Validation Loss:  1.7921


6 T Loss:  1.7619, Avg Loss:  1.7283, Best Loss:  1.7921, Counter: 0: 100%|██████████| 938/938 [00:20<00:00, 45.01it/s]
6 V Loss:  1.7325, Avg Loss:  1.7899, Best Loss:  1.7921, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 89.97it/s] 


Epoch 6 Train Loss:  1.7283, Validation Loss:  1.7899


7 T Loss:  1.6470, Avg Loss:  1.7046, Best Loss:  1.7899, Counter: 0: 100%|██████████| 938/938 [00:21<00:00, 44.46it/s]
7 V Loss:  1.9572, Avg Loss:  1.7870, Best Loss:  1.7899, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 93.30it/s] 


Epoch 7 Train Loss:  1.7046, Validation Loss:  1.7870


8 T Loss:  1.7151, Avg Loss:  1.6844, Best Loss:  1.7870, Counter: 0: 100%|██████████| 938/938 [00:21<00:00, 44.61it/s]
8 V Loss:  1.9002, Avg Loss:  1.7872, Best Loss:  1.7870, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 86.72it/s] 


Epoch 8 Train Loss:  1.6844, Validation Loss:  1.7872


9 T Loss:  1.4786, Avg Loss:  1.6678, Best Loss:  1.7870, Counter: 1: 100%|██████████| 938/938 [00:20<00:00, 45.20it/s]
9 V Loss:  2.3662, Avg Loss:  1.7888, Best Loss:  1.7870, Counter: 1: 100%|██████████| 313/313 [00:03<00:00, 92.78it/s] 


Epoch 9 Train Loss:  1.6678, Validation Loss:  1.7888


10 T Loss:  2.1462, Avg Loss:  1.6521, Best Loss:  1.7870, Counter: 2: 100%|██████████| 938/938 [00:20<00:00, 44.83it/s]
10 V Loss:  1.4500, Avg Loss:  1.7863, Best Loss:  1.7870, Counter: 2: 100%|██████████| 313/313 [00:03<00:00, 89.57it/s] 


Epoch 10 Train Loss:  1.6521, Validation Loss:  1.7863


11 T Loss:  1.3926, Avg Loss:  1.6385, Best Loss:  1.7863, Counter: 0: 100%|██████████| 938/938 [00:20<00:00, 44.77it/s]
11 V Loss:  1.6273, Avg Loss:  1.7903, Best Loss:  1.7863, Counter: 0: 100%|██████████| 313/313 [00:03<00:00, 91.56it/s] 


Epoch 11 Train Loss:  1.6385, Validation Loss:  1.7903


12 T Loss:  2.0145, Avg Loss:  1.6270, Best Loss:  1.7863, Counter: 1: 100%|██████████| 938/938 [00:20<00:00, 44.83it/s]
12 V Loss:  2.3817, Avg Loss:  1.7931, Best Loss:  1.7863, Counter: 1: 100%|██████████| 313/313 [00:03<00:00, 91.87it/s] 


Epoch 12 Train Loss:  1.6270, Validation Loss:  1.7931


13 T Loss:  1.8187, Avg Loss:  1.6173, Best Loss:  1.7863, Counter: 2: 100%|██████████| 938/938 [00:21<00:00, 44.53it/s]
13 V Loss:  1.6690, Avg Loss:  1.7945, Best Loss:  1.7863, Counter: 2: 100%|██████████| 313/313 [00:03<00:00, 92.87it/s] 


Epoch 13 Train Loss:  1.6173, Validation Loss:  1.7945


14 T Loss:  1.2294, Avg Loss:  1.6061, Best Loss:  1.7863, Counter: 3: 100%|██████████| 938/938 [00:20<00:00, 45.60it/s]
14 V Loss:  1.6619, Avg Loss:  1.7976, Best Loss:  1.7863, Counter: 3: 100%|██████████| 313/313 [00:03<00:00, 84.71it/s] 


Epoch 14 Train Loss:  1.6061, Validation Loss:  1.7976
Decoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dr

100%|██████████| 394/394 [00:10<00:00, 38.32it/s]


0,1
best_loss,▁
test_perplexity,▁
train_loss,█▅▄▃▃▃▂▂▂▂▁▁▁▁▁
validation_loss,█▅▃▂▂▁▁▁▁▁▁▁▁▁▁

0,1
best_loss,1.78628
test_perplexity,98.46982
train_loss,1.60607
validation_loss,1.79763


In [None]:
# # Disabled cell, kept for reference: standalone setup that reads embedding_dim,
# # batch_size and max_len from the config, builds `Emb` with pp.create_vocab and
# # creates the train/validation/test dataloaders with load_data.
# # NOTE(review): needs `train_sents` (from the pp.get_sents cell) and `load_data`
# # (presumably defined in an earlier cell -- confirm) before it can be re-enabled.
# from part_3_config import config as cfg
# import preprocess as pp

# embedding_dim = cfg['parameters']['embedding_dim']['value']
# batch_size = cfg['parameters']['batch_size']['value']
# max_len = cfg['parameters']['max_len']['value']

# Emb = pp.create_vocab(train_sents, embedding_dim)
# train_dataloader, validation_dataloader, test_dataloader = load_data(Emb, batch_size, pp.device, max_len)

In [None]:
# # Disabled cell: would call get_all_perplexity_vals on the test dataloader.
# # NOTE(review): requires `test_dataloader`, `cfg` and `Emb` to exist in the kernel,
# # and get_all_perplexity_vals to be defined elsewhere in the notebook -- confirm
# # before re-enabling.
# get_all_perplexity_vals(test_dataloader, cfg, Emb)

In [None]:
# # Disabled cell, kept for reference: greedy text generation from the saved checkpoint.
# # It builds a Decoder from the config hyperparameters, loads 'best_model.pth' from
# # `dir`, and then, for 10 steps, re-encodes the growing prompt `q`, takes the argmax
# # token at the last prompt position and appends that word to `q`.
# # NOTE(review): requires `Emb`, `Decoder`, `cfg`, `pp`, `os`, `torch` and `dir` to
# # already exist in the kernel.
# nhead = cfg['parameters']['nhead']['value']
# dim_feedforward = cfg['parameters']['dim_feedforward']['value']
# num_layers = cfg['parameters']['num_layers']['value']
# dropout = cfg['parameters']['dropout']['value']
# max_len = cfg['parameters']['max_len']['value']

# best_model = Decoder(Emb, nhead, dim_feedforward, num_layers, dropout, max_len, pp.device).to(pp.device)
# print(best_model)
# best_pth = os.path.join(dir, 'best_model.pth')

# # generate a sentence from the model

# # NOTE(review): the prompt already ends with a space and every step appends
# # ' ' + word, so `q` contains a double space after the seed text; harmless only if
# # pp.get_sentence_index ignores repeated whitespace -- confirm.
# q = 'my name is '
# print(q, end=' ')

# # NOTE(review): torch.load is called without map_location, so a checkpoint saved
# # on GPU will not load on a CPU-only machine; consider map_location=pp.device.
# best_model.load_state_dict(torch.load(best_pth))
# best_model.eval()

# with torch.no_grad():
#     for i in range(10):
#         e = pp.get_sentence_index(q, Emb)
#         # presumably drops a trailing end-of-sentence index added by
#         # pp.get_sentence_index -- TODO confirm
#         e = e[:-1]
#         # print(e)
#         # Right-pad the prompt indices with the 'pad' index up to max_len.
#         # NOTE(review): max_len - len(e) becomes negative once the prompt is longer
#         # than max_len, and torch.empty rejects negative sizes. X is also 1-D and
#         # created on the default device while the model was moved to pp.device and
#         # the output is indexed as [0][pos] below -- confirm Decoder.forward adds
#         # the batch dimension and moves the input itself.
#         X = torch.cat((e, torch.empty(max_len - len(e), dtype=torch.long).fill_(Emb.key_to_index['pad'])))
#         # print (X)
#         Y_pred = best_model(X)
#         # print(Y_pred.shape)
#         # Keep only the scores at the last real prompt position.
#         Y_pred = Y_pred[0][len(e) - 1]
#         # softmax does not change the argmax; it is only needed for the
#         # multinomial sampling alternative on the next line.
#         Y_pred = torch.softmax(Y_pred, dim=-1)
#         # Y_pred = torch.multinomial(Y_pred, num_samples=1)
#         Y_pred = torch.argmax(Y_pred)


#         # print(Y_pred.shape)

#         # for j in range(len(e)):
#         #     a = torch.softmax(Y_pred[0, j], dim=0)
#         #     # b = torch.multinomial(a, num_samples=1)
#         #     b = torch.argmax(a)
#         #     print(Emb.index_to_key[e[j].item()], Emb.index_to_key[b.item()])
#         # print (Y_pred)
#         # Y_pred = Y_pred[0, len(e) - 1]
#         # # Y_pred[len(e)]
#         # Y_pred = torch.softmax(Y_pred, dim=-1)
#         # print(Y_pred)
#         # Y_pred = torch.multinomial(Y_pred, num_samples=1)

#         # Append the predicted word so the next step conditions on it.
#         q += ' ' + Emb.index_to_key[Y_pred.item()]
#         # print the word
#         print(Emb.index_to_key[Y_pred.item()], end=' ')