In [1]:
from preprocess import *
import numpy as np
import wandb
import torch
import pandas as pd
import os

In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if DEVICE.type == 'cuda':
    # cuBLAS workspace setting (":4096:2" is the other value noted by the author).
    # NOTE(review): presumably set for deterministic cuBLAS kernels — confirm.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
print(DEVICE)

cuda


In [3]:
# Fetch the saved model artifact and the hyperparameters of the run that
# produced it through the public W&B API (no new W&B run is started here).
api = wandb.Api()

# NOTE: despite its name, `wrun` holds the artifact handle, not a run.
wrun = api.artifact('shu7bh/ELMO/best_model_3y4z0vh5:v0', type='model')
artifact_dir = wrun.download()  # local directory containing the checkpoint file

cfg = api.run('shu7bh/ELMO/3y4z0vh5').config

[34m[1mwandb[0m: Downloading large artifact best_model_3y4z0vh5:v0, 569.43MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:1.3


In [4]:
# Display the hyperparameter config fetched from the W&B run.
cfg

{'epochs': 100,
 'dropout': 0,
 'optimizer': 'Adam',
 'hidden_dim': 1000,
 'num_layers': 2,
 'dev_train_len': 25000,
 'embedding_dim': 100,
 'learning_rate': 0.001,
 'dev_validation_len': 5000}

In [5]:
# Hyperparameters are taken from the config logged with the evaluated run.
DEV_TRAIN_LEN = cfg['dev_train_len']
DEV_VALIDATION_LEN = cfg['dev_validation_len']
HIDDEN_DIM = cfg['hidden_dim']
DROP_OUT = cfg['dropout']
LEARNING_RATE = cfg['learning_rate']
EPOCHS = cfg['epochs']
# The misspelled name is kept on purpose: other cells reference EMBEDDITNG_DIM.
EMBEDDITNG_DIM = cfg['embedding_dim']
NUM_LAYERS = cfg['num_layers']

# Batch size is 32 for hidden sizes 300 and 500, and 16 for every other size.
BATCH_SIZE = 32 if HIDDEN_DIM in (300, 500) else 16

DIR = '/scratch/shu7bh/RES/PRE'

# Echo every setting, one per line, in the order they were defined.
print(
    DEV_TRAIN_LEN,
    DEV_VALIDATION_LEN,
    HIDDEN_DIM,
    DROP_OUT,
    LEARNING_RATE,
    EPOCHS,
    EMBEDDITNG_DIM,
    NUM_LAYERS,
    BATCH_SIZE,
    DIR,
    sep='\n',
)

25000
5000
1000
0
0.001
100
100
2
16
/scratch/shu7bh/RES/PRE


In [6]:
import os
# Create the results directory if needed. exist_ok=True makes this safe to
# re-run and avoids the race between a separate existence check and creation.
os.makedirs(DIR, exist_ok=True)

In [7]:
# Split sizes used below to slice the shuffled data frame.
print(DEV_TRAIN_LEN, DEV_VALIDATION_LEN, sep='\n')

25000
5000


In [8]:
# Load the training CSV and shuffle its rows with a fixed seed so that the
# positional train/validation slices are reproducible.
df = (
    pd.read_csv('data/train.csv')
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# Run both preprocessing steps (helpers come from `preprocess`) on each description.
df['Description'] = (
    df['Description']
    .apply(tokenize_corpus)
    .apply(get_word_tokenized_corpus)
)

In [9]:
# Validation split: the DEV_VALIDATION_LEN rows (by position) that follow the
# first DEV_TRAIN_LEN rows of the shuffled frame. The training slice is not
# needed in this evaluation-only notebook.
dev_validation = df['Description'].iloc[DEV_TRAIN_LEN:DEV_TRAIN_LEN + DEV_VALIDATION_LEN]

In [10]:
# Preview the tokenised validation descriptions.
dev_validation

25000    [linux, is, gaining, ground, with, companies, ...
25001    [google, ,, the, internet, search, engine, ,, ...
25002    [ap, defending, champion, tiger, woods, said, ...
25003    [advanced, micro, devices, will, add, quot, ;,...
25004    [reuters, about, 100, tibetan, exiles, chanted...
                               ...                        
29995    [they, were, 4, 12, last, year, ., they, have,...
29996    [luke, donald, and, paul, casey, are, unlikely...
29997    [yahoo, is, eager, to, improve, its, web, base...
29998    [ankara, strasbourg, ,, france, reuters, prime...
29999    [leipzig, game, convention, in, germany, ,, th...
Name: Description, Length: 5000, dtype: object

In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from dataset import SentencesDataset
from elmo import ELMO
from torch import nn

In [12]:
class Collator:
    """Collate (sequence, length) pairs into padded next-token batches.

    Each call right-pads the sequences to a common length with the ``<pad>``
    index and returns the padded batch shifted by one position, so the target
    at every step is the token that follows the input token.
    """

    def __init__(self, Emb):
        # Index of the padding token in the vocabulary.
        self.pad_index = Emb.key_to_index['<pad>']

    def __call__(self, batch):
        """Return ``(inputs, targets, lengths)`` for one batch.

        ``inputs`` drops the last column of the padded batch, ``targets`` drops
        the first, and ``lengths`` are the original lengths minus one.
        """
        sequences, lengths = zip(*batch)
        padded = pad_sequence(sequences, batch_first=True, padding_value=self.pad_index)

        inputs = padded[:, :-1]
        targets = padded[:, 1:]
        shifted_lengths = torch.stack(lengths) - 1
        return inputs, targets, shifted_lengths

In [13]:
import tqdm

def run_epoch(model, dataloader, loss_fn):
    """Run one evaluation pass over ``dataloader`` and return the mean batch loss.

    The model is put in eval mode and no optimisation step is taken. Inputs and
    targets are moved to the global ``DEVICE``; the lengths tensor is passed to
    the model as delivered by the dataloader.

    Returns:
        The unweighted mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []

    progress = tqdm.tqdm(dataloader)
    for inputs, targets, lengths in progress:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

        # Flatten predictions to (N, last-dim) and targets to (N,) for the loss.
        logits = model(inputs, lengths)
        flat_logits = logits.reshape(-1, logits.shape[2])
        flat_targets = targets.reshape(-1)

        batch_loss = loss_fn(flat_logits, flat_targets).item()
        batch_losses.append(batch_loss)

        progress.set_description(f'Loss: {batch_loss:7.4f}, Avg Loss: {np.mean(batch_losses):7.4f}')

    return np.mean(batch_losses)

In [14]:
def validate(elmo, validation_dataloader, loss_fn):
    """Evaluate ``elmo`` on the validation data and print the mean loss.

    The pass runs with gradient tracking disabled. Nothing is returned; the
    loss is only printed.
    """
    with torch.no_grad():
        mean_loss = run_epoch(elmo, validation_dataloader, loss_fn)
    print(f'Validation Loss: {mean_loss:7.4f}')

In [110]:
# Vocabulary / embedding table built from every tokenised description.
# NOTE(review): presumably this must match the vocabulary used when the
# checkpoint was trained — confirm against the training notebook.
Emb = create_vocab(df['Description'], EMBEDDITNG_DIM)

# Only the validation split is evaluated in this notebook.
dev_validation_dataset = SentencesDataset(dev_validation, Emb)

collate_fn = Collator(Emb)

# shuffle=False: run_epoch averages per-batch mean losses, so a random batch
# composition would make the reported validation loss differ between runs.
# pin_memory is only useful (and only warning-free) when a GPU is in use.
validation_dataloader = DataLoader(
    dev_validation_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    pin_memory=DEVICE.type == 'cuda',
    num_workers=4,
)

In [112]:
# Instantiate the model from the vocabulary object and the run's hyperparameters;
# the trained weights are loaded from the downloaded checkpoint afterwards.
elmo = ELMO(Emb, HIDDEN_DIM, DROP_OUT, NUM_LAYERS)

In [114]:
# run.file("best_model.pth").download(replace=True, root=DIR)

In [115]:
# Show the local directory the model artifact was downloaded to.
artifact_dir

'./artifacts/best_model_3y4z0vh5:v0'

In [116]:
# Restore the trained weights. map_location='cpu' lets a checkpoint that was
# saved from a GPU load on a CPU-only machine as well; the model is moved to
# DEVICE in a separate step.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
elmo.load_state_dict(
    torch.load(os.path.join(artifact_dir, 'best_model.pth'), map_location='cpu')
)

# Replace the embedding layer with one built from the loaded weights via
# nn.Embedding.from_pretrained (frozen by default).
# NOTE(review): assumes the model's first registered parameter is the
# embedding matrix — confirm against the ELMO definition.
elmo_embeddings = list(elmo.parameters())[0].detach().cpu()
elmo.embedding = nn.Embedding.from_pretrained(elmo_embeddings.to(DEVICE))

In [86]:
# Move the whole model onto the selected device before evaluation.
elmo = elmo.to(DEVICE)

In [87]:
# Cross-entropy over next-token predictions; positions whose target is the
# <pad> index are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(
    ignore_index=Emb.key_to_index['<pad>'],
)
validate(elmo=elmo, validation_dataloader=validation_dataloader, loss_fn=loss_fn)

Loss:  7.8234, Avg Loss:  8.0386: 100%|██████████| 313/313 [00:11<00:00, 28.42it/s]

Validation Loss:  8.0386



