In [None]:
## to run in colab

# !git clone https://github.com/sergeychuvakin/advanced_nlp_course.git
# !mv advanced_nlp_course/LM/*.py ./
# !pip install loguru pydantic tokenizers

In [7]:
!pip freeze | egrep "pydantic|torch|loguru|tokenizers|requests|nltk|tqdm"

loguru==0.5.3
nltk==3.6.2
pydantic==1.8.2
requests==2.25.1
requests-oauthlib==1.3.0
tokenizers==0.10.3
torch==1.9.0
torchtext==0.6.0
tqdm==4.59.0


In [1]:
%reload_ext autoreload
%autoreload 2

import torch
from torch.utils.data import DataLoader
from loguru import logger
import sys
import json


from dependencies import corpus, tokenizer
from config import Config, LanguageModelConfig
from processing_utils import (
    clean_text, 
    split_on_sequences, 
    create_ngrams, 
    create_to_x_and_y, 
    word2int,
    create_vocab,
    save_artifacts
)
from model import LM_LSTM
from datahandler import LMDataset
from train_utils import train_model

config = Config()

logger.remove()
logger.add(sys.stderr, level="WARNING")

1

In [None]:
corpus = clean_text(corpus)
corpus = split_on_sequences(corpus)

tcorpus = tokenizer.encode_batch(corpus)
#tcorpus = tuple(map(lambda sentence: tokenizer.encode(sentence), corpus))

## create n-grams for each doc
sq = create_ngrams(tcorpus, config.N_GRAM)

## shift corpus to create x and y 
x, y =  create_to_x_and_y(sq)

id_token, token_id = create_vocab(tokenizer)
vocab_size = len(token_id)

# # for passing to dataloader
# x_int = [word2int(i, token_id) for i in x]
# y_int = [word2int(i, token_id) for i in y]

## split data
tradeoff_index = int(len(x) * config.TRAIN_PROPORTION)

x_train = x[:tradeoff_index]
x_test = x[tradeoff_index:]

y_train = y[:tradeoff_index]
y_test = y[tradeoff_index:]

logger.warning(f"Outpur shapes: x_train: {len(x_train)}, x_test: {len(x_test)}, y_train: {len(y_train)}, y_test: {len(y_test)}")

## load to dataset and dataloader
train_ds = LMDataset(x_train, y_train)
test_ds = LMDataset(x_test, y_test)

train_dl = DataLoader(train_ds, batch_size=config.BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=config.BATCH_SIZE, shuffle=False)

# model and model config
model_config = LanguageModelConfig(vocab_size=vocab_size, emb_size=300)
# with open(config.SAVE_MODEL_CONFIG, "w") as f: 
#     json.dump(model_config.dict(), f)
model = LM_LSTM(**model_config.dict(), logger=logger)

## save artifacts
save_artifacts(
    (model_config.dict(), config.SAVE_MODEL_CONFIG),
    (token_id, config.SAVE_TOKEN_ID),
    (id_token, config.SAVE_ID_TOKEN)
)

optimizer = torch.optim.Adam(model.parameters(), lr=model_config.lr)
loss_func = torch.nn.CrossEntropyLoss()

# train model 
tmodel = train_model(model,
                     train_dl,
                     optimizer=optimizer,
                     loss_func=loss_func,
                     batch_size=config.BATCH_SIZE,
                     epochs=30, 
                     clip=1)

torch.save(model.state_dict(), config.SAVE_MODEL_FNAME)

### Inference

In [13]:
import torch
from config import LanguageModelConfig, Config
from model import LM_LSTM
from loguru import logger
from processing_utils import load_artifact
from dependencies import tokenizer

## usefull utils
logger.remove()
logger.add(sys.stderr, level="WARNING")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## load artifacts
model_config = LanguageModelConfig.parse_file(Config.SAVE_MODEL_CONFIG)
model_config.vocab_size = 12737
token_id = load_artifact(Config.SAVE_TOKEN_ID)
id_token = load_artifact(Config.SAVE_ID_TOKEN)

## load trained model
model = LM_LSTM(**model_config.dict(), logger=logger)
model.load_state_dict(
    torch.load(Config.SAVE_MODEL_FNAME, map_location=device)
)
model.eval()

LM_LSTM(
  (emb_layer): Embedding(12737, 300)
  (lstm): LSTM(300, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=12737, bias=True)
)

In [9]:
tokenizer.encode("hello").ids

[101, 7592, 102]

In [10]:
def transform_raw_word(word, tokenizer):
    int_word = tokenizer.encode("hello").ids
    return torch.tensor([[int_word]])

In [15]:
model(transform_raw_word("hello", tokenizer), None)

RuntimeError: input must have 3 dimensions, got 4

In [60]:
from tokenizers import BertWordPieceTokenizer

help(BertWordPieceTokenizer)

Help on class BertWordPieceTokenizer in module tokenizers.implementations.bert_wordpiece:

class BertWordPieceTokenizer(tokenizers.implementations.base_tokenizer.BaseTokenizer)
 |  BertWordPieceTokenizer(vocab: Union[str, Dict[str, int], NoneType] = None, unk_token: Union[str, tokenizers.AddedToken] = '[UNK]', sep_token: Union[str, tokenizers.AddedToken] = '[SEP]', cls_token: Union[str, tokenizers.AddedToken] = '[CLS]', pad_token: Union[str, tokenizers.AddedToken] = '[PAD]', mask_token: Union[str, tokenizers.AddedToken] = '[MASK]', clean_text: bool = True, handle_chinese_chars: bool = True, strip_accents: Union[bool, NoneType] = None, lowercase: bool = True, wordpieces_prefix: str = '##')
 |  
 |  Bert WordPiece Tokenizer
 |  
 |  Method resolution order:
 |      BertWordPieceTokenizer
 |      tokenizers.implementations.base_tokenizer.BaseTokenizer
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, vocab: Union[str, Dict[str, int], NoneType] = None, unk_toke

In [58]:
"[UNK]" in token_id

False

In [59]:
transform_raw_word("hell", token_id)

KeyError: '[UNK]'

In [27]:
# predict next token
def predict(net, tkn, h=None):

    # tensor inputs
    x = np.array([[token2int[tkn]]])
    inputs = torch.from_numpy(x)

    # push to GPU
    inputs = inputs.cuda()

    # detach hidden state from history
    h = tuple([each.data for each in h])

    # get the output of the model
    out, h = net(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1).data

    p = p.cpu()

    p = p.numpy()
    p = p.reshape(p.shape[1],)

    # get indices of top 3 values
    top_n_idx = p.argsort()[-3:][::-1]

    # randomly select one of the three indices
    sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]

    # return the encoded value of the predicted char and the hidden state
    return int2token[sampled_token_index], h

In [27]:
# function to generate text
def sample(net, size, prime='it is'):

    # push to GPU
    net.cuda()

    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    toks = prime.split()

    # predict next token
    for t in prime.split():
        token, h = predict(net, t, h)

    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

device(type='cpu')