In [None]:
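## To run in Colab: uncomment the commands below to clone the course repo, move its helper modules/JSON files next to the notebook and install the dependencies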

# !git clone https://github.com/sergeychuvakin/advanced_nlp_course.git
# !mv advanced_nlp_course/LM/*.py ./
# !mv advanced_nlp_course/LM/*.json ./ ## 
# !pip install loguru pydantic tokenizers

In [132]:
!pip freeze | egrep "pydantic|torch|loguru|tokenizers|requests|nltk|tqdm"

loguru==0.5.3
nltk==3.6.2
pydantic==1.8.2
requests==2.25.1
requests-oauthlib==1.3.0
tokenizers==0.10.3
torch==1.9.0
torchtext==0.6.0
tqdm==4.59.0


In [135]:
%reload_ext autoreload
%autoreload 2

import torch
from torch.utils.data import DataLoader
from loguru import logger
import sys
import json


from dependencies import corpus, tokenizer
from config import Config, LanguageModelConfig
from processing_utils import (
    clean_text, 
    split_on_sequences, 
    create_ngrams, 
    create_to_x_and_y, 
    word2int,
    create_vocab,
    save_artifacts
)
from model import LM_LSTM
from datahandler import LMDataset
from train_utils import train_model

# Project-wide settings object; later cells read N_GRAM, BATCH_SIZE, SAVE_* etc. from it.
config = Config()

# Drop loguru's pre-installed handler, then attach a single stderr sink that
# only lets WARNING and above through.
logger.remove()
logger.add(sink=sys.stderr, level="WARNING")

19

In [None]:
## hyper-parameters of this run (previously inline literals further down)
EMB_SIZE = 300   # `emb_size` passed to LanguageModelConfig
EPOCHS = 5       # `epochs` passed to train_model
GRAD_CLIP = 1    # `clip` passed to train_model

## The imported `corpus` is deliberately NOT rebound: each processing stage gets
## its own name so that re-running this cell always starts from the raw corpus
## instead of feeding already-processed data back into clean_text.
corpus_clean = clean_text(corpus)
corpus_sequences = split_on_sequences(corpus_clean)

tcorpus = tokenizer.encode_batch(corpus_sequences)

## create n-grams for each doc
sq = create_ngrams(tcorpus, config.N_GRAM)

## shift corpus to create x and y
x, y = create_to_x_and_y(sq)

id_token, token_id = create_vocab(tokenizer)
vocab_size = len(token_id)

## split data: the first TRAIN_PROPORTION share goes to train, the rest to test
## (the cut is positional, nothing is shuffled before it)
tradeoff_index = int(len(x) * config.TRAIN_PROPORTION)

x_train = x[:tradeoff_index]
x_test = x[tradeoff_index:]

y_train = y[:tradeoff_index]
y_test = y[tradeoff_index:]

logger.warning(
    f"""
    Output shapes: 
        x_train: {len(x_train)}, 
        x_test: {len(x_test)}, 
        y_train: {len(y_train)}, 
        y_test: {len(y_test)}
    """
              )

## load to dataset and dataloader
train_ds = LMDataset(x_train, y_train)
test_ds = LMDataset(x_test, y_test)

## only the training batches are shuffled; test order stays fixed
train_dl = DataLoader(train_ds, batch_size=config.BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=config.BATCH_SIZE, shuffle=False)

# model and model config
model_config = LanguageModelConfig(vocab_size=vocab_size, emb_size=EMB_SIZE)
model = LM_LSTM(**model_config.dict(), logger=logger)

## save artifacts: each tuple is (object, destination path from config);
## the inference section reloads exactly these three files
save_artifacts(
    (model_config.dict(), config.SAVE_MODEL_CONFIG),
    (token_id, config.SAVE_TOKEN_ID),
    (id_token, config.SAVE_ID_TOKEN)
)

optimizer = torch.optim.Adam(model.parameters(), lr=model_config.lr)
loss_func = torch.nn.CrossEntropyLoss()


# train model
tmodel = train_model(model,
                     train_dl,
                     optimizer=optimizer,
                     loss_func=loss_func,
                     epochs=EPOCHS,
                     clip=GRAD_CLIP)

## NOTE(review): the weights are saved from `model`, not from the returned
## `tmodel` -- presumably train_model updates `model` in place; confirm.
torch.save(model.state_dict(), config.SAVE_MODEL_FNAME)

### Validation

In [146]:
# Log the validation metrics computed on the held-out loader.
# NOTE(review): `val_metrics` is not imported by any cell visible in this
# notebook -- presumably it lives in train_utils; confirm and add it to the
# import cell, otherwise this cell raises NameError on a fresh kernel.
# The `%` formatting below needs val_metrics to return a 2-tuple
# (cross-entropy, perplexity).
# NOTE(review): the recorded run reports a perplexity of ~30523, which equals
# the vocabulary size shown in the model repr, i.e. no better than a uniform
# guess -- worth verifying the training/validation code.
logger.warning(
    """
    Cross-Entropy: %f
    Perplexity: %f
    """ % 
    val_metrics(model, test_dl, token_id))

    Cross-Entropy: 10.326239
    Perpelxity: 30523.087891
    


### Inference

In [88]:
import sys

import torch
from config import LanguageModelConfig, Config
from model import LM_LSTM
from loguru import logger
from processing_utils import load_artifact
from dependencies import tokenizer

## useful utils
# `sys` is imported above so this section also runs on its own, without
# relying on the training section's import cell.
logger.remove()
logger.add(sys.stderr, level="WARNING")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## load artifacts written by the training section
model_config = LanguageModelConfig.parse_file(Config.SAVE_MODEL_CONFIG)
token_id = load_artifact(Config.SAVE_TOKEN_ID)
id_token = load_artifact(Config.SAVE_ID_TOKEN)

## load trained model
model = LM_LSTM(**model_config.dict(), logger=logger)
model.load_state_dict(
    torch.load(Config.SAVE_MODEL_FNAME, map_location=device)
)
# Inputs are moved to `device` by the inference helpers, so the model has to
# live there too; on a CPU-only machine this is a no-op.
# NOTE(review): assumes LM_LSTM.init_state creates its tensors on the same
# device -- confirm.
model.to(device)
# eval() disables dropout; as the last expression it also displays the model.
model.eval()

LM_LSTM(
  (emb_layer): Embedding(30523, 300)
  (lstm): LSTM(300, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=30523, bias=True)
)

In [89]:
from random import sample
def _transform_raw_word(word, token_id):
    """Map ``word`` to its id and wrap it in a 1x1 tensor placed on ``device``.

    Args:
        word: key to look up in ``token_id``.
        token_id: mapping from token to integer id.

    Returns:
        ``torch.tensor([[id]])`` moved to the module-level ``device``. When
        ``word`` is not a key of ``token_id`` the id stored under
        ``Config.TOKEN_UNKNOWN`` is used instead.
    """
    unknown_id = token_id.get(Config.TOKEN_UNKNOWN)
    word_id = token_id.get(word, unknown_id)
    batch = torch.tensor([[word_id]])
    return batch.to(device)

def _get_model_output(model, input_tensor, hidden_state=None):
    if not hidden_state:
        return model(input_tensor, model.init_state(1))
    else:
        return model(input_tensor, hidden_state)

def _transform_model_output(out, random, id_token, top=3):
    
    if random:
        idx = sample(list(torch.topk(out, 3).indices[0]), 1)[0].item()
        
    else:
        idx = torch.topk(out, top).indices[0][0].item()
    return id_token[str(idx)]

    

def predict_one_word(
    word,
    model,
    token_id,
    id_token,
    random=True
):
    """Predict the token that follows ``word``.

    ``word`` is encoded with ``_transform_raw_word``, pushed through ``model``
    starting from a fresh state, and the scores are decoded with
    ``_transform_model_output`` (its default ``top`` is used).

    Args:
        word: key looked up in ``token_id``.
        model: model handed to ``_get_model_output``.
        token_id: token -> id mapping.
        id_token: id -> token mapping.
        random: forwarded to ``_transform_model_output``.

    Returns:
        The decoded token.
    """
    # the state returned by the model is not needed for a single step
    scores, _ = _get_model_output(model, _transform_raw_word(word, token_id))
    return _transform_model_output(scores, random, id_token)

def predict_sample(
    word,
    model,
    token_id,
    id_token,
    length_of_sample,
    random=True
):
    """Generate ``length_of_sample`` tokens, starting from ``word``.

    Each step encodes the current word, runs the model with the state carried
    over from the previous step, decodes one token and feeds that token back in
    as the next input.

    Args:
        word: seed word, looked up in ``token_id``.
        model: model handed to ``_get_model_output``.
        token_id: token -> id mapping.
        id_token: id -> token mapping.
        length_of_sample: number of tokens to generate.
        random: forwarded to ``_transform_model_output``.

    Returns:
        The generated tokens joined by single spaces (the seed is not included).
    """
    result = []
    hidden_state = None
    current_word = word
    # Inference only: without no_grad the state carried between steps would
    # keep the autograd graph of every previous step alive.
    with torch.no_grad():
        while len(result) < length_of_sample:
            input_tensor = _transform_raw_word(current_word, token_id)
            scores, hidden_state = _get_model_output(model, input_tensor, hidden_state)
            # Feed the decoded token back in. Previously the raw score tensor
            # was used as the next "word", which is never a key of token_id,
            # so every step after the first ran on the unknown token.
            current_word = _transform_model_output(scores, random, id_token)
            result.append(current_word)
    return " ".join(result)

In [101]:
# Greedy (random=False) next-token prediction.
# NOTE(review): the whole phrase is passed as a single `word`, so it is looked
# up in token_id as one key; unless that exact string is a vocabulary entry the
# prediction is made from the unknown token -- confirm this is intended.
predict_one_word(
    word="one of the",
    model=model,
    token_id=token_id,
    id_token=id_token,
    random=False,
)

','

In [106]:
# Sample a 10-token continuation seeded with "one"; random=True picks among
# the best-scoring candidates at each step, so the output varies between runs.
predict_sample(
    word="one",
    model=model,
    token_id=token_id,
    id_token=id_token,
    length_of_sample=10,
    random=True,
)

'. the , . , , , the , the'