In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle

import torch
import torch.nn as nn
from torch.optim import lr_scheduler

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

In [42]:
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "models/ELMo/options.json"
weight_file = "models/ELMo/model.hdf5"

# Compute two different representation for each token.
# Each representation is a linear weighted combination for the
# 3 layers in ELMo (i.e., charcnn, the outputs of the two BiLSTM))
elmo = Elmo(options_file, weight_file, 2)

In [43]:
# use batch_to_ids to convert sentences to character ids
sentences = ["Привет, мир!", "Привет, мир!"]
character_ids = batch_to_ids(sentences)

embeddings = elmo(character_ids)

In [45]:
embeddings['elmo_representations']

[tensor([[[-0.3252,  0.1442, -0.0000,  ..., -0.0000, -0.0624,  0.1152],
          [-0.0000,  0.0000, -0.0000,  ..., -0.0000,  0.0000,  0.0000],
          [-0.0000, -0.4533, -1.5221,  ..., -0.0000,  0.0000, -0.0000],
          ...,
          [-1.0777,  0.0000, -1.7911,  ..., -0.1676,  0.5619, -0.5261],
          [-0.0000,  0.0000, -2.1429,  ...,  0.0000,  0.0000, -0.2931],
          [-0.6248,  0.6789, -1.4321,  ..., -0.0000,  0.0000, -0.4447]],
 
         [[-0.0000,  0.1442, -0.2452,  ..., -0.0000, -0.0624,  0.0000],
          [-0.0000,  0.9385, -1.2484,  ..., -0.8439,  0.0000,  0.3235],
          [-0.0000, -0.4533, -0.0000,  ..., -0.0000,  0.0000, -0.1018],
          ...,
          [-0.0000,  0.3273, -0.0000,  ..., -0.1676,  0.5619, -0.5261],
          [-0.0000,  1.1391, -0.0000,  ...,  0.0000,  0.0000, -0.2931],
          [-0.6248,  0.6789, -1.4321,  ..., -0.7854,  0.0000, -0.0000]]],
        grad_fn=<MulBackward0>),
 tensor([[[-0.0000,  0.1442, -0.2452,  ..., -0.9485, -0.0624,  0.000

In [15]:
elmo_intro = """
Extensive experiments demonstrate that ELMo representations work extremely well in practice.
We first show that they can be easily added to existing models for six diverse and challenging language understanding problems, including textual entailment, question answering and sentiment analysis.
The addition of ELMo representations alone significantly improves the state of the art in every case, including up to 20% relative error reductions.
For tasks where direct comparisons are possible, ELMo outperforms CoVe (McCann et al., 2017), which computes contextualized representations using a neural machine translation encoder.
Finally, an analysis of both ELMo and CoVe reveals that deep representations outperform those derived from just the top layer of an LSTM.
Our trained models and code are publicly available, and we expect that ELMo will provide similar gains for many other NLP problems.
"""

In [24]:
len(elmo_intro)

898

In [23]:
batch_to_ids([elmo_intro]).size()

torch.Size([1, 898, 50])

In [25]:
class DatasetClass(torch.utils.data.Dataset):
    """
    Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all
    sequences to the max length.
    The SentenceBertEncoder.smart_batching_collate is required for this to work.
    SmartBatchingDataset does *not* work without it.
    """
    def __init__(self, features: np.array, target: np.array):
        """
        Create a new SentencesDataset with the tokenized texts and the labels as Tensor
        """
        self.tokenizer = batch_to_ids
        self.features = features
        self.target = target

    def collate_fn(self, batch):
        """
        Transforms a batch from a SmartBatchingDataset to a batch of tensors for the model
        :param batch:
            a batch from a SmartBatchingDataset
        :return:
            a batch of tensors for the model
        """
        features, labels = [], []
        for feature, label in batch:
            features.append(feature)
            labels.append(label)

        return self.tokenizer(features), torch.argmax(torch.tensor(labels, dtype=torch.long), dim=1)

    def __getitem__(self, item):
        return self.features[item], self.target[item]

    def __len__(self):
        return len(self.features)

In [27]:
df = pd.read_parquet('data/val.parquet')

ds = DatasetClass(features=df['question'].to_numpy(), target=df.drop('question', axis=1).to_numpy())

In [29]:
loader = torch.utils.data.DataLoader(ds, collate_fn=ds.collate_fn,
                                     batch_size=2, num_workers=4, shuffle=True)

In [70]:
for batch in loader:
    #print(batch[0])
    #print(batch[1])
    print(elmo(batch[0])['elmo_representations'][0].size())
    break

torch.Size([2, 481, 1024])


In [67]:
elmo(x)['elmo_representations'][0].size()

torch.Size([2, 139, 1024])

In [48]:
elmo._elmo_lstm.get_output_dim()

1024

In [54]:
from argparse import Namespace

def get_config(workdir=None, num_classes=None, weigths=0, batch_size=16):
    return Namespace(num_epochs = 5,
                    batch_size = batch_size,
                    dropout_prob=0.1,
                    num_classes=num_classes,
                    lr=3e-5,
                    weigths=torch.tensor(weigths),
                    options_file = "models/ELMo/options.json",
                    weight_file = "models/ELMo/model.hdf5",
                    train_file_path = workdir+'data/train.parquet',
                    val_file_path = workdir+'data/val.parquet')

In [72]:
class ElmoPooler(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

        nn.init.xavier_normal(self.dense.weight)
        nn.init.normal_(self.dense.bias, std=0.3)

    def forward(self, elmo_repr):
        # pass elmo_repr and mean pool all tokens
        first_token_tensor = self.pool(elmo_repr.permute(0, 2, 1)).squeeze()
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

In [75]:
class ModelClass(pl.LightningModule):
    def __init__(self, hparams):
        super(ModelClass, self).__init__()
        self.hparams = hparams

        #define model layers
        self.elmo = Elmo(hparams.options_file, hparams.weight_file, num_output_representations=1, requires_grad=True, 
                         do_layer_norm=True)
        self.pooler = ElmoPooler(self.elmo._elmo_lstm.get_output_dim())
        self.drop = nn.Dropout(hparams.dropout_prob)
        self.lin = nn.Linear(self.elmo._elmo_lstm.get_output_dim(), hparams.num_classes)
        #define loss, metric and softmax
        self.soft = nn.Softmax(dim=1)
        self.loss_fn = nn.CrossEntropyLoss(weight=hparams.weigths, ignore_index=0) #ignoring PAD index
        self.auc = roc_auc_score
    
    def forward(self, input):
        #get sentence embeddings
        embs = self.pooler(self.elmo(input)['elmo_representations'][0])
        logits = nn.functional.leaky_relu(self.lin(self.drop(embs)))

        print(logits.size())
        return logits
    
    def prepare_data(self):
        df_train = pd.read_parquet(self.hparams.train_file_path)
        df_val = pd.read_parquet(self.hparams.val_file_path)

        self.train_ds = DatasetClass(features=df_train['question'].to_numpy(), target=df_train.drop('question', axis=1).to_numpy())
        self.val_ds = DatasetClass(features=df_val['question'].to_numpy(), target=df_val.drop('question', axis=1).to_numpy())

        self.num_train_steps = int(len(self.train_ds) / self.hparams.batch_size * self.hparams.num_epochs)
    
    def train_dataloader(self):
        loader = torch.utils.data.DataLoader(self.train_ds, collate_fn=self.train_ds.collate_fn,
                                             batch_size=self.hparams.batch_size,
                                             num_workers=4, shuffle=True)
        return loader
     
    def val_dataloader(self):
        loader = torch.utils.data.DataLoader(self.val_ds, collate_fn=self.train_ds.collate_fn,
                                             batch_size=self.hparams.batch_size,
                                             num_workers=4)      
        return loader
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        
        preds = self(x)
        loss = self.loss_fn(preds, y)
        
        logs = {'train_loss': loss}
        
        return {'loss': logs['train_loss'], 'log': logs}
    
    def training_epoch_end(self, outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()

        logs = {'train_epoch_loss': avg_loss}
        return {'log': logs}
    
    def validation_step(self, batch, batch_nb):
        x, y = batch
        
        preds = self(x)
        loss = self.loss_fn(preds, y)
        
        logs = {'val_loss': loss}
        self.logger.experiment.log(logs)
        
        return {'val_loss': logs['val_loss'], 'labels': y, 'preds': preds}
    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

        label = torch.cat([x['labels'] for x in outputs], dim=0)
        preds = torch.cat([x['preds'] for x in outputs], dim=0)
        label = np.eye(self.hparams.num_classes, dtype=np.int)[label.cpu().numpy()]
        try:
          auc = torch.tensor(self.auc(label, self.soft(preds).cpu(), average='macro', multi_class='ovo'))
        except ValueError:
          auc = torch.tensor(0)

        logs = {'val_epoch_loss': avg_loss, 'val_epoch_auc': auc}
        return {'val_epoch_auc': logs['val_epoch_auc'], 'log': logs}
    
    def configure_optimizers(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_parameters, lr=self.hparams.lr)
        scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps=0, 
                                                    num_training_steps=self.num_train_steps)
        
        return [optimizer], [scheduler]

    def predict(self, texts, batch_size=32):
        ds = DatasetClass(features=texts, target=np.zeros(len(texts)))
        loader = torch.utils.data.DataLoader(ds, collate_fn=ds.collate_fn, batch_size=batch_size, num_workers=4)

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.to(device)
        preds = []
        with torch.no_grad():
            for batch in tqdm(loader, total=len(loader), desc=f"Predicting on {device}"):
                #move to proper device
                features = batch[0].to(device)

                preds.append(self.soft(self(features)))
            
            preds = torch.cat(preds, dim=0)
        return preds

In [55]:
workdir = ''
if workdir is not '':
    workdir += '/'
with open(workdir + 'data/label_weights.pkl', 'rb') as f:
    class_weights = pickle.load(f)
config = get_config(workdir, num_classes=len(class_weights), weigths=list(class_weights.values()))


In [56]:
config

Namespace(batch_size=16, dropout_prob=0.1, lr=3e-05, num_classes=79, num_epochs=5, options_file='models/ELMo/options.json', train_file_path='data/train.parquet', val_file_path='data/val.parquet', weight_file='models/ELMo/model.hdf5', weigths=tensor([0.0729, 0.0486, 0.0148, 0.4136, 0.0020, 0.2257, 0.1276, 0.1310, 0.0147,
        0.3990, 0.0216, 0.1125, 0.0012, 0.3062, 0.0403, 0.0272, 0.1616, 0.3930,
        0.0879, 0.0760, 0.0148, 0.3780, 0.0698, 0.0195, 0.0024, 0.1528, 0.0141,
        0.0601, 0.2734, 0.0551, 0.0408, 0.0119, 0.5603, 0.1197, 0.0234, 0.0055,
        0.0565, 0.0098, 0.1319, 0.6991, 0.0038, 0.0290, 0.3147, 0.7670, 0.0534,
        0.0963, 0.0435, 0.0139, 0.0038, 0.0533, 0.0140, 0.5338, 0.1626, 0.0047,
        0.0502, 0.5852, 0.4463, 0.0985, 0.0124, 0.4202, 0.3198, 0.6639, 0.2005,
        0.0618, 0.0603, 0.0251, 0.0730, 0.6220, 0.0916, 0.0727, 0.0028, 0.0755,
        0.0470, 0.2651, 0.0898, 0.0397, 0.0317, 0.0785, 0.5448]))

In [59]:
project_name='ODS_QA'
run_name='test'

In [61]:
import numpy as np
from tqdm import tqdm
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import lr_scheduler

import pytorch_lightning as pl
from allennlp.modules.elmo import Elmo, batch_to_ids
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

from sklearn.metrics import roc_auc_score

In [78]:
model = ModelClass(config)

wandb_logger = WandbLogger(project=project_name, name=run_name)
stop_patience = 2
early_stopping = EarlyStopping(monitor='val_epoch_auc',
                            min_delta=0,
                            patience=stop_patience,
                            verbose=True,
                            mode='max')
checkpoint_callback = ModelCheckpoint(filepath=wandb_logger.experiment.dir+'/'+run_name+'_{epoch}-{val_epoch_auc:.2f}', 
                                    save_top_k=1, verbose= True,
                                    monitor='val_epoch_auc', mode='max')
trainer = Trainer(gpus=1, early_stopping_callback=early_stopping, checkpoint_callback=checkpoint_callback, logger=wandb_logger, 
                    max_epochs=config.num_epochs, auto_scale_batch_size='binsearch')
trainer.fit(model)
for file in os.listdir(wandb_logger.experiment.dir):
    if file.endswith(".ckpt"):
        ckpt_path = os.path.join(wandb_logger.experiment.dir, file)
model.load_from_checkpoint(ckpt_path)
return model

FileNotFoundError: [Errno 2] No such file or directory: '/home/satellite/Documents/python_workdir/netology_learning/deep-nlp-spring-2020/ODS-QA/wandb/run-20200605_162214-2avyajhq/wandb-history.jsonl'

In [77]:
import gc
for obj in gc.get_objects():
    if torch.is_tensor(obj):
        del obj
gc.collect()
torch.cuda.empty_cache()