In [1]:
import sys
sys.path.insert(0, "../src")

import datetime
import os
import pickle as pkl
import re

import numpy as np
import pandas as pd
import torch as th

from utils import log

def get_date(url):
    """Return the first ``dddd/dd/dd`` digit group found in ``url``.

    Returns:
        The matched substring (e.g. ``"2021/03/04"``), or ``None`` when the
        URL contains no such pattern.
    """
    match = re.search(r"\d\d\d\d\/\d\d\/\d\d", url)
    if match is None:
        return None
    return match.group(0)


In [2]:
# Load the preprocessed dataset bundle and unpack the entries used below.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a prepared.pkl that was produced by a trusted pipeline.
with open("../data/prepared.pkl", "rb") as fp:
    prepared = pkl.load(fp)
vocabulary = prepared["vocabulary"]  # exposes get_index(token), used by the embedding helpers below
texts = prepared["texts"]
contexts = prepared["contexts"]
test_texts = prepared["test_texts"]
y_train = prepared["y_train"]
y_test = prepared["y_test"]


In [3]:
# Train / test texts used to build the feature matrices further down.
# NOTE(review): these keys ("texts_train" / "texts_test") differ from the
# "texts" / "test_texts" keys read in the loading cell — confirm that both
# pairs exist in prepared.pkl and which one is the intended input.
text_train = prepared["texts_train"]
text_test = prepared["texts_test"]


# CBoW

In [None]:
import torch as th
from types import NoneType
from razdel import tokenize
from typing import Union, Mapping, Any
from pytorch_lightning import LightningModule


class CBOWModel(LightningModule):
    """Continuous bag-of-words model trained with a full-softmax objective.

    Context token indices are embedded, summed over the context axis and
    projected to vocabulary-sized logits; the loss is the cross-entropy
    between those logits and the central token index.

    Args:
        vocab_size: number of rows in the embedding table / output classes.
        embedding_dim: width of each embedding vector.

    Note:
        ``forward`` returns the loss, not the logits, so every ``*_step`` can
        simply call ``self(*batch)``.
        The constructor arguments are not stored in the checkpoint (there is
        no ``save_hyperparameters()`` call), so ``load_from_checkpoint``
        rebuilds the model with the defaults above unless they are passed
        explicitly.
    """

    def __init__(self, vocab_size=71186, embedding_dim=128):
        super().__init__()
        self.embeddings = th.nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = th.nn.Linear(embedding_dim, vocab_size)
        self.loss = th.nn.CrossEntropyLoss()
        # Per-batch step outputs, buffered by the on_*_batch_end hooks and
        # consumed (then emptied) by the matching on_*_epoch_end hook.
        self.train_outputs = []
        self.val_outputs = []
        self.test_outputs = []

    def forward(self, centrals, contexts):
        """Return the cross-entropy loss of predicting ``centrals`` from ``contexts``.

        Args:
            centrals: central-token indices, one per sample
                (presumably shaped (batch,) or (batch, 1) — confirm against the dataloader).
            contexts: context-token indices; axis 1 is summed over after embedding.
        """
        # Call the modules rather than .forward() so registered hooks still run.
        projections = self.embeddings(contexts).sum(dim=1)
        logits = self.out_layer(projections)
        # reshape(-1) keeps a 1-d target even for a batch of one sample,
        # where squeeze() would collapse it to a 0-d tensor.
        return self.loss(logits, centrals.reshape(-1))

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss for one batch."""
        result = self(*batch)
        self.log("loss", result)
        return {'loss': result}

    def validation_step(self, batch, batch_nb):
        """Compute and log the validation loss for one batch."""
        result = self(*batch)
        self.log("val_loss", result)
        return {'val_loss': result}

    def test_step(self, batch, batch_nb):
        """Compute and log the test loss for one batch."""
        result = self(*batch)
        self.log("test_loss", result)
        # Reuse the value computed above instead of running a second forward pass.
        return {'test_loss': result}

    @staticmethod
    def _epoch_mean(outputs, key):
        """Average ``outputs[i][key]`` over the buffer, then empty the buffer.

        Returns ``None`` when nothing was buffered, so callers can skip logging.
        """
        if not outputs:
            return None
        avg = th.stack([x[key].detach() for x in outputs]).mean()
        # Without clearing, the next epoch's average would include this
        # epoch's batches and the list would grow for the whole run.
        outputs.clear()
        return avg

    def on_train_batch_end(
        self,
        outputs: Union[th.Tensor, Mapping[str, Any], NoneType],
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Buffer the training step output for the epoch-level average."""
        self.train_outputs.append(outputs)

    def on_train_epoch_end(self):
        """Log the mean training loss of the epoch as ``train_loss_epoch``."""
        avg_loss = self._epoch_mean(self.train_outputs, 'loss')
        if avg_loss is None:
            return None
        tensorboard_logs = {'loss': avg_loss}
        self.log("train_loss_epoch", avg_loss, on_step=False, on_epoch=True)
        return {'train_loss_epoch': avg_loss, 'progress_bar': tensorboard_logs}

    def on_validation_batch_end(
        self,
        outputs: Union[th.Tensor, Mapping[str, Any], NoneType],
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Buffer the validation step output for the epoch-level average."""
        self.val_outputs.append(outputs)

    def on_validation_epoch_end(self):
        """Log the mean validation loss of the epoch as ``val_loss_epoch``."""
        avg_loss = self._epoch_mean(self.val_outputs, 'val_loss')
        if avg_loss is None:
            return None
        tensorboard_logs = {'val_loss': avg_loss}
        self.log("val_loss_epoch", avg_loss, on_step=False, on_epoch=True)
        return {'val_loss_epoch': avg_loss, 'progress_bar': tensorboard_logs}

    def on_test_batch_end(
        self,
        outputs: Union[th.Tensor, Mapping[str, Any], NoneType],
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Buffer the test step output for the epoch-level average."""
        self.test_outputs.append(outputs)

    def on_test_epoch_end(self):
        """Log the mean test loss of the epoch as ``test_loss_epoch``."""
        avg_loss = self._epoch_mean(self.test_outputs, 'test_loss')
        if avg_loss is None:
            return None
        tensorboard_logs = {'test_loss': avg_loss}
        self.log("test_loss_epoch", avg_loss, on_step=False, on_epoch=True)
        return {'test_loss_epoch': avg_loss, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 1e-4."""
        optimizer = th.optim.Adam(self.parameters(), lr=1e-4)
        return [optimizer]


In [None]:
def get_emb_by_text(embeddings, vocabulary, phrase):
    """Embed a raw phrase as the mean of its token embeddings.

    The phrase is split with ``tokenize``; each token's text is lower-cased,
    mapped to a row index via ``vocabulary.get_index`` and looked up in
    ``embeddings``. The rows are averaged along axis 0.
    """
    rows = []
    for token in tokenize(phrase):
        row_index = vocabulary.get_index(token.text.lower())
        rows.append(embeddings[row_index])
    return np.mean(np.array(rows), axis=0)


def get_emb_by_tokens(embeddings, vocabulary, tokens):
    """Embed an already-tokenised text as the mean of its token embeddings.

    Each token is mapped to a row index via ``vocabulary.get_index`` (no
    lower-casing is applied here) and the selected rows of ``embeddings``
    are averaged along axis 0.
    """
    rows = [embeddings[vocabulary.get_index(token), :] for token in tokens]
    stacked = np.array(rows)
    return np.mean(stacked, axis=0)


In [None]:
# Restore the trained CBoW model and pull its embedding table out as a NumPy
# array (one row per vocabulary index, one column per embedding dimension).
# CBOWModel.__init__ does not save its hyperparameters, so the checkpoint is
# loaded with the default vocab_size / embedding_dim.
# NOTE(review): the file name records val_loss=-98685.14, which a cross-entropy
# loss cannot produce — presumably this checkpoint comes from a different
# objective; confirm it matches the CBOWModel defined above.
PATH_MODEL = "ckpt/w06-epoch=9-val_loss=-98685.14.ckpt"
model = CBOWModel.load_from_checkpoint(PATH_MODEL)
embeddings = model.embeddings.weight.cpu().data.numpy()


In [None]:
# Build one mean-embedding feature vector per document for train and test.
# NOTE(review): the training rows go through get_emb_by_tokens (iterates the
# text as ready-made tokens, no lower-casing) while the test rows go through
# get_emb_by_text (tokenises a raw string and lower-cases). Confirm that
# text_train and text_test really are stored in different formats; otherwise
# train and test features are computed inconsistently.
X_train = np.zeros((len(text_train), embeddings.shape[1]))
for i, text in enumerate(text_train):
    X_train[i, :] = get_emb_by_tokens(embeddings, vocabulary, text)

X_test = np.zeros((len(text_test), embeddings.shape[1]))
for i, text in enumerate(text_test):
    X_test[i, :] = get_emb_by_text(embeddings, vocabulary, text)

# Sanity check: the number of feature rows should match the number of labels.
print(X_train.shape)
print(y_train.shape)


In [None]:
# Show the shape of the training labels (same check as the last line of the cell above).
print(y_train.shape)

In [None]:
import lightgbm as lgb

In [None]:
# Gradient-boosted classifier on top of the document embeddings: 500 depth-2
# trees, each leaf required to hold at least 1000 samples.
est = lgb.LGBMClassifier(
    n_estimators=500,
    subsample=0.6,
    # LightGBM applies row subsampling only when subsample_freq > 0; with the
    # default of 0 the subsample=0.6 setting above is silently ignored.
    subsample_freq=1,
    max_depth=2,
    min_child_samples=1000,
)

In [None]:
# Fit the classifier on whatever features X_train currently holds.
est.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC on the training set from predicted class probabilities, averaged
# one-vs-one ("ovo"). This is scored on the data the model was fitted on, so
# it is an optimistic figure; compare it with the test score below.
p_train = est.predict_proba(X_train)
roc_auc_score(y_true=y_train, y_score=p_train, multi_class="ovo")

In [None]:
# ROC AUC on the held-out test set, same one-vs-one ("ovo") averaging as for train.
p_test = est.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=p_test, multi_class="ovo")

# CBoW + NS

# XLM-RoBERTa

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Pretrained XLM-R base tokenizer and masked-LM model; .eval() switches the
# model to evaluation mode for feature extraction.
# NOTE: this rebinds `model`, replacing the CBOWModel loaded earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base").eval()

# prepare input
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Embed every training text with XLM-R, one text per forward pass: the feature
# vector is the mean of the last hidden layer over the token axis.
x_list = []  # not filled anywhere in this cell; kept in case other cells reference it

# Allocate a matrix of the transformer's hidden width. The X_train built for
# CBoW has the CBoW embedding width, so writing XLM-R vectors into it fails
# with a shape mismatch.
X_train = np.zeros((len(text_train), model.config.hidden_size))

# Inference only: skip autograd bookkeeping to save memory and time.
with th.no_grad():
    for i, text in enumerate(text_train):
        # max_length has no effect unless truncation is enabled; over-long
        # inputs would otherwise exceed the model's position limit.
        tokens = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
        out = model(**tokens, output_hidden_states=True)
        # Last hidden layer is (1, seq_len, hidden): average over tokens, drop the batch axis.
        X_train[i, :] = out.hidden_states[-1].mean(dim=1).squeeze(0).numpy()
        if i % 100 == 0:
            log(f"{i=}")



In [None]:
# from itertools import batched

In [5]:
from itertools import islice

def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items from ``iterable``.

    Stand-in for ``itertools.batched`` (see the commented-out import above);
    the final tuple may be shorter than ``n``.

    Example: ``batched('ABCDEFG', 3)`` → ``ABC DEF G``

    Raises:
        ValueError: if ``n`` is less than one. Because this is a generator,
            the error surfaces on the first iteration, not at call time.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            return
        yield chunk


In [8]:
# Split the training texts into chunks of 128. batched() is a generator, so
# `batches` can be iterated only once and must be re-created before reuse.
batches = batched(text_train, 128)

In [9]:
# Count the batches by draining the generator (it is exhausted afterwards).
# The previous comprehension `[_ for b in batches]` collected `_`, which is
# not the loop variable: it only resolved because IPython defines `_` as the
# last cell output, and it raises NameError in plain Python.
n_batches = sum(1 for _ in batches)
n_batches


495

In [None]:
# Embed the training texts with XLM-R in batches of 128. Each entry appended
# to `outputs` is a (batch_size, hidden_size) array with one mean-pooled
# vector per text; np.concatenate(outputs, axis=0) gives the full matrix.
outputs = []
batches = batched(text_train, 128)

# Inference only: skip autograd bookkeeping to save memory and time.
with th.no_grad():
    for i, batch in enumerate(batches):
        tokens = tokenizer(
            list(batch),
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding=True
        )
        out = model(**tokens, output_hidden_states=True)
        hidden = out.hidden_states[-1]  # (batch, seq_len, hidden)
        # Pool over the token axis (dim 1), not the batch axis, and weight by
        # the attention mask so padding positions do not dilute the mean.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        out_batch = ((hidden * mask).sum(dim=1) / token_counts).numpy()
        outputs.append(out_batch)
        # Release the per-batch tensors before the next forward pass.
        del batch, tokens, out, hidden, mask, token_counts, out_batch
        # Logged on every batch (the `i % 100` throttle is intentionally off).
        log(f"{i=}")


[2024-04-07 02:32:38.540252][jupyter] i=0
[2024-04-07 02:33:10.643498][jupyter] i=1
[2024-04-07 02:33:41.743247][jupyter] i=2
[2024-04-07 02:34:12.703552][jupyter] i=3
[2024-04-07 02:34:43.927627][jupyter] i=4
[2024-04-07 02:35:14.883332][jupyter] i=5
[2024-04-07 02:35:45.899380][jupyter] i=6
[2024-04-07 02:36:17.016473][jupyter] i=7
[2024-04-07 02:36:47.931583][jupyter] i=8
[2024-04-07 02:37:18.934758][jupyter] i=9
[2024-04-07 02:37:50.027331][jupyter] i=10
[2024-04-07 02:38:22.188971][jupyter] i=11
[2024-04-07 02:38:53.372067][jupyter] i=12


[2024-04-07 02:06:20.448628][jupyter] i=0
[2024-04-07 02:06:35.924847][jupyter] i=1
[2024-04-07 02:06:51.723109][jupyter] i=2
[2024-04-07 02:07:07.197572][jupyter] i=3
[2024-04-07 02:07:22.617140][jupyter] i=4
[2024-04-07 02:07:38.066255][jupyter] i=5
[2024-04-07 02:07:53.547985][jupyter] i=6
[2024-04-07 02:08:08.893030][jupyter] i=7
[2024-04-07 02:08:24.327267][jupyter] i=8
[2024-04-07 02:08:39.694576][jupyter] i=9
[2024-04-07 02:08:55.101007][jupyter] i=10
[2024-04-07 02:09:10.433617][jupyter] i=11
[2024-04-07 02:09:25.801040][jupyter] i=12
[2024-04-07 02:09:41.433941][jupyter] i=13
[2024-04-07 02:09:56.692024][jupyter] i=14
[2024-04-07 02:10:11.953853][jupyter] i=15
[2024-04-07 02:10:27.398088][jupyter] i=16
[2024-04-07 02:10:42.791378][jupyter] i=17
[2024-04-07 02:10:58.184608][jupyter] i=18
[2024-04-07 02:11:13.675452][jupyter] i=19
[2024-04-07 02:11:28.993994][jupyter] i=20

In [12]:
# del batch, tokens, out, out_batch 

NameError: name 'batch' is not defined

In [None]:
# Embed every test text with XLM-R, one text per forward pass: the feature
# vector is the mean of the last hidden layer over the token axis.
X_test = np.zeros((len(text_test), model.config.hidden_size))

# Inference only: skip autograd bookkeeping to save memory and time.
with th.no_grad():
    for i, text in enumerate(text_test):
        # Tokenize the current `text`. The previous version passed `batch`, a
        # leftover name from the batched training loop, so every row was
        # either a NameError or the embedding of unrelated training texts.
        # truncation=True is required for max_length to take effect.
        tokens = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
        out = model(**tokens, output_hidden_states=True)
        # Last hidden layer is (1, seq_len, hidden): average over tokens, drop the batch axis.
        X_test[i, :] = out.hidden_states[-1].mean(1).squeeze(0).numpy()
        if i % 100 == 0:
            log(f"{i=}")


In [None]:
# Debugging aid: shape of the last hidden layer from the most recent forward pass.
out.hidden_states[-1].shape

In [None]:
# `est` was last fitted in the CBoW section. Refit it on the features that
# X_train holds now, so the scores in this section describe a classifier
# trained on these features rather than one fitted on a different feature space.
est.fit(X_train, y_train)

# ROC AUC on the training set (one-vs-one averaging); optimistic by
# construction since it is scored on the fitting data.
p_train = est.predict_proba(X_train)
roc_auc_score(y_true=y_train, y_score=p_train, multi_class="ovo")

In [None]:
# ROC AUC on the held-out test set (one-vs-one averaging).
# NOTE(review): X_test here holds the XLM-R features built above; `est` must
# have been fitted on features of the same kind and width, otherwise
# predict_proba fails or the score is meaningless — confirm before trusting it.
p_test = est.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=p_test, multi_class="ovo")