In [1]:
import sys
sys.path.insert(0, "../src")

import re
import os
import datetime
import numpy as np
import torch as th
import pandas as pd
import pickle as pkl
from utils import log
from sklearn.metrics import roc_auc_score

def get_date(url):
    """Return the first ``YYYY/MM/DD``-shaped substring of ``url``, or ``None``.

    Only the digit layout is matched (four digits, slash, two digits, slash,
    two digits); the value is not checked to be a real calendar date.
    """
    match = re.search(r"\d\d\d\d\/\d\d\/\d\d", url)
    if match is None:
        return None
    return match.group(0)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the preprocessed bundle from disk. pickle can execute arbitrary code
# while loading, so this file must come from a trusted source.
with open("../data/prepared.pkl", "rb") as fp:
    prepared = pkl.load(fp)

# Unpack the entries of the bundle into notebook-level names.
vocabulary, texts, contexts = (
    prepared["vocabulary"],
    prepared["texts"],
    prepared["contexts"],
)
test_texts = prepared["test_texts"]

# Texts fed to the encoder and the targets later scored with ROC AUC.
text_train, y_train = prepared["texts_train"], prepared["y_train"]
text_test, y_test = prepared["texts_test"], prepared["y_test"]


# XLM-RoBERTa

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the pretrained XLM-RoBERTa tokenizer and masked-LM model, switch the
# model to inference mode, and place it on the GPU when one is available.
# Falling back to CPU keeps this cell runnable on machines without CUDA,
# where an unconditional `.cuda()` raises.
MODEL_NAME = "xlm-roberta-base"
device = th.device("cuda" if th.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME).eval().to(device)


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from itertools import islice

def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items drawn from ``iterable``.

    Example: ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G`` (as
    tuples); the final tuple may be shorter than ``n``.

    Raises:
        ValueError: if ``n`` is smaller than one (raised on first iteration,
            because this is a generator).
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # The source is exhausted; stop without emitting an empty tuple.
            return
        yield chunk


In [11]:
# Two hand-written inputs for a quick tokenizer check: one paragraph repeated
# ten times (a very long text) and one ordinary-length paragraph.
long_text = "Метод биологической очистки основан на способности некоторых видов микроорганизмов в определённых условиях использовать загрязняющие вещества в качестве своего питания. Множество микроорганизмов, составляющих активный ил биологического очистного сооружения, находясь в сточной жидкости, поглощает загрязняющие вещества внутрь клетки, где они под воздействием ферментов подвергаются биохимическим превращениям. При этом органические и некоторые виды неорганических загрязняющих веществ используются бактериальной клеткой в двух направлениях:" * 10
short_text = "Активный ил — биоценоз зоогенных скоплений (колоний) бактерий и простейших организмов, которые участвуют в очистке сточных вод. Применяется в биологической очистке сточных вод. Данный метод был изобретён в Великобритании в 1913 году. Биологическая очистка сточных вод осуществляется с целью удаления из них органических веществ, в том числе соединений азота и фосфора."
batch = [long_text, short_text]

In [12]:

# Tokenize the sample texts as one batch of PyTorch tensors: sequences longer
# than 512 tokens are truncated and shorter ones are padded to the longest
# sequence in the batch. The deprecated `pad_to_max_length` flag was dropped;
# `padding` is the supported way to request padding.
tokens = tokenizer(
    list(batch),
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)

In [13]:
# Inspect the shape of the padded id tensor for the tokenized sample batch.
tokens["input_ids"].shape

torch.Size([2, 512])

In [15]:
# Count the training texts by summing the sizes of all 16-item batches.
# A generator expression keeps the loop variable local, so the notebook-level
# sample `batch` defined in an earlier cell is not overwritten by this check.
n = sum(len(chunk) for chunk in batched(text_train, 16))

In [16]:
# Display the total number of training texts counted across all batches.
n

63356

In [None]:
# Embed every training text with the encoder and save the result.
#
# Texts are tokenized in batches of 16 (truncated to 512 tokens, padded to the
# longest text in the batch), run through the model without gradient tracking,
# and the last hidden layer is mean-pooled over the real (non-padding) tokens
# so that each text yields exactly one vector.
batch_size = 16
# Count batches without materializing them in a list.
n_batches = sum(1 for _ in batched(text_train, batch_size))
# Send inputs to whichever device the model already lives on.
device = next(model.parameters()).device
outputs = []
log(f"starting loop over {n_batches=}")
with th.no_grad():  # inference only: do not build autograd graphs
    for i, batch in enumerate(batched(text_train, batch_size)):
        tokens = tokenizer(
            list(batch),  # the tokenizer expects a list, not a tuple
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding=True,
        )
        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)
        out = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        last_hidden = out.hidden_states[-1]  # (batch, tokens, hidden)
        # Average over the token axis, ignoring padding positions. The previous
        # `.squeeze().mean(0)` averaged over the batch axis instead, mixing
        # different texts together.
        mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        out_batch = (last_hidden * mask).sum(1) / mask.sum(1).clamp(min=1)
        outputs.append(out_batch.cpu().numpy())
        if i % 100 == 0:
            log(f"{i=}")
output = np.vstack(outputs)  # one row per text
path = "../data/xmlr_embs"
log(f"saving data to \"{path}\"")
np.save(path, output)  # numpy appends ".npy" to the file name
log("END")


In [None]:
# Embed every test text with the encoder and save the result.
#
# Texts are tokenized in batches of 16 (truncated to 512 tokens, padded to the
# longest text in the batch), run through the model without gradient tracking,
# and the last hidden layer is mean-pooled over the real (non-padding) tokens
# so that each text yields exactly one vector.
batch_size = 16
# Count batches without materializing them in a list.
n_batches = sum(1 for _ in batched(text_test, batch_size))
# Send inputs to whichever device the model already lives on.
device = next(model.parameters()).device
outputs = []
log(f"starting loop over {n_batches=}")
with th.no_grad():  # inference only: do not build autograd graphs
    for i, batch in enumerate(batched(text_test, batch_size)):
        tokens = tokenizer(
            list(batch),
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding=True,
        )
        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)
        out = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        last_hidden = out.hidden_states[-1]  # (batch, tokens, hidden)
        # Average over the token axis, ignoring padding positions. The previous
        # `.squeeze().mean(0)` averaged over the batch axis instead, mixing
        # different texts together.
        mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        out_batch = (last_hidden * mask).sum(1) / mask.sum(1).clamp(min=1)
        outputs.append(out_batch.cpu().numpy())
        if i % 100 == 0:
            log(f"{i=}")
output = np.vstack(outputs)  # one row per text
path = "../data/xmlr_embs_test"
log(f"saving data to \"{path}\"")
np.save(path, output)  # numpy appends ".npy" to the file name
log("END")


In [None]:
# NOTE(review): the original assignment was left unfinished (a SyntaxError).
# Presumably the intent was to reload the training embeddings written by the
# embedding cell; np.save stores them under the given path plus ".npy".
# Confirm this is the intended source before relying on it.
x_train = np.load("../data/xmlr_embs.npy")

In [None]:
# Drop leftover loop variables so the tensors they reference can be freed.
# A plain `del` raises NameError when any of these names is already gone
# (e.g. after a loop that deletes them itself), so remove each one only if it
# is present.
for stale_name in ("batch", "tokens", "input_ids", "attention_mask", "out", "out_batch"):
    globals().pop(stale_name, None)

In [None]:
# Shape of `output`, the array produced by stacking the per-batch results.
output.shape

In [None]:
# Pick one training text (position 31339) to run through the tokenizer on its own.
txt = text_train[31339]

In [None]:
# Tokenize the single text as a batch of one. Truncation to 512 tokens is
# applied here as well, matching the cap every other tokenizer call in this
# notebook uses, so an overly long text cannot produce an oversized input.
tok = tokenizer(txt, return_tensors='pt', max_length=512, truncation=True)

In [None]:
# Run the single tokenized text (`tok`) through the model.
# Fixes over the previous version: the tensors are read from `tok` rather
# than the stale batch-level `tokens`, the attention mask comes from
# "attention_mask" (not "input_ids"), and the device-resident tensors are what
# is actually passed to the model.
device = next(model.parameters()).device  # keep inputs on the model's device
input_ids = tok["input_ids"].to(device)
attention_mask = tok["attention_mask"].to(device)
with th.no_grad():  # inference only
    o = model(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
# Build the test feature matrix one text at a time: each row is the mean of
# the last hidden layer over the tokens of a single (unpadded) text.
device = next(model.parameters()).device  # keep inputs on the model's device
# Row width follows the model's hidden size (768 for xlm-roberta-base).
X_test = np.zeros((len(text_test), model.config.hidden_size))
with th.no_grad():  # inference only: do not build autograd graphs
    for i, text in enumerate(text_test):
        # Tokenize the current `text` (previously the stale `batch` variable
        # was tokenized on every iteration). `truncation=True` is required
        # for `max_length` to take effect.
        tokens = tokenizer(
            text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
        ).to(device)
        out = model(**tokens, output_hidden_states=True)
        # (1, tokens, hidden) -> (hidden,); move to CPU before converting,
        # since `.numpy()` does not work on CUDA tensors.
        X_test[i, :] = out.hidden_states[-1].mean(1).squeeze(0).cpu().numpy()
        if i % 100 == 0:
            log(f"{i=}")


In [None]:
# Shape of the last hidden-state tensor returned by the most recent model call.
out.hidden_states[-1].shape

In [2]:
# Training-set ROC AUC (one-vs-one multiclass) from the estimator's predicted
# class probabilities.
# NOTE(review): neither `est` nor `X_train` is defined in any visible cell,
# and the recorded run of this cell failed with NameError. A fitted classifier
# and the training feature matrix must be created before this can run.
p_train = est.predict_proba(X_train)
roc_auc_score(y_true=y_train, y_score=p_train, multi_class="ovo")

NameError: name 'est' is not defined

In [None]:
# Test-set ROC AUC (one-vs-one multiclass) from the estimator's predicted
# class probabilities.
# NOTE(review): `est` is not defined in any visible cell; a fitted classifier
# must be created before this can run.
p_test = est.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=p_test, multi_class="ovo")