In [24]:
import os
import json
import torch
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from itertools import islice
from transformers import AutoTokenizer
from transformers import AutoModel
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score, 
    accuracy_score
)


PATH_TRAIN = "../data/train.jsonl"
PATH_TEST = "../data/dev.jsonl"


def log(msg:str, headers=None):
    dttm = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    if (
        ("__main__" in globals() or "__main__" in locals())
        and hasattr(__main__, "__file__")
    ):
        script = os.path.basename(__main__.__file__)
    else:
        script = "jupyter"
    if headers is None:
        headers = []
    header_line = f"[{dttm}][{script}]" + "".join(f"[{h}]" for h in headers)
    print(f"{header_line} {msg}")


def batched(iterable, n):
    # batched('ABCDEFG', 3) → ABC DEF G
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


def read_data(path):
    records = []
    with open(path) as fp:
        for line in fp:
            record = json.loads(line.strip())
            records.append(record)
    df = pd.DataFrame(records)
    return df


def prepare(df, batch_size=128):
    df["tp"] = df.question + ". " + df.passage
    texts = df.tp.tolist()
    batches = batched(texts, batch_size)
    return batches
    

def make_embs(batches):
    b_vecs = []
    for idx, batch in enumerate(batches):
        if idx%5==0:
            log(f"{idx} batches complete", ["make_embs"])
        tokens = tokenizer(
            batch, 
            return_tensors="pt", 
            padding=True, 
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            vecs = (
                model(**tokens)
                .last_hidden_state
                .mean(1)
                .detach()
                .numpy()
            )
        b_vecs.append(vecs)
    return np.vstack(b_vecs)
    

def get_train_embs():
    return get_embs("train.npy", PATH_TRAIN)


def get_test_embs():
    return get_embs("test.npy", PATH_TEST)


def get_embs(filename, path_in, cache_dir="./"):
    fpath = os.path.join(cache_dir, filename)
    if os.path.exists(fpath):
        with open(fpath, 'rb') as fp:
            embs = np.load(fp)
    else:
        batches = prepare(read_data(path_in))  # 590
        embs = make_embs(batches)
        with open(fpath, 'wb') as fp:
            np.save(fp, embs)
    return embs


def get_scores(y_true, y_score, thr=0.5):
    y_pred = (thr<y_score).astype(int)
    auc = roc_auc_score(y_score=y_score, y_true=y_true)
    f1 = f1_score(y_true=y_true, y_pred=y_pred)
    prec = precision_score(y_true=y_true, y_pred=y_pred)
    rec = recall_score(y_true=y_true, y_pred=y_pred)
    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    return {
        "auroc": auc,
        "f1": f1,
        "precision": prec,
        "recall": rec,
        "accuracy": acc
    }


In [2]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased").eval()


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# считаем ruBert-эмбеддинги или загружаем из кэша
x_train = get_train_embs()
x_test = get_test_embs()
d_train = read_data(PATH_TRAIN)
d_test = read_data(PATH_TEST)
y_train = d_train.answer.astype(int)
y_test = d_test.answer.astype(int)


In [20]:
params = dict(min_child_samples=8) # 0.7706774951912803

est = lgb.LGBMClassifier(**params)
est.fit(x_train, d_train.answer.values)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Number of positive: 5874, number of negative: 3553
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 9427, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623104 -> initscore=0.502744
[LightGBM] [Info] Start training from score 0.502744


In [25]:
get_scores(
    y_true=d_train.answer.values,
    y_score=est.predict_proba(x_train)[:,1]
)


{'auroc': 0.9976606494140341,
 'f1': 0.966922378949105,
 'precision': 0.9379100656104977,
 'recall': 0.9977868573374191,
 'accuracy': 0.9574626074042644}

In [26]:
get_scores(
    y_true=d_test.answer.values,
    y_score=est.predict_proba(x_test)[:,1]
)


{'auroc': 0.669559980610946,
 'f1': 0.7656582871751171,
 'precision': 0.6753100338218715,
 'recall': 0.8839153959665519,
 'accuracy': 0.6636085626911316}

In [10]:
x_train.shape

(9427, 768)

In [11]:
x_test.shape

(3270, 768)

In [None]:
b_train = prepare(read_data(PATH_TRAIN))  # 590
b_test = prepare(read_data(PATH_TEST))  # 205
len([b for b in b_train]), len([b for b in b_test])
# (74, 26)


In [None]:
# embs_train = make_embs(b_train)
# with open('train.npy', 'wb') as fp:
#     np.save(fp, embs_train)

In [None]:
# embs_test = make_embs(b_test)
# with open('test.npy', 'wb') as fp:
#     np.save(fp, embs_test)

In [None]:
x_train = get_train_embs()

In [None]:
590*16

In [None]:
x_train.shape

In [None]:
np.vstack(
    [
        np.array([[1,2],[3,4]]),
        np.array([[1,2],[3,4]])
    ]
).sa

In [None]:
tokens_train["input_ids"].shape

In [None]:
res.last_hidden_state.mean(1).detach().numpy()

In [None]:
tokens_train = tokenizer(batch, return_tensors="pt", padding=True)
res = model(**tokens_train)

# res.last_hidden_state.shape

In [None]:
d_train = read_data(PATH_TRAIN)
d_test = read_data(PATH_TEST)






d_train["tp"] = d_train.question + ". " + d_train.passage
d_test["tp"] = d_test.question + ". " + d_test.passage
# tokens_train = d_train.tp.apply(lambda line: [x for x in gensim.utils.tokenize(line)]).tolist()
# tokens_test = d_test.tp.apply(lambda line: [x for x in gensim.utils.tokenize(line)]).tolist()


In [None]:
texts_train = d_train.tp.tolist()
texts_test = d_test.tp.tolist()


In [None]:
b_train = batched(texts_train, 16)
b_test = batched(texts_test, 16)

In [None]:
for batch in b_train:
    break

In [None]:
tokens_train = tokenizer(batch, return_tensors="pt", padding=True)

In [None]:
tokens_train

In [None]:
tokens_test = tokenizer(d_test.tp.tolist(), return_tensors="pt", padding=True)

In [None]:
tokens_train[0]

In [None]:
model(**tokens_train)