In [1]:
import sys
sys.path.insert(0, "../src")

import re
import os
import datetime
import numpy as np
import torch as th
import pandas as pd
import pickle as pkl
from utils import log
from itertools import islice
from transformers import AutoTokenizer, AutoModelForMaskedLM


def get_date(url):
    dates = re.findall(r"\d\d\d\d\/\d\d\/\d\d", url)
    return next(iter(dates), None)


def batched(iterable, n):
    # batched('ABCDEFG', 3) → ABC DEF G
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
log("BEGIN")
log("loading prepared data...")
with open("../data/prepared.pkl", "rb") as fp:
    prepared = pkl.load(fp)
vocabulary = prepared["vocabulary"]
texts = prepared["texts"]
contexts = prepared["contexts"]
test_texts = prepared["test_texts"]
y_train = prepared["y_train"]
y_test = prepared["y_test"]
text_train = prepared["texts_train"]
text_test = prepared["texts_test"]

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base").eval().cuda()
batches = batched(text_train, 16)
n_batches = len([b for b in batches])


[2024-04-07 15:05:58.911348][jupyter] BEGIN
[2024-04-07 15:05:58.911467][jupyter] loading prepared data...


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
outputs = []
batches = batched(text_train, 16)
log(f"starting loop over {n_batches=}", ["train"])
for i, batch in enumerate(batches):
    tokens = tokenizer(
        batch,
        return_tensors='pt', 
        max_length=512, 
        truncation=True,
        padding=True,
        pad_to_max_length=True
    )
    input_ids=tokens["input_ids"].cuda()
    attention_mask=tokens["attention_mask"].cuda()
    log(
        f"{input_ids.shape=} {attention_mask.shape=}", 
        ["train", f"{i}/{n_batches}"]
    )
    out = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=True
    )
    out_batch = out.hidden_states[-1].mean(1).detach().cpu().numpy()
    outputs.append(out_batch)
    break
    # del batch, tokens,input_ids, attention_mask, out, out_batch
    # log(f"{i=}", ["train", f"{i}/{n_batches}"])
    # if 3 < i:
    #     break
output = np.vstack(outputs)

[2024-04-07 15:09:15.326316][jupyter][train] starting loop over n_batches=3960
[2024-04-07 15:09:15.332760][jupyter][train][0/3960] input_ids.shape=torch.Size([16, 426]) attention_mask.shape=torch.Size([16, 426])


In [10]:
out.hidden_states[-1].shape

torch.Size([16, 426, 768])

In [7]:
output.shape

(426, 768)

In [5]:
[x.shape for x in outputs]

[(426, 768), (437, 768), (512, 768), (512, 768), (512, 768)]

In [None]:
out.