In [1]:
import random
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel, Dataset
from tqdm.auto import tqdm
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

# Hook tqdm into pandas (enables the progress_* DataFrame/Series methods).
tqdm.pandas()
# Never elide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# Load the full labelled dataset; the cells below read its 'text' and 'target' columns.
dataset = pd.read_parquet("../data/parquet/dataset_bert.parquet")

# Label vocabulary, in order of first appearance in the file.
# NOTE(review): the index <-> name mapping must match the one the checkpoint was
# trained with — presumably built the same way; confirm against the training code.
labels = dataset['target'].unique().tolist()
lconv = ClassLabel(num_classes=len(labels), names=labels)

# 90/10 split. The hold-out rows are selected using the ORIGINAL index labels of
# the sampled rows. Resetting the train index before the drop would instead remove
# rows 0..len(train)-1, producing a test set that overlaps the training sample.
train_rows = dataset.sample(frac=0.90, random_state=42)
test = dataset.drop(train_rows.index).reset_index(drop=True)
train = train_rows.reset_index(drop=True)
assert len(train) + len(test) == len(dataset), "train/test split does not partition the dataset"

In [3]:
# Directory the tokenizer, config and model weights are loaded from.
model_name = '../models/BERT/'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tell the config how many classes the label vocabulary contains.
config = AutoConfig.from_pretrained(model_name, num_labels=len(labels))
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
# Use the GPU when one is present; otherwise stay on CPU instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
# Keep a small, reproducible subset of the hold-out rows for inspection.
# The seed makes the displayed rows stable across runs; the min() guard avoids a
# ValueError from sample() when fewer than N_PREVIEW rows are available.
N_PREVIEW = 20
test = test.sample(n=min(N_PREVIEW, len(test)), random_state=42).reset_index(drop=True)
ds_test = Dataset.from_pandas(test)
ds_test

Dataset({
    features: ['text', 'target'],
    num_rows: 20
})

In [5]:
def batchpredict(examples):
    """Predict a class index for every document in a batch.

    Written to be passed to ``Dataset.map(..., batched=True)``; it relies on the
    notebook-level ``tokenizer`` and ``model``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the batch's documents.

    Returns:
        dict with key ``'out'``: a 1-D integer array holding one predicted class
        index per input document, in input order.
    """
    texts = list(examples["text"])
    if not texts:
        # Nothing to tokenize; keep the "one prediction per row" contract.
        return {'out': np.empty(0, dtype=np.int64)}

    encoded = tokenizer(
        texts, max_length=512, truncation=True, padding=True, return_tensors='pt'
    )
    # Follow whatever device the model lives on instead of assuming CUDA.
    device = model.device
    # Forward pass through BERT model
    with torch.no_grad():
        outputs = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
    # argmax over the class axis only: the batch axis is preserved even when the
    # batch holds a single document (a squeeze() here would collapse it to a scalar).
    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
    return {'out': predictions}
    
# Classify the sampled rows; map() hands batchpredict up to 512 rows per call.
raw_pred = ds_test.map(batchpredict, batched=True, batch_size=512)
# Attach the predictions to the frame as label names rather than class indices.
predicted_names = [lconv.int2str(class_id) for class_id in raw_pred['out']]
test['out'] = predicted_names
display(test)

  0%|          | 0/1 [00:00<?, ?ba/s]

Unnamed: 0,text,target,out
0,"This user lives in Lecco. Trips: walk, train, ...",walk,walk
1,"This user lives in Lecco. Trips: bus, walk, wa...",bus,walk
2,"This user lives in Lecco. Trips: walk, bus, bu...",walk,walk
3,"This user lives in Lecco. Trips: walk, walk, b...",walk,walk
4,"This user lives in Ferrara. Trips: train, bike...",train,walk
5,"This user lives in Ferrara. Trips: bike, bike,...",bike,walk
6,"This user lives in Lecco. Trips: train, train,...",train,walk
7,"This user lives in Ferrara. Trips: walk, bike,...",walk,walk
8,This user lives in Ferrara. Trips: walk.,walk,walk
9,"This user lives in Ferrara. Trips: bike, bike,...",bike,walk
