In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from tqdm.auto import tqdm
tqdm.pandas()
pd.set_option('display.max_columns', None)
from datasets import Dataset, ClassLabel
from transformers import AutoModelForSeq2SeqLM
import warnings
warnings.filterwarnings('ignore')

In [14]:
df = pd.read_parquet('../data/parquet/dataset.parquet')
df['text'] = df['text'].astype(str)
df['target'] = df['target'].astype(str)

labels = df['target'].unique().tolist()
lconv = ClassLabel(num_classes=len(labels), names=labels)
df['target'] = df['target'].astype(str)

train = df.sample(frac=0.90, random_state=42).reset_index(drop=True)
test = df.drop(train.index).reset_index(drop=True)
test = test.sample(1, random_state=42).reset_index(drop=True)

In [3]:
max_input_length = 512
max_target_length = 128

model_name = f'../models/t5'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to('cuda')
print(model)

In [15]:
ds_test = Dataset.from_pandas(test)
ds_test

Dataset({
    features: ['text', 'target'],
    num_rows: 50
})

In [17]:
def batchpredict(examples):
    inputs = [doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    beam_outputs = model.generate(
        input_ids=torch.LongTensor(model_inputs['input_ids']).cuda(),
        attention_mask=torch.LongTensor(model_inputs['attention_mask']).cuda(),
        max_length=128,
        early_stopping=True,
        num_beams=1,
        num_return_sequences=1,
    )
    sent = tokenizer.batch_decode(beam_outputs, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    return {'out':sent}

raw_pred = ds_test.map(batchpredict, batched=True, batch_size=16)
test['pred'] = raw_pred['out']
test.head(20)

  0%|          | 0/1 [00:00<?, ?ba/s]

Unnamed: 0,text,target,pred
0,City: Ferrara. Trips: (Time: 55164. Vehicle: b...,bike,bike
1,City: Ferrara. Trips: (Time: 54140. Vehicle: w...,bike,bike
2,City: Ferrara. Trips: (Time: 54471. Vehicle: b...,bike,bike
3,City: Ferrara. Trips: (Time: 25468. Vehicle: b...,walk,walk
4,City: Lecco. Trips: (Time: 909. Vehicle: walk)...,bike,walk
5,City: Lecco. Trips: (Time: 22767. Vehicle: wal...,train,walk
6,City: Ferrara. Trips: (Time: 28631. Vehicle: b...,bike,bike
7,"City: Lecco. Trips: (Time: 335. Vehicle: bus),...",walk,bus
8,City: Lecco. Trips: (Time: 22294. Vehicle: wal...,walk,bus
9,City: Ferrara. Trips: (Time: 41687. Vehicle: t...,bike,bike


In [18]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
y_true = raw_pred['target']
y_pred = raw_pred['out']
print(classification_report(y_true, y_pred))

cf = confusion_matrix(y_true, y_pred, labels=labels)
cf = pd.DataFrame(cf, index=labels, columns=labels)
cf

              precision    recall  f1-score   support

        bike       0.89      0.85      0.87        20
         bus       0.50      0.50      0.50         4
       train       0.80      0.67      0.73         6
        walk       0.73      0.80      0.76        20

    accuracy                           0.78        50
   macro avg       0.73      0.70      0.72        50
weighted avg       0.78      0.78      0.78        50



Unnamed: 0,walk,bike,bus,car,train
walk,16,2,2,0,0
bike,2,17,0,0,1
bus,2,0,2,0,0
car,0,0,0,0,0
train,2,0,0,0,4
