In [2]:
from tango import Workspace
workspace = Workspace.from_url("local:///root/workspace/read/temp/training")
model = workspace.step_result_for_run("seed_sent_selection", "train_sent_selection")

In [3]:
model

{'epoch': 2,
 'global_step': 7092,
 'pytorch-lightning_version': '1.7.7',
 'state_dict': OrderedDict([('model.model.shared.weight',
               tensor([[-0.0359,  0.1121,  0.1830,  ...,  0.2042,  0.0570, -0.0751],
                       [ 0.0055, -0.0049, -0.0069,  ..., -0.0030,  0.0038,  0.0087],
                       [-0.0452,  0.4625, -0.0606,  ...,  0.1067,  0.0310,  0.0482],
                       ...,
                       [-0.0138,  0.0278, -0.0467,  ...,  0.0455, -0.0265,  0.0125],
                       [-0.0043,  0.0153, -0.0567,  ...,  0.0496,  0.0108, -0.0099],
                       [ 0.0053,  0.0324, -0.0179,  ..., -0.0085,  0.0223, -0.0020]])),
              ('model.model.encoder.embed_tokens.weight',
               tensor([[-0.0359,  0.1121,  0.1830,  ...,  0.2042,  0.0570, -0.0751],
                       [ 0.0055, -0.0049, -0.0069,  ..., -0.0030,  0.0038,  0.0087],
                       [-0.0452,  0.4625, -0.0606,  ...,  0.1067,  0.0310,  0.0482],
              

In [3]:
import torch
import json
from tango import Step
from tango.common.dataset_dict import DatasetDict
import pandas as pd
from transformers import TapasTokenizer

class TableDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        ex_id = idx % 2
        idx = idx // 2
        item = self.df.iloc[idx]
        table = pd.DataFrame(json.loads(item["table"]))
        cells = zip(*item["highlighted_cells"])
        cells = [list(x) for x in cells]
        sub_table = table.iloc[cells[0], cells[1]].reset_index().astype(str)

        if ex_id == 0:
            encoding = self.tokenizer(
                table=sub_table,
                queries=item["positive"],
                padding="max_length",
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            encoding["labels"] = torch.tensor([1])
        else:
            encoding = self.tokenizer(
                table=sub_table,
                queries=item["negative"],
                padding="max_length",
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            encoding["labels"] = torch.tensor([0])

        encoding = {key: val[-1] for key, val in encoding.items()}
        return encoding

    def __len__(self):
        return len(self.df)

In [7]:
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base", max_question_length=256)
torch.manual_seed(1)
dev_df = pd.read_json("../temp/seed/sent_selection/data/dev.jsonl", lines=True)

dev_dataset = TableDataset(dev_df, tokenizer)

In [17]:
from torch.utils.data import DataLoader
import accelerate

import evaluate
name2metrics = {
    "accuracy": evaluate.load("accuracy"),
    "precision": evaluate.load("precision"),
    "recall": evaluate.load("recall"),
    "f1": evaluate.load("f1"),
}

dataloader = DataLoader(dev_dataset, batch_size=8, shuffle=False)
accelerator = accelerate.Accelerator()

print(type(model))

model, dataloader = accelerator.prepare(model, dataloader)

for batch in dataloader:
    y_hat = model(**batch)
    preds = y_hat.logits.argmax(dim=1)
    for metric in name2metrics.values():
        metric.add_batch(predictions=preds, references=batch["labels"])



<class 'transformers.models.tapas.modeling_tapas.TapasForSequenceClassification'>


In [18]:
for name, metric in name2metrics.items():
    print(name, metric.compute())

accuracy {'accuracy': 0.9653325817361894}
precision {'precision': 0.97524467472654}
recall {'recall': 0.9549041713641488}
f1 {'f1': 0.9649672457989177}
