# Test a RoBERTa model on a hold-out dataset

In [1]:
import sys

# Add the parent directory to the import search path. Presumably this is what
# lets the `adna` package imported below resolve when the notebook is run from
# its own directory -- confirm against the repository layout.
sys.path.append("..")

In [2]:
import sqlite3
from pathlib import Path

import pandas as pd
from sklearn import metrics
from torch.utils.data import Dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    TextClassificationPipeline,
    pipeline,
)
from tqdm import tqdm

from adna.pylib import consts
from adna.pylib.datasets import ADnaDataset

## Build the tokenizer

In [3]:
# Load the fast RoBERTa tokenizer from the directory given by consts.SUB_DIR.
# The path is converted to a plain string before it is handed to
# from_pretrained.
tokenizer_path = str(consts.SUB_DIR)
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

## Build the dataset

In [4]:
# Read every row of the 'test' split into memory as (seq, label, rev) tuples.
sql = "select seq, label, rev from seqs where split = 'test'"

# sqlite3's connection context manager only commits or rolls back a
# transaction; it does not close the connection. Close it explicitly so the
# database handle is released as soon as the rows have been materialised.
cxn = sqlite3.connect(consts.SQL)
try:
    RECS = list(cxn.execute(sql))
finally:
    cxn.close()

## Get the trained model

In [5]:
# Load the trained sequence classifier from a fixed checkpoint directory under
# consts.SUB_DIR / 'models'. local_files_only=True restricts from_pretrained
# to files already on disk, so nothing is downloaded.
# NOTE(review): the checkpoint name is hard-coded; change it here when a
# different training run should be evaluated.
path = consts.SUB_DIR / 'models' / 'checkpoint-11225'
model = RobertaForSequenceClassification.from_pretrained(path, local_files_only=True)

## Build the inference pipeline

In [6]:
# Combine the model and tokenizer into a text-classification pipeline.
# The optional `return_all_scores=True` argument is not passed, so each call
# yields only the top-scoring label together with its score.
pipe = TextClassificationPipeline(
    model=model, tokenizer=tokenizer
)

In [7]:
# Smoke test: classify the sequence (first column) of the first test record to
# confirm the pipeline runs and to show the structure of its output.
pipe(RECS[0][0])

[{'label': 'LABEL_0', 'score': 0.9969417452812195}]

## Test the model

In [8]:
# Score the hold-out records one at a time, pairing the label stored in the
# database with the label predicted by the pipeline.

# Upper bound on how many records are scored. When the test split holds more
# rows than this, the metrics below describe only the first MAX_RECORDS rows.
MAX_RECORDS = 100_000

y_true, y_pred = [], []
for seq, label, _rev in tqdm(RECS[:MAX_RECORDS]):
    y_true.append(label)
    # The pipeline returns a one-element list such as
    # [{'label': 'LABEL_0', 'score': ...}]. Parse the whole numeric suffix
    # after the last underscore instead of only the final character, so a
    # class id of 10 or more ('LABEL_10') is not truncated to a single digit.
    label_name = pipe(seq)[0]['label']
    y_pred.append(int(label_name.rsplit('_', 1)[-1]))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 95745/95745 [28:27<00:00, 56.06it/s]


In [9]:
# Confusion matrix of the evaluation run: one row per actual label, one column
# per predicted label.
y_actual, y_predicted = (
    pd.Series(values, name=title)
    for values, title in ((y_true, 'Actual'), (y_pred, 'Predicted'))
)

confusion = pd.crosstab(index=y_actual, columns=y_predicted)
print(confusion)

Predicted      0      1
Actual                 
0          76037   4941
1            862  13905


In [10]:
# Precision: of the records predicted as the positive class, the fraction that
# are truly positive (scikit-learn's default binary setting, positive label 1).
metrics.precision_score(y_true, y_pred)

0.7378223495702005

In [11]:
# Recall: of the records that are truly positive, the fraction the model
# predicted as positive (scikit-learn's default binary setting, positive label 1).
metrics.recall_score(y_true, y_pred)

0.9416265998510192

In [12]:
# F1: harmonic mean of the precision and recall of the positive class
# (scikit-learn's default binary setting, positive label 1).
metrics.f1_score(y_true, y_pred)

0.8273584624996281

In [13]:
# Accuracy: fraction of all evaluated records whose predicted label equals the
# true label, across both classes.
metrics.accuracy_score(y_true, y_pred)

0.9393910909185859