## Imports

In [None]:
import os
import torch
import datasets

import pandas as pd
import numpy as np

from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DataCollatorWithPadding,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EvalPrediction,
    pipeline,
)

from utils import merge_title_perex_body

## Define constants

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print(DEVICE)

In [None]:
# Seed passed to the TrainingArguments below.
RANDOM_SEED = 11

# Directory from which both the classification model and its tokenizer are loaded.
MODEL_DIR = "distilbert_5ep_weighted_CE_loss_w_augmented_data/model"

# CSV with the articles to score, and the CSV that receives the flagged rows.
INPUT_DATA_FILEPATH = "data/100k_prod_articles.csv"
OUTPUT_PREDICTIONS_FILEPATH = "data/found_sensitive_to_check.csv"

## Read data

In [None]:
# Load the articles to be scored into a DataFrame.
df = pd.read_csv(filepath_or_buffer=INPUT_DATA_FILEPATH)

In [None]:
# Display the (rows, columns) dimensions of the loaded data.
df.shape

In [None]:
# Build a single `text` column by applying the project helper
# `merge_title_perex_body` to every row, then drop the three source columns
# in place so only the merged text (and any remaining columns) is kept.
df["text"] = df.apply(merge_title_perex_body, axis="columns")
df.drop(columns=["title", "perex", "body"], inplace=True)

In [None]:
# Preview the first three rows after the text columns have been merged.
df.head(3)

## Get model

In [None]:
# The tokenizer and the sequence-classification model are both restored from
# the same directory (MODEL_DIR).
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

## Dataset

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        data: Table-like object exposing a ``text`` column and, unless
            ``is_test`` is set, a ``sensitive`` column. Both are read via
            ``.values.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True)``
            whose result provides ``"input_ids"`` and ``"attention_mask"``.
        is_test: When true, no labels are read and the returned items carry
            no ``"label"`` key.
    """

    def __init__(self, data, tokenizer, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.text = self.data.text.values.tolist()
        if not is_test:
            self.label = self.data.sensitive.values.tolist()

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self, item):
        """Tokenize the text at position ``item`` and return its features.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"``; when the
            dataset is not in test mode it also contains an integer
            ``"label"``.
        """
        # Use the tokenizer handed to the constructor rather than a
        # module-level global, which may be missing or a different object.
        tokenized = self.tokenizer(self.text[item], truncation=True)
        features = {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
        }
        if self.is_test:
            return features
        return {"label": int(self.label[item]), **features}

In [None]:
# Wrap the articles for inference; test mode means no labels are read.
test_dataset = DatasetRetriever(data=df, tokenizer=tokenizer, is_test=True)

In [None]:
# Collator built from the loaded tokenizer and handed to the Trainer below.
# Dataset items are tokenized with truncation only (no padding), so padding
# is presumably applied per batch by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Trainer object

In [None]:
# The Trainer is only used for inference in this notebook (`trainer.predict`),
# so no train or eval dataset is passed to it.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    # Batch size used by `trainer.predict`.
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    seed=RANDOM_SEED,
    # Real boolean: the string "True" only worked through truthiness.
    overwrite_output_dir=True,
    # Step-based evaluation settings are intentionally omitted: the Trainer
    # receives no eval_dataset, so periodic evaluation cannot run.
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

In [None]:
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# predictions, _, _ = trainer.prediction_loop(test_loader, description="prediction")

## Generate predictions

In [None]:
# Run the model over every article and keep, per row, the index of the
# highest-scoring class.
predictions = np.argmax(
    trainer.predict(test_dataset, metric_key_prefix="predict").predictions,
    axis=1,
)

In [None]:
# Preview up to ten row positions whose predicted class index is non-zero.
list(np.nonzero(predictions)[0])[:10]

In [None]:
# Keep only the rows whose predicted class index is non-zero and write them
# (without the DataFrame index) to the output CSV.
df.iloc[np.flatnonzero(predictions)].to_csv(OUTPUT_PREDICTIONS_FILEPATH, index=False)