# Using Flan T5 to determine how accurate different parts of articles can predict keywords

In [125]:
import sys
!{sys.executable} -m pip install torch torchvision torchaudio datasets scikit-learn transformers rapidfuzz --quiet
import json
import os
import torch
import re
from rapidfuzz import fuzz
from datasets import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


distutils: /home/rpuranda/.local/lib/python3.9/site-packages
sysconfig: /home/rpuranda/.local/lib64/python3.9/site-packages[0m
user = True
home = None
root = None
prefix = None[0m


In [126]:
def load_data(file_paths):
    data = []
    for file_path in file_paths:
        with open(file_path, "r") as f:
            for line in f:
                try:
                    entry = json.loads(line)
                    data.append(entry)
                except json.JSONDecodeError as e:
                    print(f"Skipping bad line in {file_path}: {e}")
    return data

In [127]:
def prepare_dataset(data, input_field):
    return [
        {
            "input": f"Extract keywords: {item[input_field]}",
            "target": ", ".join(item["keywords"])
        }
        for item in data
        if input_field in item and "keywords" in item and isinstance(item["keywords"], list)
    ]

In [128]:
def tokenize_data(dataset, tokenizer, max_input_length=512, max_target_length=32):
    inputs = [item["input"] for item in dataset]
    targets = [item["target"] for item in dataset]

    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return Dataset.from_dict(model_inputs)

In [129]:
def train_model(model_name, dataset, output_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    tokenized_dataset = tokenize_data(dataset, tokenizer)

    args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="no",
        save_strategy="no",
        remove_unused_columns=False,
        logging_steps=10,
        logging_dir="./logs"
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model, tokenizer

In [130]:
def get_file_paths(prefix, start, end):
    return [f"{prefix}{i:04d}" for i in range(start, end + 1)]

In [131]:
def clean_keyword(kw):
    return re.sub(r'\W+', '', kw.lower())

In [132]:
def is_fuzzy_match(word, candidates, threshold=80):
    return any(fuzz.ratio(word, candidate) >= threshold for candidate in candidates)

In [133]:
def compute_metrics(preds, refs):

    precision_list = []
    recall_list = []
    f1_list = []

    for pred, ref in zip(preds, refs):
        pred = set(clean_keyword(k) for k in pred)
        ref = set(clean_keyword(k) for k in ref)
        true_positives = sum(
            1 for p in pred if is_fuzzy_match(p, ref)
        )
        true_negatives = sum(
            1 for p in pred if not is_fuzzy_match(p, ref)
        )
        precision = true_positives / len(pred) if pred else 0
        recall = true_positives / len(ref) if ref else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0

        print("precision: ", precision, " recall: ", recall, " f1: ", f1, " accuracy: ", (true_positives + true_negatives)/len(preds))

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    return {
        "precision": sum(precision_list) / len(precision_list),
        "recall": sum(recall_list) / len(recall_list),
        "f1": sum(f1_list) / len(f1_list),
    }

In [134]:
def evaluate_order_agnostic(model, tokenizer, dataset):
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    predictions = []
    references = []

    for sample in dataset:
        inputs = tokenizer(sample["input"], return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        input_ids = inputs["input_ids"].to(device) 

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, max_length=64)

        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        ref = sample["target"]

        pred = set(pred.split(", "))
        ref = set(ref.split(", "))

        predictions.append(pred)
        references.append(ref)

        pred = set(clean_keyword(k) for k in pred)
        ref = set(clean_keyword(k) for k in ref)
        true_positives = len(pred & ref)
        precision = true_positives / len(pred) if pred else 0
        recall = true_positives / len(ref) if ref else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0

        print("precision: ", precision, " recall: ", recall, " f1: ", f1)
        # print("Predictions: ", set(pred.split(", ")))
        # print("References: ", set(ref.split(", ")))

    return compute_metrics(predictions, references)

In [135]:
model_name = "google/flan-t5-small"
train_files = get_file_paths("./data/training-data-chunk-", 0, 7)
test_files = get_file_paths("./data/training-data-chunk-", 8, 9)

print("Loading data...")
train_data = load_data(train_files)
test_data = load_data(test_files)

Loading data...


In [136]:
print("Preparing abstract...")
train_abstract = prepare_dataset(train_data, "abstract_content")
print(f"Training with {len(train_abstract)} abstract samples")
test_abstract = prepare_dataset(test_data, "abstract_content")
print(f"Testing with {len(test_abstract)} abstract samples")

Preparing abstract...
Training with 8000 abstract samples
Testing with 2000 abstract samples


In [137]:
print("Preparing body...")
train_body = prepare_dataset(train_data, "content")
print(f"Training with {len(train_body)} body samples")
test_body = prepare_dataset(test_data, "content")
print(f"Testing with {len(test_body)} body samples")

Preparing body...
Training with 8000 body samples
Testing with 2000 body samples


In [138]:
print("Training abstract model...")
abstract_model, abstract_tokenizer = train_model(model_name, train_abstract, "flan_t5_abstract")

Training abstract model...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,13.1552
20,11.9241
30,12.3198
40,12.0127
50,11.1469
60,10.6371
70,9.3363
80,8.5878
90,8.6469
100,8.0695


KeyboardInterrupt: 

In [None]:
print("Training body model...")
body_model, body_tokenizer = train_model(model_name, train_body, "flan_t5_body")

Training body model...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(


Step,Training Loss
10,13.1293
20,12.367
30,12.5617
40,12.1929
50,11.2085
60,11.1093
70,9.7662
80,8.8503
90,9.3241
100,8.4932


In [None]:
print("Evaluating abstract-only model...")
abstract_metrics = evaluate_order_agnostic(abstract_model, abstract_tokenizer, test_abstract)

Evaluating abstract-only model...
precision:  0.0  recall:  0.0  f1:  0
precision:  0.3333333333333333  recall:  0.125  f1:  0.18181818181818182
precision:  0.3333333333333333  recall:  0.2  f1:  0.25
precision:  0.0  recall:  0.0  f1:  0
precision:  0.25  recall:  0.25  f1:  0.25
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.3333333333333333  recall:  0.25  f1:  0.28571428571428575
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.25  recall:  0.2  f1:  0.22222222222222224
precision:  0.0  recall:  0.0  f1:  0
precision:  0.6666666666666666  recall:  0.3333333333333333  f1:  0.4444444444444444
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.3333333333333333  recall:  0.16666666666666666  f1:  0.2222222222222222
precision:  0.25  recall:  0.166666666

In [None]:
print("Evaluating body-only model...")
body_metrics = evaluate_order_agnostic(body_model, body_tokenizer, test_body)

Evaluating body-only model...
precision:  0.0  recall:  0.0  f1:  0
precision:  0.25  recall:  0.0625  f1:  0.1
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.5  recall:  0.25  f1:  0.3333333333333333
precision:  0.0  recall:  0.0  f1:  0
precision:  0.3333333333333333  recall:  0.06666666666666667  f1:  0.1111111111111111
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.0  recall:  0.0  f1:  0
precision:  0.3333333333333333  recall:  0.2  f1:  0.25
precis

In [None]:
print(abstract_metrics)
print(body_metrics)

{'precision': 0.20252023809523773, 'recall': 0.13490493306611787, 'f1': 0.1520973122614194}
{'precision': 0.1746428571428569, 'recall': 0.09780217426878793, 'f1': 0.11708462430128175}
