## Import libraries

In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## Import data

In [None]:
# Parquet file the source texts are read from.
INPUT_FILE = "../../data/SB_publication_PMC_texts.parquet"
# Parquet file the table is written to once the simplified column is added.
OUTPUT_FILE = "../../data/SB_publication_PMC_texts_simplified.parquet"

In [None]:
# Load the full table of texts into memory.
df_texts = pd.read_parquet(path=INPUT_FILE)
# Last expression of the cell: previews the first five rows.
df_texts.head(n=5)

# MODEL

## Load model and instruction

#### Model: haining/scientific_abstract_simplification

In [None]:
# Hugging Face checkpoint id shared by the tokenizer and the seq2seq model.
MODEL_CHECKPOINT = "haining/scientific_abstract_simplification"
# Task prefix prepended to every input text before tokenization.
INSTRUCTION = "summarize, simplify, and contextualize: "

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

## Apply model and save file

In [None]:
def simplify_text_safe(text):
    """
    Rewrite `text` with the loaded seq2seq model, or return "" when no
    rewrite is produced.

    An empty string is returned when:
      - `text` is not a `str` (e.g. NaN / None cells),
      - `text` has fewer than 300 characters once surrounding whitespace
        is stripped,
      - tokenization, generation, or decoding raises (the error is printed).

    Relies on the module-level `tokenizer`, `model`, and `INSTRUCTION`.
    """
    if not isinstance(text, str) or len(text.strip()) < 300:
        return ""

    try:
        # The prompt (instruction prefix + text) is truncated or padded to
        # exactly 672 tokens.
        model_inputs = tokenizer(
            INSTRUCTION + text,
            max_length=672,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Sampling is enabled (top_p=0.9), so the output is not expected to
        # be identical across runs.
        generated = model.generate(
            input_ids=model_inputs['input_ids'],
            attention_mask=model_inputs['attention_mask'],
            max_length=512,
            top_p=0.9,
            do_sample=True,
        )
        simplified = tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        # Best effort: one failing row must not abort the whole apply().
        print(f"⚠️ Error simplifying text: {e}")
        return ""
    return simplified

In [None]:
# Run the simplifier over every abstract, with a tqdm progress bar.
simplified_abstracts = df_texts['abstract'].progress_apply(simplify_text_safe)
df_texts['simplified_abstract'] = simplified_abstracts

# Persist the augmented table; the DataFrame index is not written.
df_texts.to_parquet(path=OUTPUT_FILE, index=False)