In [2]:
!pip install transformers datasets rouge-score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Paths ---
INPUT_DIR = "/content/drive/MyDrive/MIXSCI"
TRAIN_CSV = "MixSub-SciHigh_train_FIRE.csv"
VAL_CSV = "MixSub-SciHigh_val_FIRE.csv"
TEST_CSV = "MixSub-SciHigh_test_FIRE.csv"
OUTPUT_FILE = "/content/drive/MyDrive/MIXSCI/lora_test_predictions.csv"

# --- Imports ---
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import get_peft_model, LoraConfig, TaskType
from rouge_score import rouge_scorer
from tqdm import tqdm

# --- Device ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Load Data ---
train_df = pd.read_csv(os.path.join(INPUT_DIR, TRAIN_CSV))
val_df = pd.read_csv(os.path.join(INPUT_DIR, VAL_CSV))
test_df = pd.read_csv(os.path.join(INPUT_DIR, TEST_CSV))

train_ds = Dataset.from_pandas(train_df[["Abstract", "Highlights"]])
val_ds = Dataset.from_pandas(val_df[["Abstract", "Highlights"]])

# --- Model & Tokenizer ---
model_name = "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# --- LoRA Config ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = get_peft_model(base_model, lora_config)

# --- Preprocessing ---
max_input = 512
max_target = 64

def tokenize(batch):
    inputs = tokenizer(batch["Abstract"], padding="max_length", truncation=True, max_length=max_input)
    targets = tokenizer(batch["Highlights"], padding="max_length", truncation=True, max_length=max_target)
    inputs["labels"] = targets["input_ids"]
    return inputs

train_ds = train_ds.map(tokenize, batched=False)
val_ds = val_ds.map(tokenize, batched=False)

# --- Training Args ---
training_args = Seq2SeqTrainingArguments(
    #output_dir="/kaggle/working/lora_checkpoints",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    #logging_dir="/kaggle/working/logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=1,
    predict_with_generate=True
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# --- Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# --- Train ---
trainer.train()

# --- Evaluation ---
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_input).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_target)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\nGenerating validation summaries...")
val_preds = [generate_summary(t) for t in tqdm(val_df["Abstract"])]
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = [scorer.score(ref, pred) for ref, pred in zip(val_df["Highlights"], val_preds)]

def avg(metric): return np.mean([s[metric].fmeasure for s in scores])

print(f"\nROUGE-1: {avg('rouge1'):.4f}")
print(f"ROUGE-2: {avg('rouge2'):.4f}")
print(f"ROUGE-L: {avg('rougeL'):.4f}")

# --- Generate Test Predictions ---
print("\nGenerating test predictions...")
test_df["Generated_Highlights"] = [generate_summary(t) for t in tqdm(test_df["Abstract"])]
test_df[["Filename", "Generated_Highlights"]].to_csv(OUTPUT_FILE, index=False)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1985 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


**Prediction for test set**