<a href="https://colab.research.google.com/github/sathu0622/25-26J-438-AI-Powered-LMS-for-Visually-Impaired-Students/blob/AI-Powered-System-for-Voice-Based-Resource-Type-Summarization-of-Historical-Content-for-VIS/Summarization2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===================== 2. Mount Google Drive =====================
# The dataset is read from Drive and the trained model is written back to it.
from google.colab import drive
drive.mount('/content/drive')

# ===================== 3. Load Dataset =====================
import json
from datasets import Dataset

dataset_path = "/content/drive/MyDrive/history_dataset.json"  # update path

with open(dataset_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the three fields the pipeline uses and rename them to the column
# names the tokenization step reads ('text', 'summary', 'source_type').
dataset = Dataset.from_list([{
    'text': item['content'],
    'summary': item['target_summary'],
    'source_type': item['source_type']
} for item in data])

# Fixed seed: without it every run of this cell draws a different 90/10
# split, so a model trained in one session could later be evaluated on rows
# it was trained on.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")

# ===================== 4. Tokenization =====================
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
# Token limit applied to every input document (longer inputs are truncated).
max_input_length = 1024
# Token limit for the summary, chosen per source type; source types missing
# from this map fall back to 150 where the map is consulted.
max_target_length_map = {'newspaper': 120, 'magazine': 220, 'book': 400}

def preprocess(batch):
    """Tokenize a batch of documents and their reference summaries.

    Intended for ``Dataset.map(..., batched=True)``: ``batch`` is a dict of
    equal-length lists with the keys ``'text'``, ``'summary'`` and
    ``'source_type'``.

    Inputs are truncated to ``max_input_length`` tokens. Each summary is
    truncated to the limit that ``max_target_length_map`` assigns to its
    source type (150 when the type is not in the map).

    No padding is applied here. Padding to a fixed length would put pad-token
    ids into ``labels``, and those positions would then count towards the
    training loss. Leaving sequences unpadded lets ``DataCollatorForSeq2Seq``
    pad each batch to its own longest sequence and fill label padding with
    the ignore index.

    Returns:
        The same ``batch`` dict with ``input_ids``, ``attention_mask`` and
        ``labels`` added.
    """
    inputs = tokenizer(batch['text'], max_length=max_input_length, truncation=True)

    # Summaries are tokenized one by one because the truncation limit differs
    # per example.
    labels = []
    for summary, source_type in zip(batch['summary'], batch['source_type']):
        target_limit = max_target_length_map.get(source_type, 150)
        encoded = tokenizer(summary, max_length=target_limit, truncation=True)
        labels.append(encoded['input_ids'])

    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = labels
    return batch

# Tokenize both splits in batches; `preprocess` adds input_ids,
# attention_mask and labels to every row.
train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

# ===================== 5. Load Model =====================
from transformers import BartForConditionalGeneration

# Fine-tuning starts from the pretrained "facebook/bart-large-cnn" checkpoint,
# the same checkpoint the tokenizer was loaded from.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# ===================== 6. Check GPU =====================
import torch
# Mixed-precision training is switched on only when a CUDA device is present.
use_fp16 = torch.cuda.is_available()
print(f"GPU available: {use_fp16}")

# ===================== 7. Training Arguments =====================
from transformers import TrainingArguments

# Checkpoints and the final model both go to this Google Drive folder.
output_dir = "/content/drive/MyDrive/history_summary_model"

# NOTE(review): no evaluation strategy is set here, so the eval_dataset given
# to the Trainer below is presumably never evaluated during training - confirm
# whether that is intended.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=10,       # training loss is reported every 10 steps
    save_steps=500,         # a checkpoint is written every 500 steps
    save_total_limit=2,     # keep at most two checkpoints in output_dir
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=use_fp16
)

# ===================== 8. Data Collator =====================
from transformers import DataCollatorForSeq2Seq

# Assembles individual tokenized rows into padded seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ===================== 9. Trainer =====================
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ===================== 10. Train =====================
trainer.train()

# ===================== 11. Save Final Model =====================
# Save the tokenizer next to the weights so the folder can be reloaded with
# from_pretrained(output_dir) for both.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

# ===================== 12. Generate Summaries =====================
def summarize(text, source_type='magazine'):
    """Generate a summary of ``text`` with the fine-tuned model.

    Args:
        text: The document to summarize. It is truncated to
            ``max_input_length`` tokens.
        source_type: Key into ``max_target_length_map`` that selects the
            maximum summary length. Unknown types fall back to 150 tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    max_length = max_target_length_map.get(source_type, 150)

    # Training leaves the model in train mode; switch to eval so dropout does
    # not perturb generation.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", max_length=max_input_length, truncation=True)
    # The Trainer moves the model to the GPU when one is available, while the
    # tokenizer returns CPU tensors. Generation fails with a device-mismatch
    # RuntimeError unless the inputs are moved to the model's device.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=int(max_length*0.5),  # require at least half of the length budget
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Smoke test: summarize the first training row using its own source type.
first_row = train_dataset[0]
example_text, example_type = first_row['text'], first_row['source_type']
print("Generated Summary:")
generated = summarize(example_text, source_type=example_type)
print(generated)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train samples: 360, Test samples: 41


Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

GPU available: True


  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msathushan622[0m ([33msathushan622-sliit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,3.8987
20,2.5498
30,2.3494
40,2.0854
50,1.7391
60,1.8411
70,1.9152
80,1.9258
90,1.9748
100,1.945




Model saved to /content/drive/MyDrive/history_summary_model
Generated Summary:


RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)

In [None]:
# ===================== 0Ô∏è‚É£ Install Dependencies =====================
!pip install -q evaluate rouge_score transformers

# ===================== 1Ô∏è‚É£ Load the Saved Model =====================
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

model_path = "/content/drive/MyDrive/history_summary_model"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print("Model loaded. Using device:", device)

# ===================== 2Ô∏è‚É£ Import Evaluation Library =====================
import evaluate
rouge = evaluate.load("rouge")

# ===================== 3Ô∏è‚É£ Define Summarization Function =====================
max_target_length_map = {'newspaper': 120, 'magazine': 220, 'book': 400}

def summarize(text, source_type='magazine'):
    """Return a beam-search summary of ``text`` from the loaded model.

    The maximum output length is looked up in ``max_target_length_map`` by
    ``source_type`` (150 for unknown types). The minimum length is half of
    that maximum. Input longer than 1024 tokens is truncated.
    """
    length_cap = max_target_length_map.get(source_type, 150)

    # Tokenize and move the tensors to the device the model lives on.
    encoded = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to(device)

    generation_options = dict(
        max_length=length_cap,
        min_length=int(length_cap*0.5),
        length_penalty=2.0,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    output_ids = model.generate(encoded["input_ids"], **generation_options)

    # Only one document was passed in, so only one sequence comes back.
    best_sequence = output_ids[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# ===================== 4. Generate Summaries on Test Dataset =====================
# The two lists are filled in lockstep, so index i of one corresponds to
# index i of the other.
generated_summaries = []
reference_summaries = []

for example in test_dataset:
    pred = summarize(example['text'], example['source_type'])
    generated_summaries.append(pred)
    reference_summaries.append(example['summary'])

# ===================== 5. Print Some Examples =====================
# Show at most five examples; the cap avoids an IndexError when the test
# split has fewer than five rows.
for i in range(min(5, len(generated_summaries))):
    sample = test_dataset[i]
    print(f"--- Example {i+1} ---")
    print("Source Type:", sample['source_type'])
    print("Original Text:", sample['text'][:500], "...")
    print("Target Summary:", reference_summaries[i])
    print("Generated Summary:", generated_summaries[i])
    print("\n")

# ===================== 6. Calculate ROUGE Scores =====================
results = rouge.compute(predictions=generated_summaries, references=reference_summaries)
print("ROUGE Scores:")
for key, score in results.items():
    # The `evaluate` ROUGE metric returns plain floats, so `.mid.fmeasure`
    # raises AttributeError. Aggregate objects exposing `.mid` are still
    # accepted for compatibility with older metric implementations.
    mid = getattr(score, "mid", None)
    fmeasure = mid.fmeasure if mid is not None else score
    print(f"{key}: {fmeasure:.4f}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
Model loaded. Using device: cuda




--- Example 1 ---
Source Type: magazine
Original Text: The Seventh Dragoon Guards made one of the last charges of the First World War at 10.30am on 11 November 1918, galloping forward and capturing the bridge at Lessines in the Picardy region of Belgium. Fighting officially ended at 11am on the same day, when the Armistice was signed. On hearing the news, the Fifth Dragoon Guards  official war diary described the celebration as the regiment prepared for a  triumphal march into Germany . While the official diary could easily be written in a celebrato ...
Target Summary: The First World War‚Äôs thunderous end arrived with a stark paradox on November 11, 1918. Even as the official Armistice was hours away, the Seventh Dragoon Guards made one of the conflict's last charges, capturing a bridge at Lessines. While official war diaries optimistically spoke of triumphal marches into Germany and newspapers at home erupted in jubilant celebration, the news of peace was often met with a far more c

AttributeError: 'numpy.float64' object has no attribute 'mid'