## Abstractive Summarization Using Transformers
## Project Code - Successful Implementation
### Sai Srikar Emani

# Environment Setup and Data Loading

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install kaggle



In [None]:
!mkdir -p ~/.kaggle
!cp '/content/drive/MyDrive/Scientific Abstract Summarization/kaggle.json' ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!cp '/content/drive/My Drive/Scientific Abstract Summarization/archive.zip' .

In [None]:
import zipfile

# Extract the dataset
with zipfile.ZipFile("archive.zip", 'r') as zip_ref:
    zip_ref.extractall("arxiv_dataset")

print("Dataset extracted!")

Dataset extracted!


In [None]:
!ls arxiv_dataset

arxiv-metadata-oai-snapshot.json


In [None]:
import json

# Load the JSON file
file_path = "arxiv_dataset/arxiv-metadata-oai-snapshot.json"

# Read and limit the number of records to 50,000
with open(file_path, 'r') as f:
    data = [json.loads(line) for i, line in enumerate(f) if i < 50000]

# Check the number of entries
print(f"Total entries loaded: {len(data)}")

Total entries loaded: 50000


# Data Preprocessing

In [None]:
import pandas as pd

# Extract relevant fields (titles and abstracts)
processed_data = []
for entry in data:
    if 'title' in entry and 'abstract' in entry:  # Ensure both fields exist
        processed_data.append({
            "input_text": entry["abstract"],
            "target_text": entry["title"]
        })

# Convert to DataFrame
df = pd.DataFrame(processed_data)

# Check the processed data
print(f"Total processed entries: {len(df)}")
print(df.head())

Total processed entries: 50000
                                          input_text  \
0    A fully differential calculation in perturba...   
1    We describe a new algorithm, the $(k,\ell)$-...   
2    The evolution of Earth-Moon system is descri...   
3    We show that a determinant of Stirling cycle...   
4    In this paper we show how to compute the $\L...   

                                         target_text  
0  Calculation of prompt diphoton production cros...  
1           Sparsity-certifying Graph Decompositions  
2  The evolution of the Earth-Moon system based o...  
3  A determinant of Stirling cycle numbers counts...  
4  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...  


In [None]:
# Split into train, validation, and test sets
train_df = df[:40000]  # First 40,000 for training
val_df = df[40000:45000]  # Next 5,000 for validation
test_df = df[45000:]  # Final 5,000 for testing

# Save as CSV files
train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

print("Train, validation, and test sets saved as CSV!")

Train, validation, and test sets saved as CSV!


In [None]:
!mv train.csv '/content/drive/My Drive/Scientific Abstract Summarization/'
!mv val.csv '/content/drive/My Drive/Scientific Abstract Summarization/'
!mv test.csv '/content/drive/My Drive/Scientific Abstract Summarization/'

In [None]:
!pip install transformers



In [None]:
import pandas as pd

# Load datasets
train_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/train.csv")
val_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/val.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/test.csv")

# Display dataset information
print("Train data size:", len(train_df))
print("Validation data size:", len(val_df))
print("Test data size:", len(test_df))

Train data size: 40000
Validation data size: 5000
Test data size: 5000


In [None]:
from transformers import PegasusTokenizer

# Load Pegasus tokenizer
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")

# Preprocessing function
def preprocess_text(dataframe, max_input_length=1024, max_target_length=128):
    inputs = []
    targets = []

    for _, row in dataframe.iterrows():
        # Tokenize input text (abstract) and truncate/pad
        input_text = tokenizer(
            row["input_text"],
            max_length=max_input_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        # Tokenize target text (title) and truncate/pad
        target_text = tokenizer(
            row["target_text"],
            max_length=max_target_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        inputs.append(input_text)
        targets.append(target_text)

    return inputs, targets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

In [None]:
# Preprocess train data
train_inputs, train_targets = preprocess_text(train_df)

# Preprocess validation data
val_inputs, val_targets = preprocess_text(val_df)

# Preprocess test data
test_inputs, test_targets = preprocess_text(test_df)

print("Data preprocessing completed!")

Data preprocessing completed!


In [None]:
import torch

# Save preprocessed data as .pt (PyTorch tensor) files
torch.save((train_inputs, train_targets), "train_tokenized.pt")
torch.save((val_inputs, val_targets), "val_tokenized.pt")
torch.save((test_inputs, test_targets), "test_tokenized.pt")

print("Tokenized datasets saved!")

Tokenized datasets saved!


In [None]:
!mv train_tokenized.pt '/content/drive/My Drive/Scientific Abstract Summarization/'
!mv val_tokenized.pt '/content/drive/My Drive/Scientific Abstract Summarization/'
!mv test_tokenized.pt '/content/drive/My Drive/Scientific Abstract Summarization/'

In [None]:
# Check the first tokenized input and target
print("Sample tokenized input:", train_inputs[0])
print("Sample tokenized target:", train_targets[0])


Sample tokenized input: {'input_ids': tensor([[  202,  1069, 13945,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}
Sample tokenized target: {'input_ids': tensor([[57394,   113,  6712,  4218, 18580,   454,   889,  1891,  4201,   134,
         67240, 13368,   111, 75265, 15269,     1,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,    

# Model Design and Training

In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load the Pegasus model and tokenizer
model_name = "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

print("Model and tokenizer loaded successfully!")

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Model and tokenizer loaded successfully!


In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=90aea9bd59e22d7b5cc8d45eec56567a3a00f52b2870c7972f34229a1e28306a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Install required libraries
!pip install transformers datasets torch evaluate accelerate tqdm

# Import libraries
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.optim import AdamW
from evaluate import load
import pandas as pd
import os
from tqdm import tqdm  # For progress bars

# Enable memory optimization with environment variable
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load datasets
train_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/train.csv")
val_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/val.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/test.csv")

# Verify dataset sizes
print(f"Train data size: {len(train_df)}")
print(f"Validation data size: {len(val_df)}")
print(f"Test data size: {len(test_df)}")

# Initialize tokenizer
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Define a custom dataset class
class PegasusDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_input_length=512, max_target_length=128):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        model_inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        labels = self.tokenizer(
            target_text,
            max_length=self.max_target_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        return {
            "input_ids": model_inputs["input_ids"].squeeze(0),
            "attention_mask": model_inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
        }

# Create datasets
train_dataset = PegasusDataset(train_df["input_text"], train_df["target_text"], tokenizer)
val_dataset = PegasusDataset(val_df["input_text"], val_df["target_text"], tokenizer)
test_dataset = PegasusDataset(test_df["input_text"], test_df["target_text"], tokenizer)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)  # Adjust batch size based on memory
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Load the model
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model.to(device)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=3e-5)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Load the ROUGE metric
rouge = load("rouge")

# Training loop
num_epochs = 3  # Train for more epochs for better performance
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    train_bar = tqdm(train_dataloader, desc="Training", leave=True)

    for batch in train_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        train_bar.set_postfix({"Loss": loss.item()})

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    total_val_loss = 0
    decoded_preds, decoded_labels = [], []

    print(f"\nEpoch {epoch + 1}/{num_epochs} Validation")
    val_bar = tqdm(val_dataloader, desc="Validating", leave=True)

    with torch.no_grad():
        for batch in val_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Compute loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Generate predictions
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
            preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            targets = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds.extend(preds)
            decoded_labels.extend(targets)

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")

    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation ROUGE Scores:", rouge_results)

# Save the model
output_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved successfully to {output_dir}")

# Evaluate on test dataset
test_dataloader = DataLoader(test_dataset, batch_size=4)
model.eval()
decoded_preds, decoded_labels = [], []
total_test_loss = 0

print("\nEvaluating on test dataset...")
test_bar = tqdm(test_dataloader, desc="Testing", leave=True)

with torch.no_grad():
    for batch in test_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Compute loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        total_test_loss += outputs.loss.item()

        # Generate predictions
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        targets = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds.extend(preds)
        decoded_labels.extend(targets)

avg_test_loss = total_test_loss / len(test_dataloader)
print(f"\nTest Loss: {avg_test_loss:.4f}")

# Compute ROUGE scores for test set
test_rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
print("\nTest ROUGE Scores:", test_rouge_results)

Train data size: 40000
Validation data size: 5000
Test data size: 5000


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 10000/10000 [59:31<00:00,  2.80it/s, Loss=0.262]


Epoch 1/3, Training Loss: 0.5744

Epoch 1/3 Validation


Validating: 100%|██████████| 1250/1250 [19:17<00:00,  1.08it/s]


Epoch 1/3, Validation Loss: 0.2838
Epoch 1/3, Validation ROUGE Scores: {'rouge1': 0.413188973979714, 'rouge2': 0.23099091188095836, 'rougeL': 0.37689828229119154, 'rougeLsum': 0.37682674977380726}

Epoch 2/3


Training: 100%|██████████| 10000/10000 [59:31<00:00,  2.80it/s, Loss=0.227]


Epoch 2/3, Training Loss: 0.2861

Epoch 2/3 Validation


Validating: 100%|██████████| 1250/1250 [19:21<00:00,  1.08it/s]


Epoch 2/3, Validation Loss: 0.2700
Epoch 2/3, Validation ROUGE Scores: {'rouge1': 0.4188902636457766, 'rouge2': 0.2331537289480875, 'rougeL': 0.38055471609687885, 'rougeLsum': 0.3801851400703062}

Epoch 3/3


Training: 100%|██████████| 10000/10000 [59:31<00:00,  2.80it/s, Loss=0.45]


Epoch 3/3, Training Loss: 0.2625

Epoch 3/3 Validation


Validating: 100%|██████████| 1250/1250 [19:23<00:00,  1.07it/s]


Epoch 3/3, Validation Loss: 0.2641
Epoch 3/3, Validation ROUGE Scores: {'rouge1': 0.4190391689838655, 'rouge2': 0.2358307695685605, 'rougeL': 0.3818613533010472, 'rougeLsum': 0.38152789740839754}




Model and tokenizer saved successfully to /content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus

Evaluating on test dataset...


Testing: 100%|██████████| 1250/1250 [19:08<00:00,  1.09it/s]



Test Loss: 0.2671

Test ROUGE Scores: {'rouge1': 0.4209434536027694, 'rouge2': 0.2362803918701689, 'rougeL': 0.38329139867115747, 'rougeLsum': 0.38341236962296105}


# Evaluation

- Evaluate Performance

In [None]:
# Install necessary libraries
!pip install sacrebleu rouge-score nltk evaluate

# Import libraries
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from evaluate import load
import pandas as pd
import torch
from tqdm import tqdm
import nltk

# Download NLTK tokenizer for BLEU evaluation
nltk.download('punkt')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"
model = PegasusForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = PegasusTokenizer.from_pretrained(model_path)

# Load the test dataset
test_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/test.csv")

# Prepare test data
test_inputs = test_df["input_text"].tolist()
test_targets = test_df["target_text"].tolist()

# Load ROUGE and BLEU metrics
rouge = load("rouge")
bleu = load("sacrebleu")

# Model evaluation
model.eval()
decoded_preds, decoded_labels = [], []

print("\nEvaluating model on test data...")
for input_text, target_text in tqdm(zip(test_inputs, test_targets), total=len(test_inputs), desc="Testing"):
    # Tokenize input text
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512).input_ids.to(device)

    # Generate predictions
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, max_length=128, num_beams=8, length_penalty=0.6)
    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Save predictions and references
    decoded_preds.extend(preds)
    decoded_labels.append(target_text)

# Evaluate ROUGE scores
rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
print("\nROUGE Scores:")
for metric, score in rouge_results.items():
    print(f"{metric}: {score:.4f}")  # Directly print the score

# Evaluate BLEU scores
bleu_results = bleu.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])
print("\nBLEU Score:", bleu_results["score"])

# Save the predictions and references for human evaluation
output_df = pd.DataFrame({"Input": test_inputs, "Generated Summary": decoded_preds, "Reference Summary": decoded_labels})
output_df.to_csv("/content/drive/MyDrive/Scientific Abstract Summarization/test_predictions.csv", index=False)

print("\nPredictions and references saved to 'test_predictions.csv'.")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Evaluating model on test data...


Testing: 100%|██████████| 5000/5000 [36:54<00:00,  2.26it/s]



ROUGE Scores:
rouge1: 0.4197
rouge2: 0.2348
rougeL: 0.3821
rougeLsum: 0.3878

BLEU Score: 12.232242080689035

Predictions and references saved to 'test_predictions.csv'.


- Evaluate Summaries

In [None]:
# Import necessary libraries
import pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the saved tokenizer and model
output_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"
tokenizer = PegasusTokenizer.from_pretrained(output_dir)
model = PegasusForConditionalGeneration.from_pretrained(output_dir)
model.eval()

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the test dataset
test_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/test.csv")

# Display some generated summaries and compare them to the references
print("\nDisplaying a few generated summaries with reference summaries:")
for i in range(3):  # Change the number here to display more/less examples
    input_text = test_df.iloc[i]["input_text"]
    reference_summary = test_df.iloc[i]["target_text"]

    # Generate summary
    inputs = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt").to(device)
    summary_ids = model.generate(inputs["input_ids"], max_length=128, num_beams=5, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print(f"\nExample {i + 1}:")
    print(f"Input Article:\n{input_text}\n")
    print(f"Reference Summary:\n{reference_summary}\n")
    print(f"Generated Summary:\n{generated_summary}\n")

# Allow user to input an article for summary generation
while True:
    print("\n--- Enter a custom input article for summarization (type 'exit' to quit) ---")
    user_input = input("Your Input: ")

    if user_input.lower() == "exit":
        print("Exiting...")
        break

    # Generate summary for the user input
    user_inputs = tokenizer(user_input, max_length=512, truncation=True, return_tensors="pt").to(device)
    user_summary_ids = model.generate(user_inputs["input_ids"], max_length=128, num_beams=5, early_stopping=True)
    user_generated_summary = tokenizer.decode(user_summary_ids[0], skip_special_tokens=True)

    print("\nGenerated Summary:")
    print(user_generated_summary)



Displaying a few generated summaries with reference summaries:

Example 1:
Input Article:
  We investigate cosmological particle production in spacetimes where Lorentz
invariance emerges in the infrared limit, but is explicitly broken in the
ultraviolet regime. Our specific model focuses on the boost subgroup that
supports CPT invariance and results in a momentum-dependent dispersion
relation. Motivated by previous studies on spacetimes emerging from a
microscopic substrate, we show how these modifications naturally lead to
momentum-dependent rainbow metrics. Firstly, we investigate the possibility of
reproducing cosmological particle production in spacetimes emerging from real
Bose gases. We have studied the influence of non-perturbative ultraviolet
corrections in time-dependent analogue spacetimes, leading to
momentum-dependent emergent rainbow spacetimes. Within certain limits the
analogy is sufficiently good to simulate relativistic quantum field theory in
time-dependent classical

- https://www.cnn.com/2024/11/29/politics/trump-federal-agency-dc-blm/index.html

- https://www.theverge.com/24306534/black-friday-2024-streaming-best-deals-max-hulu-peacock-paramount-plus-cyber-monday

- https://www.reuters.com/world/uk/bank-england-warns-risks-rise-global-trade-barriers-2024-11-29/

- https://m.imdb.com/news/movie/

-https://www.espn.com/nfl/story/_/id/42664634/nfl-week-13-chicago-bears-lose-detroit-lions-thanksgiving-day

Scientific Papers

In [None]:
# Import necessary libraries
import pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the saved tokenizer and model
output_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"
tokenizer = PegasusTokenizer.from_pretrained(output_dir)
model = PegasusForConditionalGeneration.from_pretrained(output_dir)
model.eval()

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the test dataset
test_df = pd.read_csv("/content/drive/MyDrive/Scientific Abstract Summarization/test.csv")

# Display some generated summaries and compare them to the references
print("\nDisplaying a few generated summaries with reference summaries:")
for i in range(3):  # Change the number here to display more/less examples
    input_text = test_df.iloc[i]["input_text"]
    reference_summary = test_df.iloc[i]["target_text"]

    # Generate summary
    inputs = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt").to(device)
    summary_ids = model.generate(inputs["input_ids"], max_length=128, num_beams=5, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print(f"\nExample {i + 1}:")
    print(f"Input Article:\n{input_text}\n")
    print(f"Reference Summary:\n{reference_summary}\n")
    print(f"Generated Summary:\n{generated_summary}\n")

# Allow user to input an article for summary generation
while True:
    print("\n--- Enter a custom input article for summarization (type 'exit' to quit) ---")
    user_input = input("Your Input: ")

    if user_input.lower() == "exit":
        print("Exiting...")
        break

    # Generate summary for the user input
    user_inputs = tokenizer(user_input, max_length=512, truncation=True, return_tensors="pt").to(device)
    user_summary_ids = model.generate(user_inputs["input_ids"], max_length=128, num_beams=5, early_stopping=True)
    user_generated_summary = tokenizer.decode(user_summary_ids[0], skip_special_tokens=True)

    print("\nGenerated Summary:")
    print(user_generated_summary)



Displaying a few generated summaries with reference summaries:

Example 1:
Input Article:
  We investigate cosmological particle production in spacetimes where Lorentz
invariance emerges in the infrared limit, but is explicitly broken in the
ultraviolet regime. Our specific model focuses on the boost subgroup that
supports CPT invariance and results in a momentum-dependent dispersion
relation. Motivated by previous studies on spacetimes emerging from a
microscopic substrate, we show how these modifications naturally lead to
momentum-dependent rainbow metrics. Firstly, we investigate the possibility of
reproducing cosmological particle production in spacetimes emerging from real
Bose gases. We have studied the influence of non-perturbative ultraviolet
corrections in time-dependent analogue spacetimes, leading to
momentum-dependent emergent rainbow spacetimes. Within certain limits the
analogy is sufficiently good to simulate relativistic quantum field theory in
time-dependent classical

- https://sci-hub.ru/https://doi.org/10.1109/EMBC.2012.6346760

- https://sci-hub.ru/https://doi.org/10.1109/RTEICT.2017.8256758

- https://sci-hub.ru/https://doi.org/10.1109/IEMBS.2002.1053320



- Upload and Summarize PDF

Yoga for depression: The research evidence

In [None]:
# Install necessary libraries
!pip install PyPDF2 transformers

# Import required libraries
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
import PyPDF2
from google.colab import files

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned Pegasus model and tokenizer
output_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"
tokenizer = PegasusTokenizer.from_pretrained(output_dir)
model = PegasusForConditionalGeneration.from_pretrained(output_dir)
model.to(device)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to summarize text using Pegasus
def summarize_text(text, max_input_length=512, max_summary_length=128):
    # Tokenize input text
    inputs = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_summary_length,
        num_beams=5,
        length_penalty=2.0,
        early_stopping=True
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Upload the PDF
uploaded_file = files.upload()

# Extract text from the uploaded PDF
pdf_path = list(uploaded_file.keys())[0]
pdf_text = extract_text_from_pdf(pdf_path)
print("\nExtracted Text:\n", pdf_text[:500], "...")  # Print the first 500 characters of the text

# Summarize the extracted text
summary = summarize_text(pdf_text)
print("\nGenerated Summary:\n", summary)

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


Saving 1-s2.0-S0165032705002570-main.pdf to 1-s2.0-S0165032705002570-main.pdf

Extracted Text:
 Review
Yoga for depression: The research evidence
Karen Pilkingtona,b,*, Graham Kirkwooda,1, Hagen Rampesc, Janet Richardsona,d
aResearch Council for Complementary Medicine, London, UK
bSchool of Integrated Health, University of Westminster, 115 New Cavendish Street, London W1W 6UW, UK
cBarnet, Enfield and Haringey Mental Health NHS Trust, Northwest Community Mental Health Team, Edgware, Middlesex, UK
dHealth and Social Work, University of Plymouth and Research Council for Complementary Medicine ...

Generated Summary:
 Systematic review of research evidence on the effectiveness of yoga for the treatment of depression


AI ethics in computational psychiatry: From the neuroscience of
consciousness to the ethics of consciousness

In [None]:
# Install necessary libraries
!pip install PyPDF2 transformers

# Import required libraries
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
import PyPDF2
from google.colab import files

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned Pegasus model and tokenizer
output_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"
tokenizer = PegasusTokenizer.from_pretrained(output_dir)
model = PegasusForConditionalGeneration.from_pretrained(output_dir)
model.to(device)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to summarize text using Pegasus
def summarize_text(text, max_input_length=512, max_summary_length=128):
    # Tokenize input text
    inputs = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_summary_length,
        num_beams=5,
        length_penalty=2.0,
        early_stopping=True
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Upload the PDF
uploaded_file = files.upload()

# Extract text from the uploaded PDF
pdf_path = list(uploaded_file.keys())[0]
pdf_text = extract_text_from_pdf(pdf_path)
print("\nExtracted Text:\n", pdf_text[:500], "...")  # Print the first 500 characters of the text

# Summarize the extracted text
summary = summarize_text(pdf_text)
print("\nGenerated Summary:\n", summary)



Saving 1-s2.0-S0166432821005921-main.pdf to 1-s2.0-S0166432821005921-main.pdf

Extracted Text:
 Behavioural Brain Research 420 (2022) 113704
Available online 4 December 2021
0166-4328/© 2021 The Authors. Published by Elsevier B.V. This is an open access article under the CC BY license ( http://creativecommons.org/licenses/by/4.0/ ).AI ethics in computational psychiatry: From the neuroscience of 
consciousness to the ethics of consciousness 
Wanja Wiesea,*, Karl J. Fristonb 
aInstitute of Philosophy II, Ruhr University Bochum, Universit atsstra ße 150, 44780 Bochum, Germany 
bWellcome Cent ...

Generated Summary:
 Ethical considerations from AI ethics in computational psychiatry


A polymerase chain reaction experiment using Escherichia coli and Mars
sand simulant for detection and analysis of extraterrestrial life

In [None]:
# Install necessary libraries
!pip install PyPDF2 transformers

# Import required libraries
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
import PyPDF2
from google.colab import files

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned Pegasus model and tokenizer
output_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"
tokenizer = PegasusTokenizer.from_pretrained(output_dir)
model = PegasusForConditionalGeneration.from_pretrained(output_dir)
model.to(device)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to summarize text using Pegasus
def summarize_text(text, max_input_length=512, max_summary_length=128):
    # Tokenize input text
    inputs = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_summary_length,
        num_beams=5,
        length_penalty=2.0,
        early_stopping=True
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Upload the PDF
uploaded_file = files.upload()

# Extract text from the uploaded PDF
pdf_path = list(uploaded_file.keys())[0]
pdf_text = extract_text_from_pdf(pdf_path)
print("\nExtracted Text:\n", pdf_text[:500], "...")  # Print the first 500 characters of the text

# Summarize the extracted text
summary = summarize_text(pdf_text)
print("\nGenerated Summary:\n", summary)



Saving 1-s2.0-S2214552424000610-main.pdf to 1-s2.0-S2214552424000610-main.pdf

Extracted Text:
 Life Sciences in Space Research 42 (2024) 84–90
Available online 23 May 2024
2214-5524/© 2024 The Committee on Space Research (COSPAR). Published by Elsevier B.V. This is an open access article under the CC BY-NC-ND license
(http://creativecommons.org/licenses/by-nc-nd/4.0/ ).A polymerase chain reaction experiment using Escherichia coli and Mars 
sand simulant for detection and analysis of extraterrestrial life 
Keigo Enyaa,b,*, Satoshi Sasakic, Taiki Kuniedac 
aInstitute of Space & Astronautica ...

Generated Summary:
 A chain reaction experiment using Escherichia coli and Mars sand simulant for detection and analysis of extraterrestrial life


# Code for Deployment Readiness

- Export the Model

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Define the model directory where the fine-tuned model is saved
fine_tuned_model_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/fine_tuned_pegasus"

# Define the directory to save the deployment-ready model and tokenizer
deployment_model_dir = "/content/drive/MyDrive/Scientific Abstract Summarization/deployment_model"

# Load the fine-tuned model and tokenizer
print("Loading the fine-tuned model and tokenizer...")
tokenizer = PegasusTokenizer.from_pretrained(fine_tuned_model_dir)
model = PegasusForConditionalGeneration.from_pretrained(fine_tuned_model_dir)

# Save the model and tokenizer in the Hugging Face format for deployment
print(f"Saving the deployment-ready model to: {deployment_model_dir}")
model.save_pretrained(deployment_model_dir)
tokenizer.save_pretrained(deployment_model_dir)

print(f"Model and tokenizer successfully saved to {deployment_model_dir}")

Loading the fine-tuned model and tokenizer...
Saving the deployment-ready model to: /content/drive/MyDrive/Scientific Abstract Summarization/deployment_model
Model and tokenizer successfully saved to /content/drive/MyDrive/Scientific Abstract Summarization/deployment_model
