## Loading Libraries

In [1]:
import pandas as pd
import numpy as np

# # Setting options to display the full DataFrame content
# pd.set_option('display.max_columns', None)  # Shows all columns
# pd.set_option('display.max_rows', None)     # Shows all rows
# pd.set_option('display.max_colwidth', None) # Shows full width of showing columns
# pd.set_option('display.width', None)        # Auto-detects the width of the terminal

In [2]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m0.9/1.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=15e82e680600ccab06d44f8d795574da84fb4bc64d7d9e52d756f7e84dbedecc
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [4]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


## Data Frame Generation

### Collecting Data Paths

In [None]:
base_path = '/content/drive/Othercomputers/My Laptop/NEU/Fall_23/NLP/Project/Data/scisummnet_release1.1__20190413/top1000_complete'

In [None]:
import os

file_paths = []
for folder_name in os.listdir(base_path):
    xml_folder_path = os.path.join(base_path, folder_name, 'Documents_xml')
    xml_file_path = os.path.join(xml_folder_path, folder_name + '.xml')
    if os.path.exists(xml_file_path):
        file_paths.append(xml_file_path)

In [None]:
summary_paths = []
for folder_name in os.listdir(base_path):
    sum_folder_path = os.path.join(base_path, folder_name, 'summary')
    sum_file_path = os.path.join(sum_folder_path, folder_name + '.gold'+'.txt')
    if os.path.exists(sum_file_path):
        summary_paths.append(sum_file_path)

### Loading data

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

def process_summary(file_path):
    with open(file_path, 'r') as file:
        content = [line.strip() for line in file.readlines()[1:]]
    return ' '.join(content)

# Function to process an individual XML file
def process_xml(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Extract the paper title (assuming it's the first <S> tag)
    title = root.find('.//S').text

    # Initialize a string to hold the rest of the paper content
    content = ''

    # Extract the text from all <S> tags except the first one (which is the title)
    for s_tag in root.findall('.//S')[1:]:
        text = s_tag.text
        if text:
            content += text + ' '  # Adding a space for separation between sections

    return title, content

In [None]:
# Process each file and store the results in a DataFrame
file_paths = sorted(file_paths)
summary_paths = sorted(summary_paths)
data = [process_xml(file_path) for file_path in file_paths]
summary_data = [process_summary(file_path) for file_path in summary_paths]
df = pd.DataFrame(data, columns=['title', 'content'])
df['summary'] = summary_data

In [None]:
# df.to_csv('Summarizer_Data_Final.csv', index=False)

- this code works but it takes a lot of time because parsing and going through different layers of file location is required, so created the above cell to store the final df as csv so we can use it on the go rather than running that again and again and wasting time

## Pegasus Fine Tuned

- if you dont have the csv file run above chunk or just run the df_generator file and don't forget to rename the file location that you are about to save

In [5]:
df = pd.read_csv("/content/drive/Othercomputers/My Laptop/NEU/Fall_23/NLP/Project/Data/Summarizer_Data-Final.csv", error_bad_lines=False)



  df = pd.read_csv("/content/drive/Othercomputers/My Laptop/NEU/Fall_23/NLP/Project/Data/Summarizer_Data-Final.csv", error_bad_lines=False)


### Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_summary, val_summary = train_test_split(
    df['content'], df['summary'], test_size=0.05)  # 10% for validation

### Tokenization

* our main aim here is to convert words to tokens so that we can feed it into model and for that we are using pegasus transformer tokenizer which already has pre-existing corpus and has its own weights so we will be just fine tuning that

* **train_texts.tolist()** and **val_texts.tolist()** convert the pandas series to lists, which is the required format for the tokenizer.
* **truncation=True** ensures that if a text is longer than the maximum length the model can handle (max_length), it will be truncated to fit.
* **padding=True** ensures that all tokenized outputs are padded to have the same length, which is necessary for batch processing.
* **max_length=512** sets the maximum number of tokens. This is a typical choice for transformer models, balancing detail with computational efficiency.

In [7]:
from transformers import PegasusTokenizer

model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

In [8]:
train_texts = train_texts.fillna("").tolist()
train_summary = train_summary.fillna("").tolist()
val_texts = val_texts.fillna("").tolist()
val_summary = val_summary.fillna("").tolist()

In [9]:
train_encodings = tokenizer(train_texts, truncation=True, padding='longest', max_length=1024, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding='longest', max_length=1024, return_tensors="pt")
train_labels = tokenizer(train_summary, truncation=True, padding='longest', max_length=256, return_tensors="pt")
val_labels = tokenizer(val_summary, truncation=True, padding='longest', max_length=256, return_tensors="pt")

- The specific nature of our research papers data and their summaries, when setting the max_length parameter during tokenization.
- Research papers are typically much longer than the texts models like Pegasus are usually trained on. (usually we set as 512 but here we will be considering 1024)
- Similarly, summaries of research papers can also be lengthier than typical summary lengths. (usually we set as 128 but here we will be considering 256 or 512)

### Custom Dataset Creation

- A custom class SummaryDataset is defined, which inherits from **torch.utils.data.Dataset**. because this is a standard way in PyTorch to create a dataset that can be used with data loaders for model training.

- __init__()  is the constructor of the SummaryDataset class. It initializes the dataset with encodings and labels.

- The __getitem__ method is a required method for any subclass of torch.utils.data.Dataset. It defines how to access a single item from the dataset.
idx is the index of the item to retrieve.
- For each item, the method extracts the input encodings and the corresponding labels, converting them into PyTorch tensors (which are the standard data structure used in PyTorch for inputs and outputs).
- The input encodings (self.encodings) are a dictionary where keys are types of encoding like input_ids, attention_mask, etc., and values are lists of encoded tokens. This method retrieves the appropriate encoding for the given index (idx) and packs it into a new dictionary item.
- The labels are similarly extracted from self.labels['input_ids'] (the token ids representing the summary) for the given index and added to the item dictionary under the key 'labels'.

In [10]:
import torch

class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SummaryDataset(train_encodings, train_labels)
val_dataset = SummaryDataset(val_encodings, val_labels)

### Loading Pre-Trained Model

- PegasusForConditionalGeneration is a specific class within the transformers library designed for sequence-to-sequence tasks, which include tasks like summarization where the goal is to generate a sequence (summary) based on another sequence (document).

In [11]:
from transformers import PegasusForConditionalGeneration

model_fine_tuned = PegasusForConditionalGeneration.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

### Fine Tune

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=600,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)
device = torch.device("cpu")
model_fine_tuned.to(device)
trainer = Trainer(
    model=model_fine_tuned,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()


Step,Training Loss
10,4.9374
20,5.2186
30,5.4165
40,5.3945
50,5.1318
60,4.9922
70,4.7494
80,5.8087
90,4.7404
100,4.8108


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()


TrainOutput(global_step=1200, training_loss=1.8156745779886843, metrics={'train_runtime': 419.7173, 'train_samples_per_second': 2.859, 'train_steps_per_second': 2.859, 'total_flos': 3467357297049600.0, 'train_loss': 1.8156745779886843, 'epoch': 600.0})

In [19]:
# import pickle
# import torch
# torch.save(model_fine_tuned.state_dict(), '/content/drive/MyDrive/Project Data/model_fine_tuned.pth')
# with open('tokenizer.pkl', 'wb') as file:
#     pickle.dump(tokenizer, file)

### Evaluate Model

In [13]:
results = trainer.evaluate()
print(results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()


{'eval_loss': 1.3807437419891357, 'eval_runtime': 0.189, 'eval_samples_per_second': 10.584, 'eval_steps_per_second': 10.584, 'epoch': 600.0}


In [14]:
def generate_summaries(model, tokenizer, texts, device):
    model.to(device)
    summaries = []

    for text in texts:
        encoded_input = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        generated_ids = model.generate(encoded_input['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
        summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    return summaries

In [15]:
reference_summaries = [tokenizer.decode(labels, skip_special_tokens=True) for labels in val_labels['input_ids']]
generated_summaries = generate_summaries(model_fine_tuned, tokenizer, val_texts, device)

In [16]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(ref, gen) for ref, gen in zip(reference_summaries, generated_summaries)]

In [17]:
average_scores = {
    'rouge1': np.mean([score['rouge1'].fmeasure for score in rouge_scores]),
    'rouge2': np.mean([score['rouge2'].fmeasure for score in rouge_scores]),
    'rougeL': np.mean([score['rougeL'].fmeasure for score in rouge_scores])
}

print("Average ROUGE Scores:", average_scores)

Average ROUGE Scores: {'rouge1': 0.580995905251507, 'rouge2': 0.4471148563045786, 'rougeL': 0.5011477431583253}


- This is just a working example just to confirm whether the summarization capabilities of our model
- A working front-end will be deployed where you can test it out.......

In [18]:
texts_to_summarize = [
    "Climate change refers to significant, long-term changes in the global climate. The global climate is a connected system that is always in motion, and it is being affected by human activities. One of the most noticeable effects of climate change in the past century has been the increase in temperature around the world. The average global temperature has increased by about 1.1 to 1.2 degrees Celsius since 1900. This change has led to a wide range of impacts on the environment, ecosystems, and human societies. One of the primary causes of climate change is the release of greenhouse gases into the Earth's atmosphere. These gases, such as carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), trap heat from the sun, leading to a warming effect known as the greenhouse effect. The majority of these emissions come from human activities, including the burning of fossil fuels for energy, deforestation, and industrial processes. The consequences of climate change are far-reaching and diverse. One of the most critical impacts is the rise in sea levels caused by the melting of polar ice caps and glaciers, as well as the expansion of seawater as it warms. This rise in sea levels poses a significant threat to coastal communities and islands. Additionally, climate change has been linked to more frequent and severe weather events, such as hurricanes, droughts, heatwaves, and heavy rainfall. Ecosystems are also being affected by climate change. Shifts in temperature and weather patterns can disrupt the natural habitats of many species, leading to changes in biodiversity. Some species may become extinct if they cannot adapt quickly enough to these changes. Furthermore, climate change can exacerbate existing environmental problems, such as habitat destruction and pollution, making it even harder for ecosystems to maintain their balance. The impacts of climate change extend to human societies as well. These impacts include threats to food and water supplies, increased risks to health, economic consequences, and potential displacement of populations. For instance, changes in precipitation patterns and temperature can affect crop yields, leading to food shortages and increased prices. Warmer temperatures can also contribute to the spread of diseases. Addressing climate change requires coordinated global action. This includes reducing greenhouse gas emissions, transitioning to renewable energy sources, and protecting and restoring forests. Additionally, societies need to adapt to the changes that are already underway. This involves building resilient infrastructure, developing sustainable agricultural practices, and planning for potential climate-related disasters. In conclusion, climate change is a complex and urgent issue that impacts the entire planet. It demands immediate and sustained action to mitigate its effects and safeguard the future of the environment and human societies."
]
inputs = tokenizer(texts_to_summarize, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
# Move inputs to the same device as the model
inputs = inputs.to(model_fine_tuned.device)

# Generate summaries
summary_ids = model_fine_tuned.generate(inputs['input_ids'], max_length=256, num_beams=4, early_stopping=True)

# Decode generated summaries back to text
summaries = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
for i, summary in enumerate(summaries):
    print(f"Summary {i+1}:\n{summary}\n")


Summary 1:
One of the most noticeable effects of climate change in the past century has been the increase in temperature around the world. The average global temperature has increased by about 1.1 to 1.2 degrees Celsius since 1900. This change has led to a wide range of impacts on the environment, ecosystems, and human societies. One of the primary causes of climate change is the release of greenhouse gases into the Earth's atmosphere. The majority of these emissions come from human activities, including the burning of fossil fuels for energy, deforestation, and industrial processes. One of the most critical impacts is the rise in sea levels caused by the melting of polar ice caps and glaciers, as well as the expansion of seawater as it warms. Shifts in temperature and weather patterns can disrupt the natural habitats of many species, leading to changes in biodiversity. The impacts of climate change extend to human societies as well. For instance, changes in precipitation patterns and 