In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
submission_df = pd.read_csv('/kaggle/input/document-summarization-llms/submission_format.csv')
test_df = pd.read_csv('/kaggle/input/document-summarization-llms/test_features.csv')
train_df = pd.read_csv('/kaggle/input/document-summarization-llms/train.csv')

In [3]:
submission_df.head()

Unnamed: 0,paper_id,summary
0,1000,my very accurate summary
1,1001,my very accurate summary
2,1002,my very accurate summary
3,1003,my very accurate summary
4,1004,my very accurate summary


In [4]:
train_df.head()

Unnamed: 0,paper_id,text,summary
0,0,## FROM SOVEREIGNTY TO EXTRATERRITORIAL CONSCI...,"In this article, Victor Fan argues that analys..."
1,1,## 1. Introduction\n\n\nAn Electronic Health R...,Problem definition: Physicians spend more than...
2,2,## Introduction\n\n\nTranslation plays an i...,Literary translation is one of the most challe...
3,3,## 1 Problem Setup\n\n\nRecent political scien...,There is a long-running debate on evaluating f...
4,4,## INTRODUCTION\n\n\nThis article investigat...,"Recently, ‘bimajyo’ (美魔女) came into focus in J..."


In [5]:
train_df.text[0][:1000]

"## FROM SOVEREIGNTY TO EXTRATERRITORIAL CONSCIOUSNESS\n\n\nSince 1997, the concept of extraterritoriality has been configured in the political tension between Hong Kong and Beijing. From the perspective of the Central Government, it is fundamental for the people of China to shijian zhuquan instantiate its sovereignty over Hong Kong. But while most Hong / Kong residents insist on interpreting this concept in terms of the Euro-American notion of selfdetermination  (zizhu / making decisions  for  oneself),  the  Beijing  government  believes  that  the Hong Kong legislature must make decisions in conformation to the larger will of the people, which the Party represents, a concept taken from the writings of Lenin and Stalin (Gao 2010: 26-30). This tension is crystalised in the long debate about Article 23 of the Hong Kong Basic Law, which requires  the  SAR  to  'enact  laws  on  its  own  to  prohibit  any  act  of  treason,  secession,  sedition, subversion  against  the  Central  Peopl

In [6]:
train_df.summary[0]

'In this article, Victor Fan argues that analysing contemporary Hong Kong cinema requires active rewriting of established postcolonial theories by taking into account the specific mode of colonisation of Hong Kong: extraterritoriality. This concept has been responsible for the construction of the cultural plurality, linguistic ambiguity, and political liminality of Hong Kong and its cinematographic experience, as well as the incongruence between the community’s political consciousness after 1997 and the larger national imagination promulgated by the Beijing government. The term ‘extraterritoriality’ was translated into Chinese after 1895 via Japanese as zhiwai faquan the right to exercise one’s law outside a nation state’s sovereign terrain, and colonialism in China between 1844 and 1949 was largely characterized by a continuous reformulation and systematisation of this concept. It in fact still informs the way former colonised regions in China are administered today, and the political

In [7]:
test_df.head()

Unnamed: 0,paper_id,text
0,1000,## Introduction\n\n\nGender disparities persis...
1,1001,## Introduction\n\n\nOne of humanity’s greates...
2,1002,## Introduction\n\n\nHow do workers get attach...
3,1003,## BETWEEN INDEXES AND SYMBOLS: AN EXPRESSION ...
4,1004,## The Evolution of Environmental and Climate ...


In [8]:
len(train_df)

1000

In [9]:
len(test_df)

345

In [10]:
%%capture
!pip install transformers rouge_score

In [16]:
import pandas as pd
from rouge_score import rouge_scorer
import markdown
import torch
from bs4 import BeautifulSoup
from tqdm import tqdm

In [17]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [18]:
from transformers import BartTokenizer, BartForConditionalGeneration, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
import torch.nn as nn
import numpy as np
import time
import datetime
import torch

# Custom Dataset class
class SummaryDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=1024):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        summary = str(self.summaries[idx])
        
        text_encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        summary_encoding = self.tokenizer(
            summary,
            max_length=150,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        labels = summary_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return {
            'input_ids': text_encoding['input_ids'].flatten(),
            'attention_mask': text_encoding['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

# Initialize tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)

# Create dataset and split into train/val
dataset = SummaryDataset(train_df['text'].values, train_df['summary'].values, tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
batch_size = 4  # Reduce batch size if you get CUDA out of memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Training setup
epochs = 24
optimizer = AdamW(model.parameters(), lr=3e-5)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Early stopping setup
early_stopping_patience = 3
early_stopping_counter = 0
best_val_loss = float('inf')

# Training loop
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)
    
    # Training
    model.train()
    total_train_loss = 0
    
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Training loss: {avg_train_loss}')
    
    # Validation
    model.eval()
    total_val_loss = 0
    
    for batch in tqdm(val_loader, desc="Validation"):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_val_loss += loss.item()
    
    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Validation loss: {avg_val_loss}')
    
    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        early_stopping_counter = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pt')
        print("Saved best model")
    else:
        early_stopping_counter += 1
        print(f"Early stopping counter: {early_stopping_counter}/{early_stopping_patience}")
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Epoch 1/24
----------


Training: 100%|██████████| 200/200 [04:04<00:00,  1.22s/it]


Training loss: 2.7905342614650728


Validation: 100%|██████████| 50/50 [00:28<00:00,  1.78it/s]


Validation loss: 2.5261825799942015
Saved best model
Epoch 2/24
----------


Training: 100%|██████████| 200/200 [04:03<00:00,  1.22s/it]


Training loss: 2.2231368523836137


Validation: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Validation loss: 2.538730778694153
Early stopping counter: 1/3
Epoch 3/24
----------


Training: 100%|██████████| 200/200 [04:04<00:00,  1.22s/it]


Training loss: 1.7938468545675277


Validation: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Validation loss: 2.6490711092948915
Early stopping counter: 2/3
Epoch 4/24
----------


Training: 100%|██████████| 200/200 [04:04<00:00,  1.22s/it]


Training loss: 1.421841774880886


Validation: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]
  model.load_state_dict(torch.load('best_model.pt'))


Validation loss: 2.815855100154877
Early stopping counter: 3/3
Early stopping triggered


<All keys matched successfully>

In [19]:
# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True, padding=True).to(device)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, 
                               length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summaries for test set
test_summaries = [generate_summary(text) for text in tqdm(test_df['text'], desc="Generating summaries")]

# Prepare submission
submission_df = pd.DataFrame({
    'paper_id': test_df['paper_id'],
    'summary': test_summaries
})

Generating summaries: 100%|██████████| 345/345 [10:17<00:00,  1.79s/it]


In [20]:
# Save to CSV
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission file saved as 'submission.csv'")

Submission file saved as 'submission.csv'


In [21]:
submission_df.head()

Unnamed: 0,paper_id,summary
0,1000,Gender disparities persist in how men and wome...
1,1001,Climate change mitigation cannot occur without...
2,1002,Employee testimonials are personal narratives ...
3,1003,Despite the importance of the concept of “comm...
4,1004,This study explores how childbirth is associat...


In [22]:
submission_df.summary[0]

'Gender disparities persist in how men and women divide paid work, housework and childcare responsibilities. Despite rising levels of education and labor force participation among women, they continue to perform a larger portion of paid work and childcare compared to their male partners. This gendered division of labor remains prevalent, even as traditional views on gender roles have declined in postindustrial economies (Grunow et al., 2018). While women have increasingly entered the workforce in the past decades, this shift has not been met with a corresponding increase in men’s contribution to household tasks, particularly in childcare. Scholars have reviewed theoretical approaches to explain these continuing disparities. Using the concept of gender ideology as an analytical heuristic tool, we propose a theoretical model to explore support'

In [23]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from pathlib import Path

class SummaryGenerator:
    def __init__(self, model_path="best_model.pt"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
        self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(self.device)
        
        # Load your trained weights
        if Path(model_path).exists():
            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
            print(f"Loaded trained model weights from {model_path}")
        else:
            print("Using pretrained weights (no fine-tuning)")
        
        self.model.eval()
    
    def summarize(self, text, max_length=150, min_length=40):
        """
        Generate a summary for the input text
        
        Args:
            text (str): Input text to summarize
            max_length (int): Maximum length of summary
            min_length (int): Minimum length of summary
            
        Returns:
            str: Generated summary
        """
        inputs = self.tokenizer(
            text,
            max_length=1024,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        ).to(self.device)
        
        summary_ids = self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage
if __name__ == "__main__":
    # Initialize the summarizer
    summarizer = SummaryGenerator()
    
    # Example input text (replace with your own)
    input_text = """
    Artificial intelligence (AI) is transforming industries across the globe. 
    From healthcare to finance, AI applications are enabling faster and more accurate decision-making. 
    In healthcare, AI algorithms can analyze medical images to detect diseases earlier than human doctors. 
    Financial institutions use AI for fraud detection and algorithmic trading. 
    Despite these advances, ethical concerns about AI bias and job displacement remain significant challenges.
    """
    
    # Generate and print the summary
    summary = summarizer.summarize(input_text)
    print("\nGenerated Summary:")
    print(summary)

  self.model.load_state_dict(torch.load(model_path, map_location=self.device))


Loaded trained model weights from best_model.pt

Generated Summary:
Artificial intelligence (AI) is transforming industries across the globe. From healthcare to finance, AI applications are enabling faster and more accurate decision-making. Despite these advances, ethical concerns about AI bias and job displacement remain significant challenges.
