# CompScholar Research Paper Summarization
## Complete Pipeline with Hybrid Section Markers


In [1]:
!pip install transformers datasets evaluate rouge-score nltk sentencepiece

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=61d402f5142ab44fee2da7002da7e4d23a209701570119d88a4719b8ab71bb3a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.3 rouge-score-0.1.2


In [2]:
import re
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Step 1: Data Loading & Preprocessing
Enhanced cleaning and section marking

In [3]:
def preprocess_compscholar(df):
    """Full preprocessing pipeline for CompScholar data"""
    # Handle missing values
    df = df.fillna('')
    
    # Clean text components
    def clean_text(text):
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = re.sub(r'[^\w\s.,;:!?()-]', '', text)  # Remove special chars
        return text.strip()
    
    # Build structured document with section markers
    structured_docs = []
    for _, row in df.iterrows():
        sections = [
            f"<title>{clean_text(row['Paper Title'])}</title>",
            f"<keywords>{clean_text(row['Key Words'])}</keywords>",
            f"<abstract>{clean_text(row['Abstract'])}</abstract>",
            f"<conclusion>{clean_text(row['Conclusion'])}</conclusion>"
        ]
        structured_docs.append('\n\n'.join(sections))
    
    df['structured_text'] = structured_docs
    df['clean_summary'] = df['Summary'].apply(clean_text)
    return df

# Load and preprocess dataset
full_df = pd.read_csv('/kaggle/input/compscholar/Brain Dead CompScholar Dataset.csv')
processed_df = preprocess_compscholar(full_df)

# Train/Val/Test split (80/10/10)
train_df = processed_df.sample(frac=0.8, random_state=42)
val_test_df = processed_df.drop(train_df.index)
val_df = val_test_df.sample(frac=0.5, random_state=42)
test_df = val_test_df.drop(val_df.index)

# Create Hugging Face datasets
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df),
    'test': Dataset.from_pandas(test_df)
})

## Step 2: Tokenization with Section Awareness
Special tokens for document structure

In [4]:
model_name = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add special tokens for section markers
new_tokens = ['<title>', '</title>', '<keywords>', '</keywords>', 
              '<abstract>', '</abstract>', '<conclusion>', '</conclusion>']
tokenizer.add_tokens(new_tokens)

max_input_length = 1024
max_target_length = 256

def tokenize_function(examples):
    """Tokenization with section structure preservation"""
    inputs = [text for text in examples['structured_text']]
    targets = [text for text in examples['clean_summary']]
    
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True
    )
    
    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
        padding='max_length'
    )
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/297 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

## Step 3: Model Initialization
 BART model with extended vocabulary

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))  # For new section tokens

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


BartScaledWordEmbedding(50273, 1024, padding_idx=1)

## Step 4: Training Configuration
Optimized for small dataset

In [6]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,  # More epochs for small dataset
    predict_with_generate=True,
    fp16=True,
    logging_steps=100,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    report_to='none'
)

# Initialize metrics
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

def compute_metrics(eval_pred):
    """Enhanced metric calculation with length penalty"""
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE with stemming
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True
    )
    
    # BLEU with smoothing
    bleu_result = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],
        max_order=4,
        smooth=True
    )
    
    return {
        'rouge1': round(rouge_result['rouge1'], 3),
        'rouge2': round(rouge_result['rouge2'], 3),
        'rougeL': round(rouge_result['rougeL'], 3),
        'bleu': round(bleu_result['bleu'], 3)
    }





Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

## Step 5: Model Training
With early stopping

In [7]:

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu
1,No log,1.068682,0.61,0.352,0.446,0.274
2,No log,0.521526,0.609,0.348,0.442,0.288
3,1.928700,0.516122,0.616,0.363,0.452,0.29
4,1.928700,0.532636,0.597,0.339,0.431,0.273
5,1.928700,0.621746,0.615,0.359,0.453,0.29
6,0.199500,0.671185,0.623,0.365,0.46,0.288
7,0.199500,0.743546,0.618,0.364,0.452,0.296
8,0.066500,0.784812,0.603,0.352,0.438,0.284
9,0.066500,0.849448,0.619,0.361,0.448,0.287
10,0.066500,0.838682,0.61,0.349,0.437,0.279


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=555, training_loss=0.40411410621694616, metrics={'train_runtime': 2347.6778, 'train_samples_per_second': 1.898, 'train_steps_per_second': 0.236, 'total_flos': 9409568182370304.0, 'train_loss': 0.40411410621694616, 'epoch': 14.613333333333333})

## Step 6: Evaluation & Results

In [8]:

test_results = trainer.evaluate(tokenized_dataset['test'])
print("\nFinal Test Results:")
print(f"ROUGE-1: {test_results['eval_rouge1']:.3f}")
print(f"ROUGE-2: {test_results['eval_rouge2']:.3f}")
print(f"ROUGE-L: {test_results['eval_rougeL']:.3f}")
print(f"BLEU: {test_results['eval_bleu']:.3f}")

def generate_summary(text):
    """Production-ready inference function"""
    inputs = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    ).to(model.device)
    
    summary_ids = model.generate(
        inputs.input_ids,
        num_beams=6,
        length_penalty=1.2,
        max_length=max_target_length,
        early_stopping=True
    )
    
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage
sample = dataset['test'][0]
print("\nInput Document:")
print(sample['structured_text'][:500] + "...")
print("\nGenerated Summary:")
print(generate_summary(sample['structured_text']))
print("\nReference Summary:")
print(sample['clean_summary'])

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



Final Test Results:
ROUGE-1: 0.619
ROUGE-2: 0.351
ROUGE-L: 0.432
BLEU: 0.291

Input Document:
<title>Cardiovascular Disease and Risk Factors in Asia A Selected Review</title>

<keywords>Cardiovascular disease prevention, Asia, stroke, coronary heart disease, risk factors, hypertension, salt intake, smoking.</keywords>

<abstract>Cardiovascular disease (CVD) prevention in Asia is an important issue for world health, because half of the worlds population lives in Asia. Asian countries and regions such as Japan, the Republic of Korea, the Peoples Republic of China, Hong Kong, Taiwan, and th...

Generated Summary:
Half of the world population lives in Asia, and Asian countries and regions have greater mortality and morbidity from stroke than from coronary heart disease (CHD). This is true even in Western countries, such as Japan and South Korea. The Seven Countries Study found that Japanese populations had lower fat intake, lower serum total cholesterol, and lower CHD than populations in 