In [16]:
# !pip install faiss-cpu sentence-transformers transformers datasets rouge-score nltk kagglehub
# !python -m nltk.downloader punkt
# !pip install rouge --upgrade
# !pip install PyPDF2 --upgrade
# !pip install bert_score --upgrade

In [3]:
# !pip install chromadb --upgrade
# !pip install markdown

**LIBRARIES**

In [10]:
import kagglehub
import json
import os
import re
import numpy as np
from tqdm import tqdm
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from typing import List, Dict, Tuple
from datasets import load_dataset
import torch
from nltk.tokenize import sent_tokenize
import nltk
from collections import defaultdict
import random
import PyPDF2  # Added for PDF support
from bert_score import score  # Added for advanced evaluation
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Fetch the 'punkt_tab' tokenizer resource in addition to 'punkt' (downloaded in the imports cell).
# NOTE(review): presumably needed because the installed NLTK's sent_tokenize looks up
# 'punkt_tab' rather than 'punkt' -- confirm against the NLTK version in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
# Hugging Face token.
# Do not hardcode the secret in the notebook (it would be saved and shared with the file),
# and do not overwrite a token that is already configured in the environment.
# When HF_TOKEN is missing, prompt for it interactively so it never lands in the cell source.
if not os.environ.get('HF_TOKEN'):
    from getpass import getpass
    os.environ['HF_TOKEN'] = getpass('Hugging Face token: ')

**TASKS**

*   DOCUMENT INGESTION
*   EMBEDDING AND RETRIEVAL
*   SUMMARY GENERATION
*   OUTPUT / DISPLAY








In [15]:
class EnhancedDocumentSummarizer:
    # initialize with models + vector db
    def __init__(self, device: str = None):
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        print("Loading models...")  # loading required models
        self.embedding_model = SentenceTransformer('all-mpnet-base-v2', device=self.device)

        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
        self.summarizer = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(self.device)

        # Vector database & storage initialized
        self.vector_db = None
        self.chunks = []
        self.documents = []
        print(" <<<  Models loaded successfully  >>>")

    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>
    # DOCUMENT LOADING
    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>

    def load_arxiv_data(self, max_docs: int = 1000) -> Dict:
        # Loadin the arxiv dataset from kaggle
        # & fallback - hugging face
        try:
            print("")
            print("<<<  Downloading arXiv dataset from Kaggle...  >>>")

            dataset_dir = kagglehub.dataset_download("Cornell-University/arxiv")
            json_file = os.path.join(dataset_dir, "arxiv-metadata-oai-snapshot.json")

            if not os.path.exists(json_file):
                raise FileNotFoundError(f"JSON file not found at {json_file}")

            print(f"==== Found arXiv dataset at: {json_file} ===")

            documents = []
            with open(json_file, 'r', encoding='utf-8') as f:
                for i, line in enumerate(tqdm(f, desc="Loading documents")):
                    if i >= max_docs:
                        break
                    try:
                        doc = json.loads(line)
                        processed = self._preprocess_document(doc)
                        if processed:
                            documents.append(processed)
                    except json.JSONDecodeError:
                        continue

            self.documents = documents
            print("")
            print(f"<<<  Loaded {len(self.documents)} documents after preprocessing  >>>")
            print("")
            return self.documents

        except Exception as e:
            # fall back on hugging face data base
            print(f"Error loading Kaggle dataset: {str(e)}")
            print("Falling back to HuggingFace dataset")
            try:
                dataset = load_dataset("Cornell-University/arxiv", split='train[:1000]')
                documents = []
                for doc in tqdm(dataset, desc="Loading fallback documents"):
                    processed = self._preprocess_document(doc)
                    if processed:
                        documents.append(processed)
                self.documents = documents
                return documents
            except Exception as fallback_error:
                print(f" !!!! Fallback failed: {str(fallback_error)}  !!!!!")
                return []

    # Load a custom document from a PDF or plain-text (.txt) file
    def load_custom_document(self, filepath: str) -> Dict:
        try:
            text = ""
            # pdf using pyPDF...
            if filepath.endswith('.pdf'):
                with open(filepath, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        text += page.extract_text()
            elif filepath.endswith('.txt'):
                # Processing plain text
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()
            else:
                raise ValueError("Unsupported file format")

            return {
                'id': os.path.basename(filepath),
                'title': os.path.basename(filepath),
                'abstract': text,
                'categories': 'custom',
                'authors': []
            }
        except Exception as e:
            print(f"!!!! Error loading custom document: {str(e)}  !!!!")
            return None

    # Clean and filter document content
    def _preprocess_document(self, doc: Dict) -> Dict:
        try:
            title = doc.get('title', '').strip()
            abstract = doc.get('abstract', '').strip()

            # Remove LaTeX math
            abstract = re.sub(r'\$.*?\$', '', abstract)
            # Remove LaTeX commands
            abstract = re.sub(r'\\[a-zA-Z]+', '', abstract)

            if len(abstract.split()) < 50:  # min length filter
                return None

            return {
                'id': doc.get('id', ''),
                'title': title,
                'abstract': abstract,
                'categories': doc.get('categories', ''),
                'authors': doc.get('authors', [])
            }
        except Exception as e:
            print(f"!!!!  Error preprocessing document: {str(e)}  !!!!")
            return None

    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>
    # SEMANTIC CHUNKING
    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>

    # Enhanced with semantic boundary detection while maintaining original structure
    def chunk_document(self, text: str, chunk_size: int = 512, overlap: int = 64) -> List[str]:
        # split doc into meaning ful chinks .. semantically
        sentences = sent_tokenize(text)

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sent_words = sentence.split()
            sent_length = len(sent_words)

            # Detect semantic boundaries (simple heuristic/Discourse markers)
            is_boundary = any(marker in sentence.lower() for marker in
                            ['however', 'in conclusion', 'furthermore', 'on the other hand'])

            if (current_length + sent_length > chunk_size and current_chunk) or is_boundary:
                chunks.append(' '.join(current_chunk))
                # maintain overlap between chunks
                overlap_words = int(overlap * 0.5)
                current_chunk = current_chunk[-overlap_words:] if overlap_words else []
                current_length = len(current_chunk)

            current_chunk.extend(sent_words)
            current_length += sent_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>
    # EMBEDDING + VECTOR STORAGE
    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>

    # Create FAISS vector database from document chunks
    def build_vector_db(self, documents: List[Dict]):
        # orig meta dat + chunks from db
        print("")
        print("<<<  Building enhanced vector database...   >>>")
        print("")

        self.chunks = []
        chunk_metadata = []

        # processing all docs into chunks
        for doc in tqdm(documents, desc="Processing documents"):
            text = f"Title: {doc['title']}\nAbstract: {doc['abstract']}"
            doc_chunks = self.chunk_document(text)

            # store it with the metadata
            for chunk in doc_chunks:
                self.chunks.append(chunk)
                chunk_metadata.append({
                    'doc_id': doc['id'],
                    'title': doc['title'],
                    'is_title': chunk.startswith("Title:"),
                    'category': doc.get('categories', '').split('.')[0]  # NEW: Added category info
                })

        # generate embeddinging in batches
        batch_size = 32
        embeddings = []

        for i in tqdm(range(0, len(self.chunks), batch_size), desc="Generating embeddings"):
            batch = self.chunks[i:i + batch_size]
            batch_embeddings = self.embedding_model.encode(
                batch,
                convert_to_tensor=True,
                show_progress_bar=False,
                device=self.device
            )
            embeddings.append(batch_embeddings.cpu().numpy())

        embeddings = np.concatenate(embeddings)

        # create FAISS Ind
        dimension = embeddings.shape[1]
        self.vector_db = faiss.IndexHNSWFlat(dimension, 32)
        self.vector_db.add(embeddings)

        self.chunk_metadata = chunk_metadata

        print(f"<<<  Vector DB created with {len(self.chunks)} chunks  >>>")

    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>
    # SEMANTIC RETRIEVAL
    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>

    # Retrieve top-k relevant chunks using semantic search
    def retrieve_relevant_chunks(self, query: str, k: int = 5, doc_id: str = None) -> List[Dict]:
        # Enhanced retrieval with category boosting
        if not self.vector_db:
            raise ValueError("!!!! Vector database not initialized !!!!")

        enhanced_query = f"Summarize this scientific paper about: {query}"
        # Enhanced query for better retrieval
        # get query embedding
        query_embedding = self.embedding_model.encode(
            [enhanced_query],
            convert_to_tensor=True,
            device=self.device
        ).cpu().numpy()

        # searching in vector db
        distances, indices = self.vector_db.search(query_embedding, k*2)

        # process results
        results = []
        seen_docs = set()

        for ix, dist in zip(indices[0], distances[0]):
            if ix < 0 or ix >= len(self.chunks):
                continue

            metadata = self.chunk_metadata[ix]
            if doc_id and metadata['doc_id'] != doc_id:
                continue

            # calculate scoring with boosts
            similarity = 1 / (1 + dist)

            # boost little chunks
            if metadata['is_title']:
                similarity *= 1.2

            # penalize duplicate docs
            if metadata['doc_id'] in seen_docs and len(seen_docs) > 1:
                similarity *= 0.9
            seen_docs.add(metadata['doc_id'])

            # bopst category matches
            if 'category' in metadata and metadata['category'] in query.lower():
                similarity *= 1.1

            results.append({
                'text': self.chunks[ix],
                'score': similarity,
                'is_title': metadata['is_title'],
                'doc_id': metadata['doc_id'],
                'doc_title': metadata['title'],
                'category': metadata.get('category', '')  # NEW: Added category info
            })

        results = sorted(results, key=lambda x: x['score'], reverse=True)[:k]
        # return top k results

        # normalize scores
        if results:
            max_score = results[0]['score']
            for r in results:
                r['score'] = r['score'] / max_score

        return results

    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>
    # SUMMARY GENERATION
    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>

    # Generate summary with RAG approach
    def generate_summary(self, document: Dict, query: str = None, focus_mode: str = "general") -> Dict:
        # Enhanced with focus modes and fallback
        # default queries for differenct focus modes
        print("")
        if not query:
            if focus_mode == "general":
                query = f"What are the key contributions and findings of this paper titled: {document['title']}?"
            elif focus_mode == "technical":
                query = "What are the technical methods and results?"
            elif focus_mode == "novelty":
                query = "What is novel about this work compared to prior research?"
            elif focus_mode == "applications":
                query = "What are the potential applications of this work?"  # NEW focus mode

        # retrieve relevant context
        retrieved = self.retrieve_relevant_chunks(query, k=5, doc_id=document['id'])

        # format context gen
        context_parts = []
        for i, r in enumerate(retrieved):
            prefix = "Title excerpt" if r['is_title'] else "Content"
            context_parts.append(f"{prefix} (Relevance: {r['score']:.2f}):\n{r['text']}\n")
        context = "\n".join(context_parts)

        # create input prompt
        input_text = (
            f"Summarize this document based on the following context:\n\n"
            f"Title: {document['title']}\n\n"
            f"Focus: {focus_mode.capitalize()}\n\n"  # Added focus mode
            f"Key Context:\n{context}\n\n"
            f"Full Abstract: {document['abstract']}\n\n"
            f"Summary:"
        )

        try:
            # generate summary with bart
            inputs = self.tokenizer(input_text, max_length=1024, truncation=True, return_tensors="pt").to(self.device)

            summary_ids = self.summarizer.generate(
                inputs.input_ids,
                num_beams=4,
                max_length=150,
                min_length=50,
                early_stopping=True,
                no_repeat_ngram_size=3
            )

            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            # cal stats
            orig_length = len(document['abstract'].split())
            summary_length = len(summary.split())
            compression_ratio = orig_length / summary_length if summary_length else 0

            return {
                'document_id': document['id'],
                'title': document['title'],
                'original_length': orig_length,
                'summary_length': summary_length,
                'compression_ratio': f"{compression_ratio:.1f}x",
                'retrieved_context': retrieved,
                'summary': summary,
                'categories': document['categories'],
                'focus_mode': focus_mode  # NEW: Track focus mode
            }
        except Exception as e:
            print("")
            print(f"!!!!  Generation failed: {str(e)}  !!!!")
            # Fallback summary --- fallback to siple truncation
            return {
                'error': str(e),
                'fallback_summary': ' '.join(document['abstract'].split()[:100]) + "... [truncated]",
                'focus_mode': focus_mode
            }

    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>
    # EVALUATION + OUTPUT
    # <<<<<<<<<<<>>>>>>>>>>>>>>>>>>

    # Enhanced with BERTScore while ALSO keeping original ROUGE
    def evaluate_summary(self, generated_summary: str, reference_summary: str) -> Dict:
        from rouge import Rouge
        rouge = Rouge()

        try:
            # ROUGE evaluation - metrics
            rouge_scores = rouge.get_scores(generated_summary, reference_summary)[0]

            # BERTScore evaluation - metrics
            p, r, f1 = score([generated_summary], [reference_summary], lang='en')

            return {
                'rouge-1': rouge_scores['rouge-1']['f'],
                'rouge-2': rouge_scores['rouge-2']['f'],
                'rouge-l': rouge_scores['rouge-l']['f'],
                'bert_score_precision': p.mean().item(),
                'bert_score_recall': r.mean().item(),
                'bert_score_f1': f1.mean().item()
            }
        except Exception as e:
            print(f"!!!! Evaluation error: {str(e)} !!!!")
            return {'error': str(e)}


def main():
    summarizer = EnhancedDocumentSummarizer()

    # Load data u
    print("\n <<<  1. Loading arXiv data from Kaggle...  >>>")
    documents = summarizer.load_arxiv_data(max_docs=1000)

    # Add custom document example (uncomment to use)
    # custom_doc = summarizer.load_custom_document("example.pdf")
    # if custom_doc:
    #   documents.append(custom_doc)

    if not documents:
        print("")
        print("!!!! Failed to load documents  !!!!")
        return

    summarizer.build_vector_db(documents)

    # Group documents by primary category
    category_map = defaultdict(list)
    for doc in documents:
        primary_cat = doc['categories'].split('.')[0]
        category_map[primary_cat].append(doc)

    # Select 5 random categories with enough documents
    selected_categories = random.sample(
        [cat for cat in category_map if len(category_map[cat]) >= 3],
        min(5, len(category_map))
    )

    print("")
    print("\n Enhanced Summaries from 5 Different Categories:")
    print("="*100 + "\n")

    print("")
    for i, category in enumerate(selected_categories, 1):
        print(f"\n <<<<  CATEGORY {i}: {category.upper()}  >>>>")
        print("-"*50)

        doc = random.choice(category_map[category])

        # Demonstrate different focus modes
        for focus in ["general", "technical", "novelty", "applications"]:
            result = summarizer.generate_summary(doc, focus_mode=focus)

            if 'error' in result:
                print(f"!!!! Error generating {focus} summary: {result['error']} !!!!")
                continue

            print(f"\n <<<<  Focus Mode: {focus.upper()}  >>>>")
            print(f"\n <<<<   Title: {result['title']}  >>>>")
            print(f"\n <<<<   Original Abstract ({result['original_length']} words):  ")
            print(doc['abstract'][:300] + ("..." if len(doc['abstract']) > 300 else ""))
            print("")
            print(f"\n <<>> Generated Summary ({result['summary_length']} words, {result['compression_ratio']} compression):")
            print(result['summary'])

            ref_sentences = sent_tokenize(doc['abstract'])
            reference_summary = ' '.join(ref_sentences[:3])

            eval_results = summarizer.evaluate_summary(result['summary'], reference_summary)
            print(f"\n <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>")
            print(f"ROUGE-1: {eval_results.get('rouge-1', 'N/A'):.3f}")
            print(f"ROUGE-2: {eval_results.get('rouge-2', 'N/A'):.3f}")
            print(f"ROUGE-L: {eval_results.get('rouge-l', 'N/A'):.3f}")
            print(f"BERTScore F1: {eval_results.get('bert_score_f1', 'N/A'):.3f}")

            print("\n" + "-"*50)

        print("\n" + "="*100 + "\n")


# Run the demo only when this code is executed directly (script or notebook cell),
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Using device: cuda
Loading models...
 <<<  Models loaded successfully  >>>

 <<<  1. Loading arXiv data from Kaggle...  >>>

<<<  Downloading arXiv dataset from Kaggle...  >>>
Downloading from https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv?dataset_version_number=237...


100%|██████████| 1.45G/1.45G [00:20<00:00, 77.4MB/s]

Extracting files...





==== Found arXiv dataset at: /root/.cache/kagglehub/datasets/Cornell-University/arxiv/versions/237/arxiv-metadata-oai-snapshot.json ===


Loading documents: 1000it [00:00, 34190.93it/s]



<<<  Loaded 889 documents after preprocessing  >>>


<<<  Building enhanced vector database...   >>>



Processing documents: 100%|██████████| 889/889 [00:00<00:00, 4681.54it/s]
Generating embeddings: 100%|██████████| 31/31 [00:22<00:00,  1.40it/s]


<<<  Vector DB created with 988 chunks  >>>


 Enhanced Summaries from 5 Different Categories:



 <<<<  CATEGORY 1: NUCL-TH  >>>>
--------------------------------------------------


 <<<<  Focus Mode: GENERAL  >>>>

 <<<<   Title: Two-proton radioactivity and three-body decay. IV. Connection to
  quasiclassical formulation  >>>>

 <<<<   Original Abstract (81 words):  
We derive quasiclassical expressions for the three-body decay width and
define the ``preexponential'' coefficients for them. The derivation is based on
the integral formulae for the three-body width obtained in the semianalytical
approach with simplified three-body Hamiltonian [L.V. Grigorenko and M...


 <<>> Generated Summary (48 words, 1.7x compression):
This article is about two-proton radioactivity and three-body decay. The model is applied to the decays of the first excited state of Ne and ground state of Fe. Various qualitative aspects of the model and relations with the other simplified approaches to the three-

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.229
ROUGE-2: 0.023
ROUGE-L: 0.229
BERTScore F1: 0.824

--------------------------------------------------


 <<<<  Focus Mode: TECHNICAL  >>>>

 <<<<   Title: Two-proton radioactivity and three-body decay. IV. Connection to
  quasiclassical formulation  >>>>

 <<<<   Original Abstract (81 words):  
We derive quasiclassical expressions for the three-body decay width and
define the ``preexponential'' coefficients for them. The derivation is based on
the integral formulae for the three-body width obtained in the semianalytical
approach with simplified three-body Hamiltonian [L.V. Grigorenko and M...


 <<>> Generated Summary (46 words, 1.8x compression):
This document is based on a quasiclassical formulation. The model is applied to the decays of the firstexcited  state of Ne and the ground state of Fe. The qualitative aspects of the model and relations with the other simpler approaches to the three-body decays are discus

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.269
ROUGE-2: 0.070
ROUGE-L: 0.239
BERTScore F1: 0.839

--------------------------------------------------


 <<<<  Focus Mode: NOVELTY  >>>>

 <<<<   Title: Two-proton radioactivity and three-body decay. IV. Connection to
  quasiclassical formulation  >>>>

 <<<<   Original Abstract (81 words):  
We derive quasiclassical expressions for the three-body decay width and
define the ``preexponential'' coefficients for them. The derivation is based on
the integral formulae for the three-body width obtained in the semianalytical
approach with simplified three-body Hamiltonian [L.V. Grigorenko and M...


 <<>> Generated Summary (51 words, 1.6x compression):
We derive quasiclassical expressions for the three-body decay width anddefine the coefficients for them. The model is applied to the decays of the first excited state of Ne and ground state of Fe. The qualitative aspects of the model and relations with the othersimplified a

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.423
ROUGE-2: 0.242
ROUGE-L: 0.394
BERTScore F1: 0.860

--------------------------------------------------


 <<<<  Focus Mode: APPLICATIONS  >>>>

 <<<<   Title: Two-proton radioactivity and three-body decay. IV. Connection to
  quasiclassical formulation  >>>>

 <<<<   Original Abstract (81 words):  
We derive quasiclassical expressions for the three-body decay width and
define the ``preexponential'' coefficients for them. The derivation is based on
the integral formulae for the three-body width obtained in the semianalytical
approach with simplified three-body Hamiltonian [L.V. Grigorenko and M...


 <<>> Generated Summary (52 words, 1.6x compression):
This document is based on the following context: Two-proton radioactivity and three-body decay. The model is applied to the decays of the firstexcited  state of Ne and the ground state of Fe. The derivation isbased on the integral formulae for the three- body width obt

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.480
ROUGE-2: 0.280
ROUGE-L: 0.480
BERTScore F1: 0.856

--------------------------------------------------



 <<<<  CATEGORY 2: HEP-TH ASTRO-PH GR-QC  >>>>
--------------------------------------------------


 <<<<  Focus Mode: GENERAL  >>>>

 <<<<   Title: Curvature and isocurvature perturbations in two-field inflation  >>>>

 <<<<   Original Abstract (87 words):  
We study cosmological perturbations in two-field inflation, allowing for
non-standard kinetic terms. We calculate analytically the spectra of curvature
and isocurvature modes at Hubble crossing, up to first order in the slow-roll
parameters. We also compute numerically the evolution of the curvature...


 <<>> Generated Summary (53 words, 1.6x compression):
We study cosmological perturbations in two-field inflation, allowing for non-standard kinetic terms. We calculate analytically the spectra of curvature and isocurvature modes at Hubble crossing, up to fi

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 1.000
ROUGE-2: 0.981
ROUGE-L: 1.000
BERTScore F1: 0.957

--------------------------------------------------


 <<<<  Focus Mode: TECHNICAL  >>>>

 <<<<   Title: Curvature and isocurvature perturbations in two-field inflation  >>>>

 <<<<   Original Abstract (87 words):  
We study cosmological perturbations in two-field inflation, allowing for
non-standard kinetic terms. We calculate analytically the spectra of curvature
and isocurvature modes at Hubble crossing, up to first order in the slow-roll
parameters. We also compute numerically the evolution of the curvature...


 <<>> Generated Summary (56 words, 1.6x compression):
Study cosmological perturbations in two-field inflation. We calculate analytically the spectra of curvature and isocurvature modes at Hubble crossing, up to first order in the slow-rollparameters. We show explicitly for a few examples, including the recently proposed model of `roulette' inflation, how

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.588
ROUGE-2: 0.467
ROUGE-L: 0.565
BERTScore F1: 0.913

--------------------------------------------------


 <<<<  Focus Mode: NOVELTY  >>>>

 <<<<   Title: Curvature and isocurvature perturbations in two-field inflation  >>>>

 <<<<   Original Abstract (87 words):  
We study cosmological perturbations in two-field inflation, allowing for
non-standard kinetic terms. We calculate analytically the spectra of curvature
and isocurvature modes at Hubble crossing, up to first order in the slow-roll
parameters. We also compute numerically the evolution of the curvature...


 <<>> Generated Summary (38 words, 2.3x compression):
Curvature and isocurvature perturbations in two-field inflation. Curvature changes between Hubble crossing and the end of inflation. The recentlyproposed model of `roulette' inflation. We show explicitly for a few examples, including the recently proposed model of 'roulette inflation'


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.338
ROUGE-2: 0.136
ROUGE-L: 0.338
BERTScore F1: 0.857

--------------------------------------------------


 <<<<  Focus Mode: APPLICATIONS  >>>>

 <<<<   Title: Curvature and isocurvature perturbations in two-field inflation  >>>>

 <<<<   Original Abstract (87 words):  
We study cosmological perturbations in two-field inflation, allowing for
non-standard kinetic terms. We calculate analytically the spectra of curvature
and isocurvature modes at Hubble crossing, up to first order in the slow-roll
parameters. We also compute numerically the evolution of the curvature...


 <<>> Generated Summary (48 words, 1.8x compression):
Curvature and isocurvature perturbations in two-field inflation. We calculate analytically the spectra of curvature.and isocURvature modes at Hubble crossing, up to first order in the slow-roll.parameters. We also compute numerically the evolution of the curvature and.isocurVature modes from well w

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.857
ROUGE-2: 0.812
ROUGE-L: 0.857
BERTScore F1: 0.932

--------------------------------------------------



 <<<<  CATEGORY 3: MATH-PH HEP-TH MATH  >>>>
--------------------------------------------------


 <<<<  Focus Mode: GENERAL  >>>>

 <<<<   Title: The Arctic Circle Revisited  >>>>

 <<<<   Original Abstract (126 words):  
The problem of limit shapes in the six-vertex model with domain wall boundary
conditions is addressed by considering a specially tailored bulk correlation
function, the emptiness formation probability. A closed expression of this
correlation function is given, both in terms of certain determinant an...


 <<>> Generated Summary (59 words, 2.1x compression):
The problem of limit shapes in the six-vertex model with domain wall boundaryconditions is addressed by considering a specially tailored bulk correlation function. The emptiness formation probability is related to a one-matrix model with a 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.632
ROUGE-2: 0.503
ROUGE-L: 0.596
BERTScore F1: 0.896

--------------------------------------------------


 <<<<  Focus Mode: TECHNICAL  >>>>

 <<<<   Title: The Arctic Circle Revisited  >>>>

 <<<<   Original Abstract (126 words):  
The problem of limit shapes in the six-vertex model with domain wall boundary
conditions is addressed by considering a specially tailored bulk correlation
function, the emptiness formation probability. A closed expression of this
correlation function is given, both in terms of certain determinant an...


 <<>> Generated Summary (59 words, 2.1x compression):
The problem of limit shapes in the six-vertex model with domain wall boundaryconditions is addressed by considering a specially tailored bulk correlation function. The emptiness formation probability is related to a one-matrix model with a triple logarithmic singularity, or Triple Penner model. The saddle-pointanalysis of this model le

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.632
ROUGE-2: 0.503
ROUGE-L: 0.596
BERTScore F1: 0.896

--------------------------------------------------


 <<<<  Focus Mode: NOVELTY  >>>>

 <<<<   Title: The Arctic Circle Revisited  >>>>

 <<<<   Original Abstract (126 words):  
The problem of limit shapes in the six-vertex model with domain wall boundary
conditions is addressed by considering a specially tailored bulk correlation
function, the emptiness formation probability. A closed expression of this
correlation function is given, both in terms of certain determinant an...


 <<>> Generated Summary (51 words, 2.5x compression):
The problem of limit shapes in the six-vertex model with domain wall boundary conditions is addressed by considering a specially tailored bulk correlation function. The emptiness formation probability is related to a one-matrix model with a triple logarithmic singularity. The saddle-point analysis of this model leads to the Arctic Circle

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.606
ROUGE-2: 0.503
ROUGE-L: 0.587
BERTScore F1: 0.898

--------------------------------------------------


 <<<<  Focus Mode: APPLICATIONS  >>>>

 <<<<   Title: The Arctic Circle Revisited  >>>>

 <<<<   Original Abstract (126 words):  
The problem of limit shapes in the six-vertex model with domain wall boundary
conditions is addressed by considering a specially tailored bulk correlation
function, the emptiness formation probability. A closed expression of this
correlation function is given, both in terms of certain determinant an...


 <<>> Generated Summary (59 words, 2.1x compression):
The problem of limit shapes in the six-vertex model with domain wall boundaryconditions is addressed by considering a specially tailored bulk correlation function. The emptiness formation probability is related to a one-matrix model with a triple logarithmic singularity, or Triple Penner model. The saddle-pointanalysis of this model

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.632
ROUGE-2: 0.503
ROUGE-L: 0.596
BERTScore F1: 0.896

--------------------------------------------------



 <<<<  CATEGORY 4: HEP-TH  >>>>
--------------------------------------------------


 <<<<  Focus Mode: GENERAL  >>>>

 <<<<   Title: The First Law for Boosted Kaluza-Klein Black Holes  >>>>

 <<<<   Original Abstract (145 words):  
We study the thermodynamics of Kaluza-Klein black holes with momentum along
the compact dimension, but vanishing angular momentum. These black holes are
stationary, but non-rotating. We derive the first law for these spacetimes and
find that the parameter conjugate to variations in the length of the...


 <<>> Generated Summary (21 words, 6.9x compression):
Title: The First Law for Boosted Kaluza-Klein Black Holes.Focus: General.Key Context: purposefully.Title excerpt (Relevance: 1.00): Title excerpt ( relevance): Title (relevance)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.059
ROUGE-2: 0.000
ROUGE-L: 0.059
BERTScore F1: 0.809

--------------------------------------------------


 <<<<  Focus Mode: TECHNICAL  >>>>

 <<<<   Title: The First Law for Boosted Kaluza-Klein Black Holes  >>>>

 <<<<   Original Abstract (145 words):  
We study the thermodynamics of Kaluza-Klein black holes with momentum along
the compact dimension, but vanishing angular momentum. These black holes are
stationary, but non-rotating. We derive the first law for these spacetimes and
find that the parameter conjugate to variations in the length of the...


 <<>> Generated Summary (44 words, 3.3x compression):
The First Law for Boosted Kaluza-Klein Black Holes is based on the Gibbs-Duhem relation. It applies to black holes with momentum along the compact dimension, but vanishing angular momentum. The effective tension is alwayspositive, while the ADM tension is negative for large boost parameters.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.439
ROUGE-2: 0.265
ROUGE-L: 0.439
BERTScore F1: 0.886

--------------------------------------------------


 <<<<  Focus Mode: NOVELTY  >>>>

 <<<<   Title: The First Law for Boosted Kaluza-Klein Black Holes  >>>>

 <<<<   Original Abstract (145 words):  
We study the thermodynamics of Kaluza-Klein black holes with momentum along
the compact dimension, but vanishing angular momentum. These black holes are
stationary, but non-rotating. We derive the first law for these spacetimes and
find that the parameter conjugate to variations in the length of the...


 <<>> Generated Summary (42 words, 3.5x compression):
The First Law for Boosted Kaluza-Klein Black Holes is a thermodynamic law. It applies to black holes with momentum along the compact dimension, but vanishing angular momentum. The effective tension is alwayspositive, while the ADM tension is negative for large boost parameters.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.469
ROUGE-2: 0.271
ROUGE-L: 0.444
BERTScore F1: 0.895

--------------------------------------------------


 <<<<  Focus Mode: APPLICATIONS  >>>>

 <<<<   Title: The First Law for Boosted Kaluza-Klein Black Holes  >>>>

 <<<<   Original Abstract (145 words):  
We study the thermodynamics of Kaluza-Klein black holes with momentum along
the compact dimension, but vanishing angular momentum. These black holes are
stationary, but non-rotating. We derive the first law for these spacetimes and
find that the parameter conjugate to variations in the length of the...


 <<>> Generated Summary (44 words, 3.3x compression):
The First Law for Boosted Kaluza-Klein Black Holes is based on the Gibbs-Duhem relation. It applies to black holes with momentum along the compact dimension, but vanishing angular momentum. The effective tension is alwayspositive, while the ADM tension is negative for large boost parameters.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.439
ROUGE-2: 0.265
ROUGE-L: 0.439
BERTScore F1: 0.886

--------------------------------------------------



 <<<<  CATEGORY 5: HEP-PH NUCL-TH  >>>>
--------------------------------------------------


 <<<<  Focus Mode: GENERAL  >>>>

 <<<<   Title: Direct photons and dileptons via color dipoles  >>>>

 <<<<   Original Abstract (72 words):  
Drell-Yan dilepton pair production and inclusive direct photon production can
be described within a unified framework in the color dipole approach. The
inclusion of non-perturbative primordial transverse momenta and DGLAP evolution
is studied. We successfully describe data for dilepton spectra from ...


 <<>> Generated Summary (44 words, 1.6x compression):
Drell-Yan dilepton pair production and inclusive direct photon production can be described within a unified framework in the color dipole approach. The inclusion of non-perturbative primordial transverse momenta and DGLAP evolu

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.863
ROUGE-2: 0.775
ROUGE-L: 0.863
BERTScore F1: 0.929

--------------------------------------------------


 <<<<  Focus Mode: TECHNICAL  >>>>

 <<<<   Title: Direct photons and dileptons via color dipoles  >>>>

 <<<<   Original Abstract (72 words):  
Drell-Yan dilepton pair production and inclusive direct photon production can
be described within a unified framework in the color dipole approach. The
inclusion of non-perturbative primordial transverse momenta and DGLAP evolution
is studied. We successfully describe data for dilepton spectra from ...


 <<>> Generated Summary (42 words, 1.7x compression):
Dilpion pair production and inclusive direct photon production can be described within a unified framework in the color dipole approach. The inclusion of non-perturbative primordial transverse momenta and DGLAP evolution is studied. We successfully describe data for dilepton spectra from 800-GeV ppcollisions.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.809
ROUGE-2: 0.716
ROUGE-L: 0.809
BERTScore F1: 0.919

--------------------------------------------------


 <<<<  Focus Mode: NOVELTY  >>>>

 <<<<   Title: Direct photons and dileptons via color dipoles  >>>>

 <<<<   Original Abstract (72 words):  
Drell-Yan dilepton pair production and inclusive direct photon production can
be described within a unified framework in the color dipole approach. The
inclusion of non-perturbative primordial transverse momenta and DGLAP evolution
is studied. We successfully describe data for dilepton spectra from ...


 <<>> Generated Summary (44 words, 1.6x compression):
The Drell-Yan dilepton pair production and inclusive direct photon production can be described within a unified framework in the color dipole approach. The inclusion of non-perturbative primordial transverse momenta and DGLAP evolution is studied. We successfully describe data for dilepton spectra from 800-GeV ppcollisi

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.830
ROUGE-2: 0.739
ROUGE-L: 0.830
BERTScore F1: 0.930

--------------------------------------------------


 <<<<  Focus Mode: APPLICATIONS  >>>>

 <<<<   Title: Direct photons and dileptons via color dipoles  >>>>

 <<<<   Original Abstract (72 words):  
Drell-Yan dilepton pair production and inclusive direct photon production can
be described within a unified framework in the color dipole approach. The
inclusion of non-perturbative primordial transverse momenta and DGLAP evolution
is studied. We successfully describe data for dilepton spectra from ...


 <<>> Generated Summary (44 words, 1.6x compression):
The Drell-Yan dilepton pair production and inclusive direct photon production can be described within a unified framework in the color dipole approach. The inclusion of non-perturbative primordial transverse momenta and DGLAP evolution is studied. We successfully describe data for dilepton spectra from 800-GeV ppco

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 <<<>>> Evaluation Scores (vs abstract excerpt): <<<>>>
ROUGE-1: 0.830
ROUGE-2: 0.739
ROUGE-L: 0.830
BERTScore F1: 0.930

--------------------------------------------------


