User enters topic
        ↓
API key is loaded
        ↓
Recent papers are searched
        ↓
Results are filtered by year
        ↓
Metadata is extracted
        ↓
Results saved as JSON
        ↓
Top papers displayed
        ↓
PDF downloaded and saved locally




In [None]:
# ============================================
#  Topic Input & Paper Search
# ============================================
# Install required libraries

!pip install semanticscholar python-dotenv requests -q

import json
import os
from datetime import datetime
from semanticscholar import SemanticScholar
from dotenv import load_dotenv

# ==========================
# 1. SETUP API KEY
# ==========================

def setup_api_key():
    """Create a Semantic Scholar client, authenticated when a key is configured.

    The key is read from the ``SEMANTIC_SCHOLAR_API_KEY`` environment variable,
    which ``load_dotenv()`` populates from a local ``.env`` file when one exists.
    No credential is embedded in the source, and ``.env`` is never created or
    overwritten here. Without a key the client uses unauthenticated access.

    Returns:
        SemanticScholar: a client ready for ``search_paper`` calls.
    """
    # NOTE(review): an earlier revision hard-coded a key at this point; any key
    # that was committed should be treated as exposed and rotated.
    load_dotenv()

    # A blank or whitespace-only value is treated the same as a missing one.
    api_key = (os.getenv("SEMANTIC_SCHOLAR_API_KEY") or "").strip()

    if api_key:
        sch = SemanticScholar(api_key=api_key)
        print(" Semantic Scholar initialized WITH API key.")
    else:
        sch = SemanticScholar()
        print(" Initialized WITHOUT API key (limited usage).")
        print("   Set SEMANTIC_SCHOLAR_API_KEY in the environment or in a .env file to use a key.")

    return sch



# ==========================
# 2. SEARCH RECENT PAPERS
# ==========================

def _paper_to_record(paper, abstract_chars=300):
    """Flatten one Semantic Scholar paper object into a JSON-serializable dict."""
    abstract = paper.abstract
    if not abstract:
        abstract = "No abstract available"
    elif len(abstract) > abstract_chars:
        # Only mark the text as truncated when something was actually cut off.
        abstract = abstract[:abstract_chars] + "..."

    # The recorded run reports every paper as having a PDF, which suggests the
    # openAccessPdf entry can be present without a usable URL, so the URL itself
    # is what decides has_pdf.
    pdf_info = paper.openAccessPdf or {}
    pdf_url = pdf_info.get("url") or None

    return {
        "title": paper.title,
        "authors": [a["name"] for a in paper.authors] if paper.authors else [],
        "year": paper.year,
        "paperId": paper.paperId,
        "abstract": abstract,
        "citationCount": paper.citationCount,
        "venue": getattr(paper, "venue", None),
        "url": paper.url,
        "pdf_url": pdf_url,
        "has_pdf": bool(pdf_url),
    }


def search_recent_papers(topic, years=2, limit=20):
    """
    Search Semantic Scholar for recent papers on a topic.

    Args:
        topic: free-text search query.
        years: papers whose year is >= (current year - years) are kept.
        limit: maximum number of papers returned; also passed to the API as the
            page size.

    Returns:
        A dict with the keys "topic", "search_timestamp", "years_considered",
        "total_results", "papers_with_pdf" and "papers" (a list of per-paper
        dicts), or None when the search raised an exception.
    """

    print(f"\n Searching recent papers for topic: '{topic}'")
    print(f"   Limiting to last {years} years")

    sch = setup_api_key()
    min_year = datetime.now().year - years

    try:
        results = sch.search_paper(
            query=topic,
            limit=limit,
            fields=[
                "paperId", "title", "abstract", "year", "authors",
                "citationCount", "openAccessPdf", "url", "venue"
            ]
        )

        papers = []
        # Iterating the result object keeps requesting further pages, so stop
        # explicitly once `limit` matching papers have been collected.
        for paper in results:
            if not (paper.year and paper.year >= min_year):
                continue
            papers.append(_paper_to_record(paper))
            if len(papers) >= limit:
                break

        pdf_count = sum(1 for p in papers if p["has_pdf"])

        print("\n Search complete!")
        print(f"   Total recent papers: {len(papers)}")
        print(f"   PDFs available: {pdf_count}")

        return {
            "topic": topic,
            "search_timestamp": datetime.now().isoformat(),
            "years_considered": years,
            "total_results": len(papers),
            "papers_with_pdf": pdf_count,
            "papers": papers
        }

    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None.
        print(f" ERROR: Could not fetch papers → {e}")
        return None



# ==========================
# 3. SAVE RESULTS AS JSON
# ==========================

def save_recent_results(data, filename=None):
    """Write a search-result dict to data/search_results as indented JSON.

    When no filename is given, one is derived from data["topic"]: characters
    other than letters, digits and spaces are dropped and spaces become
    underscores. Returns the path of the written file.
    """
    results_dir = "data/search_results"

    if not filename:
        kept_chars = [ch for ch in data["topic"] if ch == " " or ch.isalnum()]
        safe_topic = "".join(kept_chars).replace(" ", "_")
        filename = f"recent_papers_{safe_topic}.json"

    os.makedirs(results_dir, exist_ok=True)
    filepath = os.path.join(results_dir, filename)

    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

    print(f" Results saved to: {filepath}")
    return filepath



# ==========================
# 4. DISPLAY RECENT PAPERS
# ==========================

def display_recent_results(data, max_display=10):
    """Print the first `max_display` papers of a search result, one field per line."""

    print(f"\n--- Top {max_display} Recent Papers for '{data['topic']}' ---")

    shown = data["papers"][:max_display]
    for rank, entry in enumerate(shown, start=1):
        print(f"\n{rank}. Title: {entry['title']}")
        print(f"   Authors: {', '.join(entry['authors'])}")
        print(f"   Year: {entry['year']}")
        print(f"   Citations: {entry['citationCount']}")
        print(f"   Venue: {entry['venue']}")
        print(f"   Abstract: {entry['abstract']}")
        print(f"   URL: {entry['url']}")
        print(f"   PDF Available: {entry['has_pdf']}")

In [None]:
def display_recent_results(data, max_display=10):
    """Pretty print top recent papers in clean format.

    Args:
        data: result dict with "topic" and "papers" keys, or None when the
            search failed.
        max_display: maximum number of papers to print.

    Missing authors are shown as 'Unknown' and a missing venue as
    'Not available'. Prints a short notice and returns when `data` is None.
    """
    # The search step signals failure with None; report it instead of crashing.
    if data is None:
        print("No results to display.")
        return

    print(f"\n--- Top {max_display} Recent Papers for '{data['topic']}' ---")

    papers = data.get("papers") or []
    for i, paper in enumerate(papers[:max_display], start=1):
        print(f"\n{i}. Title: {paper['title']}")
        print(f"   Authors: {', '.join(paper['authors']) if paper['authors'] else 'Unknown'}")
        print(f"   Year: {paper['year']}")
        print(f"   Citations: {paper['citationCount']}")
        print(f"   Venue: {paper['venue'] if paper['venue'] else 'Not available'}")
        print(f"   Abstract: {paper['abstract']}")
        print(f"   URL: {paper['url']}")
        print(f"   PDF Available: {paper['has_pdf']}")


In [None]:
# Run the search, print the top matches, then persist the full result set as JSON.
# NOTE(review): search_recent_papers returns None when the request fails;
# save_recent_results indexes data["topic"] and would raise in that case.
data = search_recent_papers("AI research paper summarization", years=3, limit=25)
display_recent_results(data)
save_recent_results(data)



 Searching recent papers for topic: 'AI research paper summarization'
   Limiting to last 3 years
 Semantic Scholar initialized WITH API key.

 Search complete!
   Total recent papers: 660
   PDFs available: 660

--- Top 10 Recent Papers for 'AI research paper summarization' ---

1. Title: Streamlining Academic Insights Gen AI in Research Paper Summarization
   Authors: Priyadharshini P, Sahanna B, Sangeetha G, Sivaranjani M
   Year: 2025
   Citations: 0
   Venue: Pertanika Proceedings
   Abstract: In today’s fast-paced academic environment, researchers face the challenge of processing vast amounts of information to produce comprehensive research papers. This project introduces a web-based application designed to automatically gather insights from multiple PDFs or text files and organize them ...
   URL: https://www.semanticscholar.org/paper/1251ff55c9d78c608fb567519b7c8eceeed1e282
   PDF Available: True

2. Title: Paper IQ - An Explainable AI Approach for Research Paper Summarization

'data/search_results/recent_papers_AI_research_paper_summarization.json'

In [None]:
import requests

def download_pdf(pdf_url, title, download_folder="downloaded_pdfs", timeout=30):
    """Download a PDF to `download_folder`, named after the sanitized title.

    Args:
        pdf_url: URL of the PDF; a falsy value aborts immediately.
        title: paper title; letters, digits and spaces are kept, spaces become
            underscores.
        download_folder: target directory, created if missing.
        timeout: seconds passed to ``requests.get`` so a stalled server cannot
            hang the caller indefinitely.

    Returns:
        The path of the saved file, or None when no URL was given or the
        download/write failed.
    """
    # NOTE(review): the extraction step later in this notebook reads from
    # "data/pdfs", not from this default folder — confirm which is intended.

    if not pdf_url:
        print("No PDF URL provided.")
        return None

    os.makedirs(download_folder, exist_ok=True)
    safe_title = "".join(c for c in title if c.isalnum() or c == " ").replace(" ", "_")
    # A title made only of punctuation would otherwise produce ".pdf".
    safe_title = safe_title or "paper"
    filepath = os.path.join(download_folder, f"{safe_title}.pdf")

    print(f"Downloading '{title}' from {pdf_url}...")
    started_writing = False
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(pdf_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise for HTTP errors (4xx or 5xx)

            with open(filepath, 'wb') as pdf_file:
                started_writing = True
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        pdf_file.write(chunk)
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"Error downloading PDF: {e}")
        # Do not leave a truncated file behind that later steps would try to parse.
        if started_writing and os.path.exists(filepath):
            try:
                os.remove(filepath)
            except OSError:
                pass
        return None

    print(f"Successfully downloaded to: {filepath}")
    return filepath

def user_download_one_pdf(data):
    """Interactively pick one paper that has a PDF and download it.

    Lists every paper in data["papers"] flagged with has_pdf, then prompts
    until the user enters a valid number (which triggers the download) or
    'q' (which quits). Returns None in every case.
    """

    candidates = [entry for entry in data["papers"] if entry["has_pdf"]]

    if len(candidates) == 0:
        print("No papers with downloadable PDFs found in the current results.")
        return

    print("\n--- Papers with Available PDFs ---")
    for number, entry in enumerate(candidates, start=1):
        print(f"{number}. {entry['title']} (Year: {entry['year']})")

    while True:
        try:
            answer = input("Enter the number of the PDF to download (or 'q' to quit): ")
            if answer.lower() == 'q':
                print("Exiting PDF download.")
                return

            position = int(answer) - 1
            if position < 0 or position >= len(candidates):
                print("Invalid number. Please try again.")
                continue

            chosen = candidates[position]
            download_pdf(chosen["pdf_url"], chosen["title"])
            return
        except ValueError:
            print("Invalid input. Please enter a number or 'q'.")


In [None]:
# Repeat the search, print the top matches, then let the user pick one PDF to download.
# NOTE(review): search_recent_papers returns None when the request fails;
# user_download_one_pdf indexes data["papers"] and would raise in that case.
data = search_recent_papers("AI research paper summarization", years=3, limit=25)
display_recent_results(data)
user_download_one_pdf(data)


 Searching recent papers for topic: 'AI research paper summarization'
   Limiting to last 3 years
 Semantic Scholar initialized WITH API key.

 Search complete!
   Total recent papers: 661
   PDFs available: 661

--- Top 10 Recent Papers for 'AI research paper summarization' ---

1. Title: Streamlining Academic Insights Gen AI in Research Paper Summarization
   Authors: Priyadharshini P, Sahanna B, Sangeetha G, Sivaranjani M
   Year: 2025
   Citations: 0
   Venue: Pertanika Proceedings
   Abstract: In today’s fast-paced academic environment, researchers face the challenge of processing vast amounts of information to produce comprehensive research papers. This project introduces a web-based application designed to automatically gather insights from multiple PDFs or text files and organize them ...
   URL: https://www.semanticscholar.org/paper/1251ff55c9d78c608fb567519b7c8eceeed1e282
   PDF Available: True

2. Title: Paper IQ - An Explainable AI Approach for Research Paper Summarization

Implementation of the text extraction module for parsing downloaded research-paper PDFs.

## Install Libraries and Setup Directories

In [None]:
import os

# 1. Install necessary Python libraries
!pip install PyMuPDF sentence-transformers -q

# 2. Create the required directory structure
# Working directories used by the steps below: input PDFs, raw text,
# per-section JSON, and pairwise comparison output.
dirs_to_create = [
    "data/pdfs",
    "data/extracted_text",
    "data/structured_sections",
    "data/comparisons",
]

for directory in dirs_to_create:
    # exist_ok makes the cell safe to re-run.
    os.makedirs(directory, exist_ok=True)
    print(f"Ensured directory exists: {directory}")

Ensured directory exists: data/pdfs
Ensured directory exists: data/extracted_text
Ensured directory exists: data/structured_sections
Ensured directory exists: data/comparisons


## Extract Raw Text from PDFs

In [None]:
import fitz  # PyMuPDF
import os

def extract_text_from_pdf(pdf_path, output_dir):
    """
    Extract the text of every page of a PDF, drop blank lines, and save it as .txt.

    Args:
        pdf_path: path of the PDF to read.
        output_dir: directory for the text file (created if missing); the file
            is named after the PDF with a ".txt" extension.

    Returns:
        The path of the written text file, or None if anything failed.
    """
    try:
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(pdf_path) as document:
            page_texts = [page.get_text() for page in document]

        # Join with "\n" rather than os.linesep: the file is written in text
        # mode, which already translates "\n" to the platform line ending.
        kept_lines = [line for line in "\n".join(page_texts).splitlines() if line.strip()]
        cleaned_text = "\n".join(kept_lines).strip()

        # Construct output filename
        pdf_filename = os.path.basename(pdf_path)
        txt_filename = os.path.splitext(pdf_filename)[0] + ".txt"
        os.makedirs(output_dir, exist_ok=True)
        output_filepath = os.path.join(output_dir, txt_filename)

        with open(output_filepath, "w", encoding="utf-8") as f:
            f.write(cleaned_text)
        print(f"Extracted text from '{pdf_filename}' to '{output_filepath}'")
        return output_filepath

    except Exception as e:
        # Best-effort per file: report and let the caller continue with the next PDF.
        print(f"Error extracting text from '{pdf_path}': {e}")
        return None

# Define input and output directories
input_pdf_dir = "data/pdfs"
output_text_dir = "data/extracted_text"

# Created earlier in the notebook as well; repeated so this cell can run on its own.
os.makedirs(output_text_dir, exist_ok=True)

# Collect every entry whose name ends in .pdf, ignoring case.
pdf_files = [name for name in os.listdir(input_pdf_dir) if name.lower().endswith('.pdf')]

if pdf_files:
    print(f"Found {len(pdf_files)} PDF files. Starting text extraction...")
    for pdf_file in pdf_files:
        extract_text_from_pdf(os.path.join(input_pdf_dir, pdf_file), output_text_dir)
else:
    print(f"No PDF files found in '{input_pdf_dir}'. Please ensure PDFs are present.")

print("Text extraction process complete.")

Found 2 PDF files. Starting text extraction...
Extracted text from 'Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.pdf' to 'data/extracted_text/Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.txt'
Extracted text from 'ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.pdf' to 'data/extracted_text/ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.txt'
Text extraction process complete.


## Clean and Structure Extracted Text


In [None]:
import os
import re
import json

# Optional section numbering in front of a heading: "1", "1.", "2.1", "IV.", "3)".
_SECTION_NUMBER = r"(?:(?:\d+(?:\.\d+)*|[ivx]+)[.)]?\s*)?"
# Rest of a heading line: optional colon, then the line break.
_HEADING_END = r"[ \t]*:?[ \t]*\n"

# Abstract: a line reading "Abstract", captured up to an Introduction heading
# or a Keywords / Index Terms line.
_ABSTRACT_RE = re.compile(
    r"(?:^|\n)[ \t]*abstract" + _HEADING_END
    + r"(.*?)"
    + r"(?=\n[ \t]*" + _SECTION_NUMBER + r"introduction[ \t]*:?[ \t]*(?:\n|\Z)"
    + r"|\n[ \t]*(?:keywords|index terms)\b)",
    re.IGNORECASE | re.DOTALL,
)

# Introduction: captured up to the next recognised section heading.
_INTRODUCTION_RE = re.compile(
    r"(?:^|\n)[ \t]*" + _SECTION_NUMBER + r"introduction" + _HEADING_END
    + r"(.*?)"
    + r"(?=\n[ \t]*" + _SECTION_NUMBER
    + r"(?:related works?|background|methodology|literature review"
    + r"|materials and methods|proposed method|conclusions?|abstract|references)\b)",
    re.IGNORECASE | re.DOTALL,
)

# Conclusion: captured up to the back matter or the end of the text.
_CONCLUSION_RE = re.compile(
    r"(?:^|\n)[ \t]*" + _SECTION_NUMBER
    + r"(?:discussion and )?conclusions?(?: and (?:future work|discussion))?" + _HEADING_END
    + r"(.*?)"
    + r"(?=\n[ \t]*(?:acknowledge?ments?|references|bibliography|appendi(?:x|ces)"
    + r"|copyright|further work)\b|\Z)",
    re.IGNORECASE | re.DOTALL,
)


def structure_text_content(text_filepath, output_dir):
    """
    Split an extracted-text file into abstract, introduction and conclusion.

    Each section is located by its heading line and captured across line
    breaks until the next recognised heading; runs of whitespace are collapsed
    to single spaces. Sections that are not found stay as empty strings.

    Args:
        text_filepath: path of the UTF-8 text file to read.
        output_dir: directory for the JSON output (created if missing); the
            file is named after the input with a ".json" extension.

    Returns:
        The path of the written JSON file, or None if anything failed.
    """
    try:
        with open(text_filepath, "r", encoding="utf-8") as f:
            full_text = f.read()

        structured_data = {
            "full_text_path": text_filepath,
            "abstract": "",
            "introduction": "",
            "conclusion": ""
        }

        section_patterns = (
            ("abstract", _ABSTRACT_RE),
            ("introduction", _INTRODUCTION_RE),
            ("conclusion", _CONCLUSION_RE),
        )
        for section, pattern in section_patterns:
            match = pattern.search(full_text)
            if match:
                structured_data[section] = re.sub(r'\s+', ' ', match.group(1).strip())

        # Construct output filename
        text_filename = os.path.basename(text_filepath)
        json_filename = os.path.splitext(text_filename)[0] + ".json"
        os.makedirs(output_dir, exist_ok=True)
        output_filepath = os.path.join(output_dir, json_filename)

        with open(output_filepath, "w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=4, ensure_ascii=False)
        print(f"Structured content from '{text_filename}' saved to '{output_filepath}'")
        return output_filepath

    except Exception as e:
        # Best-effort per file: report and let the caller continue with the next one.
        print(f"Error structuring content from '{text_filepath}': {e}")
        return None

# Define input and output directories
input_text_dir = "data/extracted_text"
output_structured_dir = "data/structured_sections"

# Make sure the destination exists before any file is written into it.
os.makedirs(output_structured_dir, exist_ok=True)

# Collect every entry whose name ends in .txt, ignoring case.
txt_files = [entry for entry in os.listdir(input_text_dir) if entry.lower().endswith('.txt')]

if txt_files:
    print(f"Found {len(txt_files)} text files. Starting content structuring...")
    for txt_file in txt_files:
        structure_text_content(os.path.join(input_text_dir, txt_file), output_structured_dir)
else:
    print(f"No text files found in '{input_text_dir}'. Please ensure text files are present.")

print("Content structuring process complete.")


Found 4 text files. Starting content structuring...
Structured content from 'ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.txt' saved to 'data/structured_sections/ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.json'
Structured content from 'Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.txt' saved to 'data/structured_sections/Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.json'
Structured content from 'Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.txt' saved to 'data/structured_sections/Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.json'
Structured content from 'Accuracy_and_Bias_Mitigation_in_GenAI__LLMbased_Financial_Un

In [None]:
import os
import json
import re
import nltk

# Ensure nltk punkt tokenizer is available
# nltk.data.find raises LookupError when a resource is missing, so that is the
# exception to catch before downloading.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# The later version of this cell also fetches 'punkt_tab' for the sentence
# tokenizer; do the same here so both cells behave alike.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab', quiet=True)

def extract_key_findings(structured_data_filepath, output_dir, num_sentences=2):
    """
    Add a "key_findings" entry to a structured-sections JSON file.

    The abstract and conclusion contribute their first `num_sentences`
    sentences. The introduction contributes its last `num_sentences` sentences
    when it has more than that many, otherwise its first ones. Sections that
    are empty or shorter than 10 characters get a fixed placeholder sentence.

    Returns the path of the JSON written to `output_dir` (same file name as the
    input), or None if anything failed.
    """
    try:
        with open(structured_data_filepath, "r", encoding="utf-8") as source:
            structured_data = json.load(source)

        def get_key_sentences(text, n):
            # Long enough to summarize: keep the leading n sentences.
            if text and len(text.strip()) >= 10:
                return " ".join(nltk.sent_tokenize(text)[:n]).strip()
            return "No key findings available for this section."

        abstract_summary = get_key_sentences(structured_data.get("abstract", ""), num_sentences)

        # The closing sentences of an introduction tend to state the contribution.
        intro_text = structured_data.get("introduction", "")
        if intro_text and len(nltk.sent_tokenize(intro_text)) > num_sentences:
            closing = nltk.sent_tokenize(intro_text)[-num_sentences:]
            intro_summary = " ".join(closing).strip()
        else:
            intro_summary = get_key_sentences(intro_text, num_sentences)

        conclusion_summary = get_key_sentences(structured_data.get("conclusion", ""), num_sentences)

        structured_data["key_findings"] = {
            "abstract": abstract_summary,
            "introduction": intro_summary,
            "conclusion": conclusion_summary,
        }

        # Same file name as the input, so pointing output_dir at the input
        # directory overwrites the original JSON.
        output_filepath = os.path.join(output_dir, os.path.basename(structured_data_filepath))

        with open(output_filepath, "w", encoding="utf-8") as target:
            json.dump(structured_data, target, indent=4, ensure_ascii=False)
        print(f"Extracted key findings and saved to '{output_filepath}'")
        return output_filepath

    except Exception as e:
        print(f"Error extracting key findings from '{structured_data_filepath}': {e}")
        return None

# Define input and output directories
input_structured_dir = "data/structured_sections"
# Same directory as the input: each structured JSON is rewritten in place.
output_key_findings_dir = "data/structured_sections"

# Collect every entry whose name ends in .json, ignoring case.
json_files = [name for name in os.listdir(input_structured_dir) if name.lower().endswith('.json')]

if json_files:
    print(f"Found {len(json_files)} structured JSON files. Starting key findings extraction...")
    for json_file in json_files:
        extract_key_findings(os.path.join(input_structured_dir, json_file), output_key_findings_dir)
else:
    print(f"No JSON files found in '{input_structured_dir}'. Please ensure structured JSONs are present.")

print("Key findings extraction process complete.")

Found 4 structured JSON files. Starting key findings extraction...
Extracted key findings and saved to 'data/structured_sections/Accuracy_and_Bias_Mitigation_in_GenAI__LLMbased_Financial_Underwriting_and_Clinical_Summarization_Systems.json'
Extracted key findings and saved to 'data/structured_sections/ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.json'
Extracted key findings and saved to 'data/structured_sections/Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.json'
Extracted key findings and saved to 'data/structured_sections/Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.json'
Key findings extraction process complete.


In [None]:
import os
import json
import re
import nltk

# Ensure nltk punkt tokenizer is available
# Look up each tokenizer resource and download it only when the lookup fails.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

def extract_key_findings(structured_data_filepath, output_dir, num_sentences=2):
    """
    Load structured content, derive key findings per section, and save the result.

    Args:
        structured_data_filepath: JSON file with "abstract", "introduction"
            and "conclusion" strings.
        output_dir: directory for the updated JSON (created if missing); the
            file keeps the input's name, so using the input directory
            overwrites the original.
        num_sentences: number of sentences kept per section.

    The abstract and conclusion keep their first `num_sentences` sentences.
    The introduction keeps its last `num_sentences` sentences when it has more
    than that many, otherwise its first ones. Empty or very short sections get
    a fixed placeholder sentence.

    Returns:
        The path of the written JSON file, or None if anything failed.
    """
    no_findings = "No key findings available for this section."

    try:
        with open(structured_data_filepath, "r", encoding="utf-8") as f:
            structured_data = json.load(f)

        # Helper function to extract the first N sentences
        def get_key_sentences(text, n):
            if not text or len(text.strip()) < 10:  # Handle empty or very short sections
                return no_findings
            sentences = nltk.sent_tokenize(text)
            return " ".join(sentences[:n]).strip()

        key_findings = {}
        key_findings["abstract"] = get_key_sentences(structured_data.get("abstract", ""), num_sentences)

        # For the introduction the closing sentences usually state the paper's
        # contribution. Tokenize once and reuse the result in every branch.
        intro_text = structured_data.get("introduction", "")
        intro_sentences = nltk.sent_tokenize(intro_text) if intro_text else []
        if len(intro_sentences) > num_sentences:
            # Index from the front so num_sentences == 0 yields nothing
            # (a "-0:" slice would return every sentence).
            tail = intro_sentences[len(intro_sentences) - num_sentences:]
            key_findings["introduction"] = " ".join(tail).strip()
        elif not intro_text or len(intro_text.strip()) < 10:
            key_findings["introduction"] = no_findings
        else:
            key_findings["introduction"] = " ".join(intro_sentences[:num_sentences]).strip()

        key_findings["conclusion"] = get_key_sentences(structured_data.get("conclusion", ""), num_sentences)

        structured_data["key_findings"] = key_findings

        # Construct output filename (same name as the input JSON)
        json_filename = os.path.basename(structured_data_filepath)
        os.makedirs(output_dir, exist_ok=True)
        output_filepath = os.path.join(output_dir, json_filename)

        with open(output_filepath, "w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=4, ensure_ascii=False)
        print(f"Extracted key findings and saved to '{output_filepath}'")
        return output_filepath

    except Exception as e:
        # Best-effort per file: report and let the caller continue with the next one.
        print(f"Error extracting key findings from '{structured_data_filepath}': {e}")
        return None

# Define input and output directories
input_structured_dir = "data/structured_sections"
output_key_findings_dir = "data/structured_sections"  # in-place update of the structured JSONs

# Collect every entry whose name ends in .json, ignoring case.
json_files = [
    entry
    for entry in os.listdir(input_structured_dir)
    if entry.lower().endswith('.json')
]

if len(json_files) == 0:
    print(f"No JSON files found in '{input_structured_dir}'. Please ensure structured JSONs are present.")
else:
    print(f"Found {len(json_files)} structured JSON files. Starting key findings extraction...")
    for json_file in json_files:
        full_json_path = os.path.join(input_structured_dir, json_file)
        extract_key_findings(full_json_path, output_key_findings_dir)

print("Key findings extraction process complete.")

Found 4 structured JSON files. Starting key findings extraction...
Extracted key findings and saved to 'data/structured_sections/Accuracy_and_Bias_Mitigation_in_GenAI__LLMbased_Financial_Underwriting_and_Clinical_Summarization_Systems.json'
Extracted key findings and saved to 'data/structured_sections/ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.json'
Extracted key findings and saved to 'data/structured_sections/Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.json'
Extracted key findings and saved to 'data/structured_sections/Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.json'
Key findings extraction process complete.


## Compare Key Findings Across Papers

In [None]:
import os
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Load Sentence Transformer Model
print("Loading Sentence Transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence Transformer model loaded.")

# Sentence that extract_key_findings stores for sections it could not
# summarize. It must not be embedded: papers consisting only of this text
# would otherwise compare as identical (similarity 1.0).
NO_FINDINGS_PLACEHOLDER = "No key findings available for this section."

# 2. Load All Key Findings
input_structured_dir = "data/structured_sections"

papers_data = []
json_files = [f for f in os.listdir(input_structured_dir) if f.lower().endswith('.json')]

if not json_files:
    print(f"No JSON files found in '{input_structured_dir}'. Cannot perform comparison.")
else:
    print(f"Found {len(json_files)} structured JSON files for comparison.")
    for json_file in json_files:
        full_json_path = os.path.join(input_structured_dir, json_file)
        try:
            # Named paper_json so the search results held in `data` by the
            # earlier cells are not overwritten.
            with open(full_json_path, "r", encoding="utf-8") as f:
                paper_json = json.load(f)

            title = os.path.splitext(json_file)[0].replace("_", " ")  # Derive title from filename
            key_findings_sections = paper_json.get("key_findings", {})

            combined_key_findings = []
            for section in ("abstract", "introduction", "conclusion"):
                section_text = key_findings_sections.get(section)
                if section_text and section_text != NO_FINDINGS_PLACEHOLDER:
                    combined_key_findings.append(section_text)

            full_key_findings_text = " ".join(combined_key_findings).strip()

            if full_key_findings_text:
                papers_data.append({
                    "title": title,
                    "text": full_key_findings_text,
                    "filepath": full_json_path
                })
            else:
                print(f"Warning: No key findings found for '{title}', skipping.")
        except Exception as e:
            print(f"Error loading or processing '{json_file}': {e}")

    if not papers_data:
        print("No papers with valid key findings to compare.")
    else:
        # 3. Generate Embeddings
        print(f"Generating embeddings for {len(papers_data)} papers...")
        corpus_sentences = [p["text"] for p in papers_data]
        # NumPy output is what scikit-learn expects; a tensor (possibly on a
        # GPU) would have to be converted first.
        corpus_embeddings = model.encode(corpus_sentences, convert_to_numpy=True)
        print("Embeddings generated.")

        # 4. Calculate Similarity
        print("Calculating cosine similarity...")
        cosine_scores = cosine_similarity(corpus_embeddings)
        print("Cosine similarity calculated.")

        # 5. Structure Comparison Results
        comparison_results = []
        for i, first in enumerate(papers_data):
            # Start at i + 1 to avoid self-comparison and duplicate pairs.
            for j in range(i + 1, len(papers_data)):
                comparison_results.append({
                    "paper1_title": first["title"],
                    "paper2_title": papers_data[j]["title"],
                    # Plain float so the value is JSON serializable.
                    "similarity_score": float(cosine_scores[i][j])
                })

        # Sort by similarity score in descending order
        comparison_results_sorted = sorted(comparison_results, key=lambda x: x["similarity_score"], reverse=True)

        # 6. Save Comparison Results
        output_comparisons_dir = "data/comparisons"
        os.makedirs(output_comparisons_dir, exist_ok=True)
        output_filepath = os.path.join(output_comparisons_dir, "comparison_results.json")

        with open(output_filepath, "w", encoding="utf-8") as f:
            json.dump(comparison_results_sorted, f, indent=4, ensure_ascii=False)
        print(f"Comparison results saved to: {output_filepath}")

        # 7. Provide a Summary/Example
        print("\n--- Top 5 Most Similar Paper Pairs ---")
        for rank, result in enumerate(comparison_results_sorted[:5], start=1):
            # Single quotes inside the replacement fields keep this valid on
            # Python versions older than 3.12.
            print(f"{rank}. \"{result['paper1_title']}\" vs \"{result['paper2_title']}\" (Similarity: {result['similarity_score']:.4f})")


Loading Sentence Transformer model...
Sentence Transformer model loaded.
Found 4 structured JSON files for comparison.
Generating embeddings for 4 papers...
Embeddings generated.
Calculating cosine similarity...
Cosine similarity calculated.
Comparison results saved to: data/comparisons/comparison_results.json

--- Top 5 Most Similar Paper Pairs ---
1. "Accuracy and Bias Mitigation in GenAI  LLMbased Financial Underwriting and Clinical Summarization Systems" vs "Revolutionizing Content Digestion Unleashing the Power of Bidirectional and AutoRegressive Transformers in AIPowered Automatic Text Summarization" (Similarity: 1.0000)
2. "ChartSumm A Comprehensive Benchmark for Automatic Chart Summarization of Long and Short Summaries" vs "Beyond Summarization Designing AI Support for RealWorld Expository Writing Tasks" (Similarity: 0.2274)
3. "Accuracy and Bias Mitigation in GenAI  LLMbased Financial Underwriting and Clinical Summarization Systems" vs "ChartSumm A Comprehensive Benchmark for 

In [None]:
import os
import re
import json

def structure_text_content(text_filepath, output_dir):
    """
    Extract the Abstract, Introduction and Conclusion of a paper into a JSON file.

    The raw text at ``text_filepath`` is searched with case-insensitive regular
    expressions for the three section headings. Each section body is captured up
    to the next recognised heading (or the end of the text), has its whitespace
    collapsed to single spaces, and is stored under the keys ``"abstract"``,
    ``"introduction"`` and ``"conclusion"``. Sections that are not found stay as
    empty strings. The result, together with ``"full_text_path"``, is written to
    ``<output_dir>/<input basename>.json``.

    Headings may carry optional numbering with or without a trailing dot
    (``1 Introduction``, ``1. Introduction``, ``I. Introduction``,
    ``5. Conclusion``, ``VI Conclusions``).

    Args:
        text_filepath: Path of the UTF-8 text file to read.
        output_dir: Directory for the JSON file; created if it does not exist.

    Returns:
        The path of the written JSON file, or ``None`` if anything failed (the
        error is printed rather than raised so a batch run keeps going).
    """
    # Abstract: heading on its own line, body runs until the introduction,
    # a keywords line or another recognised marker.
    abstract_pattern = (
        r"(?i)abstract\s*(?:\n|\r|\r\n)(.*?)"
        r"(?=\n\s*(?:(?:1|i)\s*\.?\s*introduction|keywords|ii\s*\.?\s*related work"
        r"|section|introduction:)|$)"
    )

    # Introduction: optional "1"/"I" numbering (dot optional); body runs until
    # the second section, the conclusion, the back matter or the end of text.
    introduction_pattern = (
        r"(?i)(?:^|\n)\s*(?:(?:1|i)\s*\.?\s*)?introduction(?:\s*:)?\s*(?:\n|\r|\r\n)(.*?)"
        r"(?=\n\s*(?:(?:2|ii)\s*\.?\s*|\d+\s*\.\s*)"
        r"(?:related work|background|methodology|literature review|section"
        r"|materials and methods|experiment|proposed method)"
        r"|\n\s*(?:(?:\d+|[ivx]+)\s*\.?\s*)?conclusions?"
        r"|\n\s*(?:acknowledge?ments|references|appendix)"
        r"|$)"
    )

    # Conclusion: optional arabic or roman numbering (dot optional); body runs
    # until acknowledgements/references/appendix or the end of text.
    conclusion_pattern = (
        r"(?i)(?:^|\n)\s*(?:(?:\d+|[ivx]+)\s*\.?\s*)?conclusion(?:s?)"
        r"(?:\s+and\s+discussion)?\s*(?:\n|\r|\r\n)(.*?)"
        r"(?=\n\s*(?:acknowledge?ments|references|appendix|copyright|further work)|$)"
    )

    section_patterns = (
        ("abstract", abstract_pattern),
        ("introduction", introduction_pattern),
        ("conclusion", conclusion_pattern),
    )

    try:
        with open(text_filepath, "r", encoding="utf-8") as f:
            full_text = f.read()

        structured_data = {
            "full_text_path": text_filepath,
            "abstract": "",
            "introduction": "",
            "conclusion": ""
        }

        for section_name, pattern in section_patterns:
            match = re.search(pattern, full_text, re.DOTALL)
            if match:
                # Collapse line breaks and runs of spaces left over from PDF extraction.
                structured_data[section_name] = re.sub(r'\s+', ' ', match.group(1).strip())

        # Construct output filename
        text_filename = os.path.basename(text_filepath)
        json_filename = os.path.splitext(text_filename)[0] + ".json"
        output_filepath = os.path.join(output_dir, json_filename)

        # Do not rely on the caller having created the directory beforehand.
        os.makedirs(output_dir, exist_ok=True)
        with open(output_filepath, "w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=4, ensure_ascii=False)
        print(f"Structured content from '{text_filename}' saved to '{output_filepath}'")
        return output_filepath

    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the whole batch.
        print(f"Error structuring content from '{text_filepath}': {e}")
        return None

# Define input and output directories
input_text_dir = "data/extracted_text"
output_structured_dir = "data/structured_sections"

# Ensure output directory exists (already done in previous step, but good practice)
os.makedirs(output_structured_dir, exist_ok=True)

# Collect every .txt file in the input directory. A missing directory is treated
# like an empty one so the message below is shown instead of a FileNotFoundError,
# and the names are sorted because os.listdir() returns them in arbitrary order.
if os.path.isdir(input_text_dir):
    txt_files = sorted(f for f in os.listdir(input_text_dir) if f.lower().endswith('.txt'))
else:
    txt_files = []

if not txt_files:
    print(f"No text files found in '{input_text_dir}'. Please ensure text files are present.")
else:
    print(f"Found {len(txt_files)} text files. Starting content structuring...")
    for txt_file in txt_files:
        full_text_path = os.path.join(input_text_dir, txt_file)
        # Failures are reported inside structure_text_content; keep going with the rest.
        structure_text_content(full_text_path, output_structured_dir)

print("Content structuring process complete.")


Found 4 text files. Starting content structuring...
Structured content from 'ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.txt' saved to 'data/structured_sections/ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.json'
Structured content from 'Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.txt' saved to 'data/structured_sections/Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.json'
Structured content from 'Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.txt' saved to 'data/structured_sections/Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.json'
Structured content from 'Accuracy_and_Bias_Mitigation_in_GenAI__LLMbased_Financial_Un

In [None]:
import os
import json
import re
import nltk

# Make sure the sentence-tokenizer data is installed before it is needed.
# 'punkt' is checked first, then 'punkt_tab' (explicitly requested as well,
# as suggested by the lookup error). Each one is downloaded quietly only
# when nltk cannot find it locally.
for _tokenizer_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find('tokenizers/' + _tokenizer_resource)
    except LookupError:
        nltk.download(_tokenizer_resource, quiet=True)

def extract_key_findings(structured_data_filepath, output_dir, num_sentences=2):
    """
    Add a ``"key_findings"`` entry to a structured-sections JSON file.

    The JSON at ``structured_data_filepath`` is loaded and a short extract is
    taken from its ``"abstract"``, ``"introduction"`` and ``"conclusion"``
    values:

    * abstract and conclusion: the first ``num_sentences`` sentences;
    * introduction: the last ``num_sentences`` sentences when it has more than
      that many (the closing sentences tend to state the contribution),
      otherwise the first ``num_sentences``.

    A section that is missing, empty or shorter than 10 characters gets the
    placeholder ``"No key findings available for this section."``. The updated
    document is written to ``<output_dir>/<same basename>``; when ``output_dir``
    is the directory of the input file, the input is overwritten.

    Args:
        structured_data_filepath: Path of the structured JSON file to read.
        output_dir: Directory for the updated JSON; created if it does not exist.
        num_sentences: Number of sentences to keep per section.

    Returns:
        The path of the written JSON file, or ``None`` if anything failed (the
        error is printed rather than raised so a batch run keeps going).
    """
    placeholder = "No key findings available for this section."

    def get_key_sentences(text, n, from_end=False):
        """Return n sentences of text (leading, or trailing if from_end) or the placeholder."""
        if not text or len(text.strip()) < 10:  # Handle empty or very short sections
            return placeholder
        sentences = nltk.sent_tokenize(text)
        if from_end and len(sentences) > n:
            # Explicit start index: a plain sentences[-n:] would return the
            # whole list when n == 0.
            selected = sentences[len(sentences) - n:]
        else:
            selected = sentences[:n]
        return " ".join(selected).strip()

    try:
        with open(structured_data_filepath, "r", encoding="utf-8") as f:
            structured_data = json.load(f)

        structured_data["key_findings"] = {
            "abstract": get_key_sentences(
                structured_data.get("abstract", ""), num_sentences
            ),
            "introduction": get_key_sentences(
                structured_data.get("introduction", ""), num_sentences, from_end=True
            ),
            "conclusion": get_key_sentences(
                structured_data.get("conclusion", ""), num_sentences
            ),
        }

        # Construct output filename (overwrite the original structured JSON)
        json_filename = os.path.basename(structured_data_filepath)
        output_filepath = os.path.join(output_dir, json_filename)

        # Do not rely on the caller having created the directory beforehand.
        os.makedirs(output_dir, exist_ok=True)
        with open(output_filepath, "w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=4, ensure_ascii=False)
        print(f"Extracted key findings and saved to '{output_filepath}'")
        return output_filepath

    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole batch.
        print(f"Error extracting key findings from '{structured_data_filepath}': {e}")
        return None

# Define input and output directories
input_structured_dir = "data/structured_sections"
output_key_findings_dir = "data/structured_sections" # Overwriting original structured JSONs

# Collect every .json file in the input directory. A missing directory is treated
# like an empty one so the message below is shown instead of a FileNotFoundError,
# and the names are sorted because os.listdir() returns them in arbitrary order.
if os.path.isdir(input_structured_dir):
    json_files = sorted(f for f in os.listdir(input_structured_dir) if f.lower().endswith('.json'))
else:
    json_files = []

if not json_files:
    print(f"No JSON files found in '{input_structured_dir}'. Please ensure structured JSONs are present.")
else:
    print(f"Found {len(json_files)} structured JSON files. Starting key findings extraction...")
    for json_file in json_files:
        full_json_path = os.path.join(input_structured_dir, json_file)
        # Failures are reported inside extract_key_findings; keep going with the rest.
        extract_key_findings(full_json_path, output_key_findings_dir)

print("Key findings extraction process complete.")


Found 4 structured JSON files. Starting key findings extraction...
Extracted key findings and saved to 'data/structured_sections/Accuracy_and_Bias_Mitigation_in_GenAI__LLMbased_Financial_Underwriting_and_Clinical_Summarization_Systems.json'
Extracted key findings and saved to 'data/structured_sections/ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.json'
Extracted key findings and saved to 'data/structured_sections/Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.json'
Extracted key findings and saved to 'data/structured_sections/Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.json'
Key findings extraction process complete.


In [None]:
import os
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise comparison of papers: the key findings of every structured JSON are
# embedded with a Sentence Transformer model and each pair of papers is scored
# by cosine similarity. Results are saved as JSON and the top pairs are printed.

# Text stored for a section when no key findings could be extracted. It must be
# excluded from the embedded text: papers that contain only this placeholder
# would otherwise be identical to each other and score a similarity of 1.0.
MISSING_SECTION_PLACEHOLDER = "No key findings available for this section."

# 1. Load Sentence Transformer Model
print("Loading Sentence Transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence Transformer model loaded.")

# 2. Load All Key Findings
input_structured_dir = "data/structured_sections"

papers_data = []
# A missing directory is treated like an empty one; names are sorted because
# os.listdir() returns them in arbitrary order and the pair order should be stable.
if os.path.isdir(input_structured_dir):
    json_files = sorted(f for f in os.listdir(input_structured_dir) if f.lower().endswith('.json'))
else:
    json_files = []

if not json_files:
    print(f"No JSON files found in '{input_structured_dir}'. Cannot perform comparison.")
else:
    print(f"Found {len(json_files)} structured JSON files for comparison.")
    for json_file in json_files:
        full_json_path = os.path.join(input_structured_dir, json_file)
        try:
            with open(full_json_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            title = os.path.splitext(json_file)[0].replace("_", " ") # Derive title from filename
            key_findings_sections = data.get("key_findings", {})

            # Keep only sections with real content, in abstract/introduction/conclusion order.
            combined_key_findings = [
                section_text
                for section_text in (
                    key_findings_sections.get(section_name)
                    for section_name in ("abstract", "introduction", "conclusion")
                )
                if section_text and section_text.strip() != MISSING_SECTION_PLACEHOLDER
            ]

            full_key_findings_text = " ".join(combined_key_findings).strip()

            if full_key_findings_text:
                papers_data.append({
                    "title": title,
                    "text": full_key_findings_text,
                    "filepath": full_json_path
                })
            else:
                print(f"Warning: No key findings found for '{title}', skipping.")
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the comparison.
            print(f"Error loading or processing '{json_file}': {e}")

    if not papers_data:
        print("No papers with valid key findings to compare.")
    else:
        # 3. Generate Embeddings
        print(f"Generating embeddings for {len(papers_data)} papers...")
        corpus_sentences = [p["text"] for p in papers_data]
        # NumPy output: scikit-learn cannot consume a torch tensor that lives on a GPU.
        corpus_embeddings = model.encode(corpus_sentences, convert_to_numpy=True)
        print("Embeddings generated.")

        # 4. Calculate Similarity
        print("Calculating cosine similarity...")
        cosine_scores = cosine_similarity(corpus_embeddings)
        print("Cosine similarity calculated.")

        # 5. Structure Comparison Results
        comparison_results = []
        for i in range(len(papers_data)):
            for j in range(i + 1, len(papers_data)): # Avoid self-comparison and duplicate pairs
                similarity = cosine_scores[i][j]
                comparison_results.append({
                    "paper1_title": papers_data[i]["title"],
                    "paper2_title": papers_data[j]["title"],
                    "similarity_score": float(similarity) # Convert to float for JSON serialization
                })

        # Sort by similarity score in descending order
        comparison_results_sorted = sorted(comparison_results, key=lambda x: x["similarity_score"], reverse=True)

        # 6. Save Comparison Results
        output_comparisons_dir = "data/comparisons"
        os.makedirs(output_comparisons_dir, exist_ok=True)
        output_filepath = os.path.join(output_comparisons_dir, "comparison_results.json")

        with open(output_filepath, "w", encoding="utf-8") as f:
            json.dump(comparison_results_sorted, f, indent=4, ensure_ascii=False)
        print(f"Comparison results saved to: {output_filepath}")

        # 7. Provide a Summary/Example
        print("\n--- Top 5 Most Similar Paper Pairs ---")
        for rank, result in enumerate(comparison_results_sorted[:5], start=1):
            # Single-quoted f-string: reusing double quotes inside the replacement
            # fields is a syntax error on Python versions before 3.12.
            print(f'{rank}. "{result["paper1_title"]}" vs "{result["paper2_title"]}" (Similarity: {result["similarity_score"]:.4f})')

Loading Sentence Transformer model...
Sentence Transformer model loaded.
Found 4 structured JSON files for comparison.
No papers with valid key findings to compare.


In [None]:
import os
import re
import json

def structure_text_content(text_filepath, output_dir):
    """
    Extract the Abstract, Introduction and Conclusion of a paper into a JSON file.

    The raw text at ``text_filepath`` is searched with case-insensitive regular
    expressions for the three section headings. Each section body is captured up
    to the next recognised heading (or the end of the text), has its whitespace
    collapsed to single spaces, and is stored under the keys ``"abstract"``,
    ``"introduction"`` and ``"conclusion"``. Sections that are not found stay as
    empty strings. The result, together with ``"full_text_path"``, is written to
    ``<output_dir>/<input basename>.json``.

    Headings may carry optional numbering with or without a trailing dot
    (``1 Introduction``, ``1. Introduction``, ``I. Introduction``,
    ``5. Conclusion``, ``VI Conclusions``).

    Args:
        text_filepath: Path of the UTF-8 text file to read.
        output_dir: Directory for the JSON file; created if it does not exist.

    Returns:
        The path of the written JSON file, or ``None`` if anything failed (the
        error is printed rather than raised so a batch run keeps going).
    """
    # Abstract: heading on its own line, body runs until the introduction,
    # a keywords line or another recognised marker.
    abstract_pattern = (
        r"(?i)abstract\s*(?:\n|\r|\r\n)(.*?)"
        r"(?=\n\s*(?:(?:1|i)\s*\.?\s*introduction|keywords|ii\s*\.?\s*related work"
        r"|section|introduction:)|$)"
    )

    # Introduction: optional "1"/"I" numbering (dot optional); body runs until
    # the second section, the conclusion, the back matter or the end of text.
    introduction_pattern = (
        r"(?i)(?:^|\n)\s*(?:(?:1|i)\s*\.?\s*)?introduction(?:\s*:)?\s*(?:\n|\r|\r\n)(.*?)"
        r"(?=\n\s*(?:(?:2|ii)\s*\.?\s*|\d+\s*\.\s*)"
        r"(?:related work|background|methodology|literature review|section"
        r"|materials and methods|experiment|proposed method)"
        r"|\n\s*(?:(?:\d+|[ivx]+)\s*\.?\s*)?conclusions?"
        r"|\n\s*(?:acknowledge?ments|references|appendix)"
        r"|$)"
    )

    # Conclusion: optional arabic or roman numbering (dot optional); body runs
    # until acknowledgements/references/appendix or the end of text.
    conclusion_pattern = (
        r"(?i)(?:^|\n)\s*(?:(?:\d+|[ivx]+)\s*\.?\s*)?conclusion(?:s?)"
        r"(?:\s+and\s+discussion)?\s*(?:\n|\r|\r\n)(.*?)"
        r"(?=\n\s*(?:acknowledge?ments|references|appendix|copyright|further work)|$)"
    )

    section_patterns = (
        ("abstract", abstract_pattern),
        ("introduction", introduction_pattern),
        ("conclusion", conclusion_pattern),
    )

    try:
        with open(text_filepath, "r", encoding="utf-8") as f:
            full_text = f.read()

        structured_data = {
            "full_text_path": text_filepath,
            "abstract": "",
            "introduction": "",
            "conclusion": ""
        }

        for section_name, pattern in section_patterns:
            match = re.search(pattern, full_text, re.DOTALL)
            if match:
                # Collapse line breaks and runs of spaces left over from PDF extraction.
                structured_data[section_name] = re.sub(r'\s+', ' ', match.group(1).strip())

        # Construct output filename
        text_filename = os.path.basename(text_filepath)
        json_filename = os.path.splitext(text_filename)[0] + ".json"
        output_filepath = os.path.join(output_dir, json_filename)

        # Do not rely on the caller having created the directory beforehand.
        os.makedirs(output_dir, exist_ok=True)
        with open(output_filepath, "w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=4, ensure_ascii=False)
        print(f"Structured content from '{text_filename}' saved to '{output_filepath}'")
        return output_filepath

    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the whole batch.
        print(f"Error structuring content from '{text_filepath}': {e}")
        return None

# Define input and output directories
input_text_dir = "data/extracted_text"
output_structured_dir = "data/structured_sections"

# Ensure output directory exists (already done in previous step, but good practice)
os.makedirs(output_structured_dir, exist_ok=True)

# Collect every .txt file in the input directory. A missing directory is treated
# like an empty one so the message below is shown instead of a FileNotFoundError,
# and the names are sorted because os.listdir() returns them in arbitrary order.
if os.path.isdir(input_text_dir):
    txt_files = sorted(f for f in os.listdir(input_text_dir) if f.lower().endswith('.txt'))
else:
    txt_files = []

if not txt_files:
    print(f"No text files found in '{input_text_dir}'. Please ensure text files are present.")
else:
    print(f"Found {len(txt_files)} text files. Starting content structuring...")
    for txt_file in txt_files:
        full_text_path = os.path.join(input_text_dir, txt_file)
        # Failures are reported inside structure_text_content; keep going with the rest.
        structure_text_content(full_text_path, output_structured_dir)

print("Content structuring process complete.")

Found 4 text files. Starting content structuring...
Structured content from 'ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.txt' saved to 'data/structured_sections/ChartSumm_A_Comprehensive_Benchmark_for_Automatic_Chart_Summarization_of_Long_and_Short_Summaries.json'
Structured content from 'Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.txt' saved to 'data/structured_sections/Revolutionizing_Content_Digestion_Unleashing_the_Power_of_Bidirectional_and_AutoRegressive_Transformers_in_AIPowered_Automatic_Text_Summarization.json'
Structured content from 'Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.txt' saved to 'data/structured_sections/Beyond_Summarization_Designing_AI_Support_for_RealWorld_Expository_Writing_Tasks.json'
Structured content from 'Accuracy_and_Bias_Mitigation_in_GenAI__LLMbased_Financial_Un