In [1]:
!pip install semanticscholar langchain langgraph openai PyMuPDF4LLM python-dotenv gradio

Collecting semanticscholar
  Downloading semanticscholar-0.11.0-py3-none-any.whl.metadata (3.8 kB)
Collecting PyMuPDF4LLM
  Downloading pymupdf4llm-0.2.7-py3-none-any.whl.metadata (7.5 kB)
Collecting pymupdf>=1.26.6 (from PyMuPDF4LLM)
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading semanticscholar-0.11.0-py3-none-any.whl (26 kB)
Downloading pymupdf4llm-0.2.7-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, PyMuPDF4LLM, semanticscholar
Successfully installed PyMuPDF4LLM-0.2.7 pymupdf-1.26.7 semanticscholar-0.11.0


In [2]:
import os

with open(".env", "w") as f:
    f.write("SEMANTIC_SCHOLAR_API_KEY=QyzAnc3la76icrOJH4oc72S3PG0c4DAOPO6sjb6e\n")

In [3]:
import os
import json
import requests
import logging
import re
from datetime import datetime
from semanticscholar import SemanticScholar
from dotenv import load_dotenv

# Configure Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)

# Load Environment Variables
load_dotenv()

# API Key Handling
API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
if not API_KEY:
    print("API Key not found in environment.")
    API_KEY = input("Please enter your Semantic Scholar API Key: ").strip()

# Initialize Client
sch = SemanticScholar(api_key=API_KEY)

class ResearchAssistant:
    def __init__(self, data_dir="data"):
        # Set the target directory directly to data/pdfs
        self.data_dir = data_dir
        self.pdf_save_path = os.path.join(self.data_dir, "pdfs")

        # Ensure the directory exists
        os.makedirs(self.pdf_save_path, exist_ok=True)

    def sanitize_filename(self, text):
        """Creates a safe filename from a string."""
        s = re.sub(r'[\\/*?:"<>|]', "", text)
        return s.strip()[:100]

    def get_search_parameters(self):
        """Captures user input for dynamic research topics."""
        print("\n--- Research Configuration ---")
        topic = input("Enter research topic: ").strip()

        print("Optional Filters (press Enter to skip):")
        min_year = input("  Minimum Publication Year (e.g., 2020): ").strip()
        min_citations = input("  Minimum Citations (e.g., 10): ").strip()

        return {
            "topic": topic,
            "min_year": int(min_year) if min_year.isdigit() else 2015,
            "min_citations": int(min_citations) if min_citations.isdigit() else 0
        }

    def search_and_rank(self, params, fetch_limit=50, selection_limit=3):
        """Searches papers, filters them, and ranks them."""
        logger.info(f"Searching for papers on: {params['topic']}")
        try:
            results = sch.search_paper(
                params['topic'],
                limit=fetch_limit,
                fields=['title', 'authors', 'year', 'citationCount', 'openAccessPdf', 'url', 'abstract', 'venue']
            )
        except Exception as e:
            logger.error(f"Search API failed: {str(e)}")
            return []

        candidates = []
        for paper in results:
            if not paper.title or not paper.year:
                continue
            if paper.year < params['min_year']:
                continue
            if (paper.citationCount or 0) < params['min_citations']:
                continue

            recency_score = max(0, (paper.year - (datetime.now().year - 5))) * 2
            citations = paper.citationCount or 0
            impact_score = min(citations / 10, 20)
            pdf_url = paper.openAccessPdf['url'] if paper.openAccessPdf else None
            access_score = 50 if pdf_url else 0
            total_score = recency_score + impact_score + access_score

            candidates.append({
                "paperId": paper.paperId,
                "title": paper.title,
                "authors": [a['name'] for a in paper.authors] if paper.authors else [],
                "year": paper.year,
                "citations": citations,
                "venue": paper.venue,
                "url": paper.url,
                "pdf_url": pdf_url,
                "score": total_score
            })

        candidates.sort(key=lambda x: x['score'], reverse=True)
        selected = candidates[:selection_limit]
        logger.info(f"Screened {len(candidates)} papers. Selected top {len(selected)}.")
        return selected

    def download_pdfs(self, papers, topic):
        """Downloads PDFs directly into data/pdfs/."""
        logger.info(f"Starting PDF downloads into: {self.pdf_save_path}")

        successful_downloads = []
        for paper in papers:
            if not paper['pdf_url']:
                logger.warning(f"Skipping download (No Open Access URL): {paper['title']}")
                continue

            # Generate filename
            first_author = paper['authors'][0].split()[-1] if paper['authors'] else "Unknown"
            safe_title = self.sanitize_filename(paper['title'])
            filename = f"{paper['year']}_{first_author}_{safe_title}.pdf"

            # Simple save path: data/pdfs/filename.pdf
            save_path = os.path.join(self.pdf_save_path, filename)

            try:
                response = requests.get(paper['pdf_url'], timeout=30)
                if response.status_code == 200 and b"%PDF" in response.content[:20]:
                    with open(save_path, "wb") as f:
                        f.write(response.content)

                    paper['local_path'] = save_path
                    paper['download_status'] = "Success"
                    successful_downloads.append(paper)
                    logger.info(f"Downloaded: {filename}")
                else:
                    logger.warning(f"Invalid PDF content for: {paper['title']}")
            except Exception as e:
                logger.error(f"Download error for {paper['title']}: {e}")

        # Save Metadata in data/pdfs/ as well
        metadata_path = os.path.join(self.pdf_save_path, "dataset_metadata.json")
        with open(metadata_path, "w") as f:
            json.dump(papers, f, indent=4)

        logger.info(f"Process Complete. Metadata saved to: {metadata_path}")
        return successful_downloads

if __name__ == "__main__":
    assistant = ResearchAssistant()
    params = assistant.get_search_parameters()

    if params['topic']:
        top_papers = assistant.search_and_rank(params)
        if top_papers:
            print(f"\nTop {len(top_papers)} Papers Selected:")
            for i, p in enumerate(top_papers, 1):
                print(f"{i}. [{p['year']}] {p['title']} (Citations: {p['citations']})")

            assistant.download_pdfs(top_papers, params['topic'])
        else:
            logger.warning("No suitable papers found matching criteria.")
    else:
        logger.error("Topic is required.")



--- Research Configuration ---
Enter research topic: machine learning
Optional Filters (press Enter to skip):
  Minimum Publication Year (e.g., 2020): 2020
  Minimum Citations (e.g., 10): 5

Top 3 Papers Selected:
1. [2024] Evaluation metrics and statistical tests for machine learning (Citations: 709)
2. [2024] Leveraging large language models for predictive chemistry (Citations: 297)
3. [2023] Understanding of Machine Learning with Deep Learning: Architectures, Workflow, Applications and Future Directions (Citations: 729)




In [4]:
import json
import os
import re
import logging
import concurrent.futures
from typing import List, Dict, Optional, Tuple
from datetime import datetime
from collections import defaultdict, Counter

# ==================== CONFIGURATION ====================
class Config:
    DATA_DIR = "data"
    PDF_DIR = os.path.join(DATA_DIR, "pdfs")
    OUTPUT_DIR = os.path.join(DATA_DIR, "processed")
    LOGS_DIR = "logs"

    # Files
    MASTER_DATA = os.path.join(OUTPUT_DIR, "master_extraction.json")
    FINDINGS_DATA = os.path.join(OUTPUT_DIR, "key_findings.json")
    VALIDATION_REPORT = os.path.join(OUTPUT_DIR, "validation_report.json")

    # Section mapping (Compiled Regex for speed)
    SECTIONS = {
        'abstract': re.compile(r'(?i)^abstract\s*$|^summary\s*$'),
        'intro': re.compile(r'(?i)^1\.?\s*introduction|^introduction\s*$'),
        'method': re.compile(r'(?i)^3\.?\s*method|^method\s*$|^approach\s*$'),
        'results': re.compile(r'(?i)^4\.?\s*results|^results\s*$|^evaluation\s*$'),
        'conclusion': re.compile(r'(?i)^6\.?\s*conclusion|^conclusion\s*$')
    }

    @staticmethod
    def init():
        for d in [Config.OUTPUT_DIR, Config.LOGS_DIR]:
            os.makedirs(d, exist_ok=True)

# ==================== ANALYTICS ENGINE ====================
class AnalyticsEngine:
    """Extracts findings and validates data quality"""

    def __init__(self):
        # Patterns for key findings
        self.contribution_patterns = [
            re.compile(r'(?i)(?:we|this paper|our)\s+(?:propose|present|introduce|develop)'),
            re.compile(r'(?i)main\s+contribution')
        ]
        self.result_patterns = [
            re.compile(r'(?i)(?:results|experiments)\s+(?:show|demonstrate|indicate)'),
            re.compile(r'(\d+(?:\.\d+)?)\s*%'), # Percentage results
            re.compile(r'(?i)accuracy\s+of\s+(\d+(?:\.\d+)?)')
        ]

    def extract_key_findings(self, paper: Dict) -> Dict:
        """Heuristic-based extraction of research highlights"""
        findings = {
            'filename': paper['filename'],
            'contributions': [],
            'numerical_results': [],
            'top_terms': []
        }

        full_text = paper.get('raw_text', '')
        sentences = re.split(r'(?<=[.!?])\s+', full_text)

        # 1. Identify Contributions
        for sent in sentences:
            if any(p.search(sent) for p in self.contribution_patterns):
                if 20 < len(sent) < 300:
                    findings['contributions'].append(sent.strip())

        # 2. Extract Numerical Results
        for sent in sentences:
            if any(p.search(sent) for p in self.result_patterns):
                findings['numerical_results'].append(sent.strip())

        # 3. Frequency Analysis
        words = re.findall(r'\b[a-z]{5,}\b', full_text.lower())
        findings['top_terms'] = [w for w, c in Counter(words).most_common(10)]

        return findings

    def validate_quality(self, paper: Dict) -> Dict:
        """Calculates a quality score based on extraction completeness"""
        score = 0
        issues = []

        # Criterion 1: Length
        word_count = paper['metadata']['word_count']
        if word_count > 2000: score += 40
        elif word_count > 500: score += 20
        else: issues.append("Critically low word count")

        # Criterion 2: Section Density
        sections_found = len(paper.get('sections', {}))
        if sections_found >= 4: score += 30
        else: issues.append(f"Missing major sections (found {sections_found})")

        # Criterion 3: Formatting
        if len(paper.get('markdown', '')) > len(paper.get('raw_text', '')): score += 30
        else: issues.append("Markdown conversion suboptimal")

        return {
            'filename': paper['filename'],
            'score': score,
            'status': 'PASS' if score >= 60 else 'FLAG',
            'issues': issues
        }

# ==================== MAIN PIPELINE ====================
class Milestone2Pipeline:
    def __init__(self):
        Config.init()
        self.logger = self._setup_logger()
        self.analytics = AnalyticsEngine()

    def _setup_logger(self):
        logger = logging.getLogger("Pipeline")
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)
        return logger

    def process_pdf(self, path: str):
        """Unified extraction task for parallel execution"""
        filename = os.path.basename(path)
        try:
            # Stage 1: Extraction
            doc = fitz.open(path)
            raw = "\n".join([p.get_text() for p in doc])
            md = pymupdf4llm.to_markdown(path)
            doc.close()

            paper_data = {
                'filename': filename,
                'raw_text': raw,
                'markdown': md,
                'metadata': {'word_count': len(raw.split())},
                'sections': self._split_sections(raw)
            }

            # Stage 2: Analysis & Validation
            findings = self.analytics.extract_key_findings(paper_data)
            validation = self.analytics.validate_quality(paper_data)

            return paper_data, findings, validation
        except Exception as e:
            return None, None, {'filename': filename, 'status': 'FAIL', 'issues': [str(e)]}

    def _split_sections(self, text: str) -> Dict:
        sections = {}
        # Simple split logic based on Config.SECTIONS
        return sections # Placeholder for logic in previous step

    def run(self):
        paths = [os.path.join(Config.PDF_DIR, f) for f in os.listdir(Config.PDF_DIR) if f.endswith('.pdf')]
        self.logger.info(f"Initiating high-speed processing for {len(paths)} files")

        all_papers, all_findings, all_validations = [], [], []

        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = list(executor.map(self.process_pdf, paths))

        for p, f, v in results:
            if p: all_papers.append(p)
            if f: all_findings.append(f)
            all_validations.append(v)

        # Persistence
        self._save(all_papers, Config.MASTER_DATA)
        self._save(all_findings, Config.FINDINGS_DATA)
        self._save(all_validations, Config.VALIDATION_REPORT)

        success_rate = len([v for v in all_validations if v['status'] == 'PASS'])
        self.logger.info(f"Pipeline Complete. Success Rate: {success_rate}/{len(paths)}")

    def _save(self, data, path):
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

if __name__ == "__main__":
    Milestone2Pipeline().run()


2025-12-25 02:20:50,618 - INFO - Initiating high-speed processing for 2 files
INFO:Pipeline:Initiating high-speed processing for 2 files
2025-12-25 02:20:50,653 - INFO - Pipeline Complete. Success Rate: 0/2
INFO:Pipeline:Pipeline Complete. Success Rate: 0/2
