In [1]:
#Required packages
import sys
import subprocess

def install_requirements():
    """Ensure PyMuPDF (imported as ``fitz``) is available, installing it if needed.

    Tries to import ``fitz``. On ImportError it runs ``pip install PyMuPDF -q``
    with the current interpreter (``sys.executable``) so the package is installed
    for the interpreter that is executing this code.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a non-zero status.
    """
    try:
        import fitz  # noqa: F401 -- imported only to probe availability
        print("✓ PyMuPDF already installed")
    except ImportError:
        import importlib

        print("Installing PyMuPDF...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "PyMuPDF", "-q"])
        # The package appeared after the interpreter started: clear the import
        # system's finder caches so a subsequent `import fitz` can see it.
        importlib.invalidate_caches()
        print("✓ PyMuPDF installed successfully")

# Install dependencies up front: this must run before the module-level
# `import fitz` further down, which would otherwise fail on a fresh environment.
install_requirements()

import json
import os
import re
import fitz  # PyMuPDF
from datetime import datetime
from collections import defaultdict, Counter
import hashlib

# ====================
# 1. PDF TEXT EXTRACTION
# ====================

class PDFTextExtractor:
    """
    Extracts text from a PDF with PyMuPDF (``fitz``) and applies heuristic cleaning.

    Typical use is ``extract_and_clean()`` followed by ``close()``.

    Attributes:
        pdf_path: path handed to ``fitz.open``.
        doc: the open document, or None when nothing is open.
        raw_text: page texts joined with newlines (set by ``extract_raw_text``).
        cleaned_text: result of the most recent ``clean_text`` call.
        metadata: ``page_count``, ``file_size_mb`` and ``file_name`` (set by ``open_pdf``).
    """

    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.doc = None
        self.raw_text = ""
        self.cleaned_text = ""
        self.metadata = {}

    def open_pdf(self):
        """Open the PDF and record basic metadata.

        Returns:
            True on success. False, after printing the error, if the file could
            not be opened or its metadata could not be collected.
        """
        try:
            self.doc = fitz.open(self.pdf_path)
            self.metadata = {
                'page_count': len(self.doc),
                'file_size_mb': os.path.getsize(self.pdf_path) / (1024 * 1024),
                'file_name': os.path.basename(self.pdf_path)
            }
            return True
        except Exception as e:
            print(f"Error opening PDF: {e}")
            # Do not leave a document open behind a failed metadata lookup.
            self.close()
            return False

    def extract_raw_text(self):
        """Extract plain text from every page.

        Returns:
            The page texts joined with ``"\\n"`` (also stored in ``raw_text``),
            or ``""`` when no document is open.
        """
        # Compare against None rather than testing truthiness: the document
        # supports len(), so a truthiness test would depend on its page count.
        if self.doc is None:
            return ""

        text_blocks = [page.get_text("text") for page in self.doc]

        self.raw_text = "\n".join(text_blocks)
        return self.raw_text

    def clean_text(self, text):
        """
        Clean and preprocess extracted text.

        Steps, in order:
        - drop lines that hold only a number or ``Page N`` (page numbers)
        - re-join words hyphenated across a line break
        - collapse runs of blank lines and of spaces/tabs
        - drop single lines shaped like ``Capitalised words 2021`` (running
          headers/footers)
        - strip every line and discard empty ones

        Args:
            text: raw text to clean.

        Returns:
            The cleaned text (also stored in ``cleaned_text``); ``""`` for empty input.
        """
        if not text:
            return ""

        # Remove page numbers (common patterns)
        text = re.sub(r'\n\s*\d+\s*\n', '\n', text)
        text = re.sub(r'\n\s*Page\s+\d+\s*\n', '\n', text, flags=re.IGNORECASE)

        # Fix hyphenation at line breaks
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)

        # Remove header/footer patterns (common patterns).
        # The character class deliberately excludes newlines: with `\s` the match
        # could run across several lines and delete whole sentences of body text
        # that merely happened to end in a four-digit year.
        text = re.sub(r'\n[A-Z][a-z \t]+\d{4}\n', '\n', text)

        # Normalize line breaks for better readability
        lines = text.split('\n')
        cleaned_lines = []

        for i, line in enumerate(lines):
            line = line.strip()
            if line:
                # If line is short and the next non-empty line starts with a
                # lowercase letter, it is treated as a probable continuation.
                # NOTE(review): this only appends a trailing space; the lines are
                # still joined with '\n' below, so they are not actually merged.
                # Kept as is, since merging could swallow section headers.
                if len(line) < 80 and i + 1 < len(lines) and lines[i + 1].strip():
                    next_line = lines[i + 1].strip()
                    if next_line and next_line[0].islower():
                        line = line + ' '
                cleaned_lines.append(line)

        self.cleaned_text = '\n'.join(cleaned_lines)
        return self.cleaned_text

    def extract_and_clean(self):
        """Open the PDF, extract its text and clean it.

        The document is left open; call ``close()`` when done.

        Returns:
            A dict with ``raw_text``, ``cleaned_text``, ``metadata``,
            ``char_count`` and ``word_count`` (both counted on the cleaned
            text), or None if the PDF could not be opened.
        """
        if not self.open_pdf():
            return None

        self.extract_raw_text()
        self.clean_text(self.raw_text)

        return {
            'raw_text': self.raw_text,
            'cleaned_text': self.cleaned_text,
            'metadata': self.metadata,
            'char_count': len(self.cleaned_text),
            'word_count': len(self.cleaned_text.split())
        }

    def close(self):
        """Close the PDF document, if one is open. Safe to call repeatedly."""
        # Detach the handle first so a second call is a no-op rather than an
        # operation on an already-closed document.
        doc, self.doc = self.doc, None
        if doc is not None:
            doc.close()

# ====================
# 2. SECTION DETECTION & EXTRACTION
# ====================

class SectionExtractor:
    """
    Detects section headers in a research paper's text and slices the text
    into named sections.

    A header is a whole line that, lower-cased, matches one of
    ``SECTION_PATTERNS`` (an optional leading number such as ``3.`` is allowed).
    Text that precedes the first detected header is not assigned to any section.

    Attributes:
        text: the full text being analysed.
        sections: mapping of section name -> section text, filled by
            ``extract_sections``.
    """

    # Common section headers in research papers
    SECTION_PATTERNS = [
        r'^\s*abstract\s*$',
        r'^\s*\d*\.?\s*introduction\s*$',
        r'^\s*\d*\.?\s*related\s+work\s*$',
        r'^\s*\d*\.?\s*background\s*$',
        r'^\s*\d*\.?\s*literature\s+review\s*$',
        r'^\s*\d*\.?\s*methodology\s*$',
        r'^\s*\d*\.?\s*methods\s*$',
        r'^\s*\d*\.?\s*approach\s*$',
        r'^\s*\d*\.?\s*experiments?\s*$',
        r'^\s*\d*\.?\s*results?\s*$',
        r'^\s*\d*\.?\s*evaluation\s*$',
        r'^\s*\d*\.?\s*discussion\s*$',
        r'^\s*\d*\.?\s*conclusions?\s*$',
        r'^\s*\d*\.?\s*future\s+work\s*$',
        r'^\s*\d*\.?\s*references?\s*$',
        r'^\s*\d*\.?\s*bibliography\s*$',
    ]

    # Header spellings folded into an existing section name, so that a paper
    # titled "Conclusions" is stored under the same 'conclusion' key as before.
    _NAME_ALIASES = {'conclusions': 'conclusion'}

    def __init__(self, text):
        self.text = text
        self.sections = {}

    def detect_sections(self):
        """
        Find section header lines.

        Returns:
            A list of dicts in document order, each with ``line_num`` (0-based
            index into ``text.split('\\n')``), ``name`` (lower-cased header
            without its leading number) and ``original`` (the stripped line).
        """
        section_markers = []

        for i, line in enumerate(self.text.split('\n')):
            line_lower = line.lower().strip()

            if any(re.match(pattern, line_lower) for pattern in self.SECTION_PATTERNS):
                # Drop a leading "3." / "3 " numbering to get the bare name.
                section_name = re.sub(r'^\d+\.?\s*', '', line_lower).strip()
                section_markers.append({
                    'line_num': i,
                    'name': self._NAME_ALIASES.get(section_name, section_name),
                    'original': line.strip()
                })

        return section_markers

    def extract_sections(self):
        """
        Extract the text belonging to each detected section.

        Each section runs from the line after its header up to the next header
        (or the end of the text). When the same header occurs more than once,
        the pieces are concatenated (separated by a blank line) instead of the
        later one replacing the earlier one.

        Returns:
            ``self.sections``. If no header is found it is
            ``{'full_text': <entire text>}``.
        """
        markers = self.detect_sections()

        if not markers:
            # If no sections found, return entire text as 'full_text'
            self.sections = {'full_text': self.text}
            return self.sections

        lines = self.text.split('\n')
        # Start from a clean mapping so repeated calls do not re-append text.
        sections = {}

        for i, marker in enumerate(markers):
            start_line = marker['line_num'] + 1  # Start after header

            # End is either next section or end of document
            if i + 1 < len(markers):
                end_line = markers[i + 1]['line_num']
            else:
                end_line = len(lines)

            section_text = '\n'.join(lines[start_line:end_line]).strip()

            section_name = marker['name']
            previous = sections.get(section_name)
            if previous and section_text:
                sections[section_name] = previous + '\n\n' + section_text
            elif not previous:
                sections[section_name] = section_text

        self.sections = sections
        return self.sections

    def get_section(self, section_name):
        """Return the text of a section by (case-insensitive) name, or ``""``."""
        key = section_name.lower().strip()
        if key in self.sections:
            return self.sections[key]
        return self.sections.get(self._NAME_ALIASES.get(key, key), "")

    def get_section_summary(self):
        """Return ``{name: {'char_count', 'word_count', 'line_count'}}`` for each section."""
        summary = {}
        for name, text in self.sections.items():
            summary[name] = {
                'char_count': len(text),
                'word_count': len(text.split()),
                # An empty section has zero lines, not one.
                'line_count': text.count('\n') + 1 if text else 0
            }
        return summary

# ====================
# 3. KEY FINDINGS EXTRACTION
# ====================

class KeyFindingsExtractor:
    """
    Extracts key findings, contributions, and insights from a paper's sections.

    Works on a ``{section_name: text}`` mapping. Sentences from the abstract
    and the conclusion are kept when they match one of ``FINDING_INDICATORS``;
    sentences from the results/experiments section are kept when they contain
    a number.

    Each finding is a dict with ``text``, ``source`` and ``type``.
    """

    # Patterns that often indicate key findings
    FINDING_INDICATORS = [
        r'we\s+(demonstrate|show|find|observe|report|achieve|present|propose|introduce)',
        r'our\s+(results|findings|experiments|approach|method|work)\s+(show|demonstrate|indicate|suggest|reveal)',
        r'(significant|substantial|notable|important)\s+(improvement|increase|decrease|reduction|performance)',
        r'(outperforms?|better\s+than|superior\s+to|achieves?\s+state-of-the-art)',
        r'(contributes?|contribution|novel|novelty)',
        r'(in\s+conclusion|to\s+summarize|in\s+summary)',
    ]

    # A sentence ends at terminal punctuation that is followed by whitespace.
    # Requiring the whitespace keeps decimals such as "95.3%" in one piece.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    # Pattern for percentages, decimals, metrics
    _NUMBER_PATTERN = r'\d+\.?\d*\s*%|\d+\.?\d*'

    # Sentences with fewer words than this are ignored.
    _MIN_SENTENCE_WORDS = 5

    def __init__(self, sections_dict):
        self.sections = sections_dict
        self.findings = []

    def _split_sentences(self, text):
        """Split text into sentences of at least ``_MIN_SENTENCE_WORDS`` words.

        Internal whitespace (including line breaks) is collapsed to single
        spaces and terminal punctuation is removed.
        """
        sentences = []
        for chunk in self._SENTENCE_BOUNDARY.split(text):
            sentence = ' '.join(chunk.split()).rstrip('.!?').strip()
            if len(sentence.split()) >= self._MIN_SENTENCE_WORDS:
                sentences.append(sentence)
        return sentences

    def _first_section(self, *names):
        """Return the text of the first non-empty section among ``names``, else ``''``."""
        for name in names:
            text = self.sections.get(name, '')
            if text:
                return text
        return ''

    def _indicator_findings(self, text, source):
        """Return a finding for every sentence of ``text`` matching a finding indicator."""
        findings = []
        for sentence in self._split_sentences(text):
            sentence_lower = sentence.lower()
            if any(re.search(pattern, sentence_lower) for pattern in self.FINDING_INDICATORS):
                findings.append({
                    'text': sentence,
                    'source': source,
                    'type': 'key_finding'
                })
        return findings

    def extract_from_abstract(self):
        """Extract findings from the ``abstract`` section (empty list if absent)."""
        return self._indicator_findings(self.sections.get('abstract', ''), 'abstract')

    def extract_from_conclusion(self):
        """Extract findings from the ``conclusion`` (or ``conclusions``) section."""
        conclusion = self._first_section('conclusion', 'conclusions')
        return self._indicator_findings(conclusion, 'conclusion')

    def extract_numerical_results(self):
        """Extract up to 10 sentences containing numbers from the results section.

        Uses the first non-empty section among ``results``, ``experiments``,
        ``result`` and ``experiment``; the singular names are accepted because
        section headers may be spelled either way. The ``source`` of every
        returned finding is ``'results'``.
        """
        results_section = self._first_section('results', 'experiments', 'result', 'experiment')
        if not results_section:
            return []

        numerical_findings = [
            {
                'text': sentence,
                'source': 'results',
                'type': 'numerical_result'
            }
            for sentence in self._split_sentences(results_section)
            if re.search(self._NUMBER_PATTERN, sentence)
        ]

        return numerical_findings[:10]  # Limit to the first 10

    def extract_all_findings(self):
        """Collect findings from abstract, conclusion and results, in that order.

        Returns:
            The combined list, which also replaces ``self.findings``.
        """
        self.findings = []

        self.findings.extend(self.extract_from_abstract())
        self.findings.extend(self.extract_from_conclusion())
        self.findings.extend(self.extract_numerical_results())

        return self.findings

    def get_top_findings(self, n=5):
        """Return the ``n`` highest-priority findings.

        Findings are ranked by source (abstract > conclusion > results) and,
        within a source, by word count (longest first). Runs
        ``extract_all_findings`` first if ``self.findings`` is empty.
        """
        if not self.findings:
            self.extract_all_findings()

        # Prioritize: abstract > conclusion > numerical results
        priority = {'abstract': 3, 'conclusion': 2, 'results': 1}

        sorted_findings = sorted(
            self.findings,
            key=lambda x: (priority.get(x['source'], 0), len(x['text'].split())),
            reverse=True
        )

        return sorted_findings[:n]

# ====================
# 4. CROSS-PAPER COMPARISON
# ====================

class CrossPaperAnalyzer:
    """
    Compares and analyzes multiple processed papers.

    Each paper is a dict; the keys read here are ``cleaned_text``,
    ``metadata['file_name']``, ``word_count``, ``sections`` and ``findings``.
    Missing keys fall back to empty values.
    """

    # Common words ignored when counting keywords.
    _STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these',
        'those', 'we', 'our', 'they', 'their', 'it', 'its'
    })

    # Terms searched for (as substrings) when comparing methodologies.
    _METHODOLOGY_KEYWORDS = (
        'neural network', 'deep learning', 'machine learning', 'algorithm',
        'model', 'training', 'dataset', 'evaluation', 'baseline', 'transformer',
        'attention', 'supervised', 'unsupervised', 'reinforcement'
    )

    def __init__(self, papers_data):
        """
        papers_data: list of dicts with 'text', 'sections', 'findings', 'metadata'
        """
        self.papers = papers_data

    def extract_keywords(self, text, top_n=20):
        """Return the ``top_n`` most frequent keywords as ``(word, count)`` pairs.

        A keyword is a run of at least four ASCII letters (case-insensitive)
        that is not a stop word.
        """
        words = re.findall(r'\b[a-z]{4,}\b', text.lower())
        word_counts = Counter(w for w in words if w not in self._STOP_WORDS)
        return word_counts.most_common(top_n)

    def find_common_themes(self):
        """Find keywords shared across papers.

        For every paper the 30 most frequent keywords are taken; a keyword is a
        common theme when it appears in that top-30 list for at least half of
        the papers, and in at least two papers whenever two or more papers are
        being compared.

        Returns:
            Up to 15 keywords, most widely shared first.
        """
        paper_count = len(self.papers)

        # Number of papers whose top keywords include each word.
        keyword_papers = Counter()
        for paper in self.papers:
            keywords = self.extract_keywords(paper.get('cleaned_text', ''), top_n=30)
            keyword_papers.update(word for word, _ in keywords)

        # ceil(paper_count / 2), but never below 2 when there are 2+ papers:
        # with exactly two papers "half" would be one, i.e. every keyword.
        min_papers = max((paper_count + 1) // 2, min(2, paper_count))

        common_themes = [
            word for word, count in keyword_papers.most_common()
            if count >= min_papers
        ]

        return common_themes[:15]

    def compare_methodologies(self):
        """Count methodology terms per paper.

        Returns:
            ``{paper_name: [(term, occurrences), ...]}`` with at most 10 terms
            per paper, most frequent first. ``paper_name`` is the paper's
            ``metadata['file_name']`` or ``'Paper <n>'`` when that is missing.
        """
        comparison = {}

        for i, paper in enumerate(self.papers):
            paper_name = paper.get('metadata', {}).get('file_name', f'Paper {i+1}')
            text_lower = paper.get('cleaned_text', '').lower()

            # One pass per term: count() already tells us whether it occurs.
            term_counts = ((method, text_lower.count(method)) for method in self._METHODOLOGY_KEYWORDS)
            methods_found = [(method, count) for method, count in term_counts if count > 0]

            comparison[paper_name] = sorted(methods_found, key=lambda x: x[1], reverse=True)[:10]

        return comparison

    def generate_comparison_report(self):
        """Build the full comparison report.

        Returns:
            A dict with ``total_papers``, ``common_themes``,
            ``methodology_comparison`` and ``paper_summaries`` (one summary dict
            per paper, in input order).
        """
        report = {
            'total_papers': len(self.papers),
            'common_themes': self.find_common_themes(),
            'methodology_comparison': self.compare_methodologies(),
            'paper_summaries': []
        }

        for i, paper in enumerate(self.papers):
            summary = {
                'paper_id': i + 1,
                'file_name': paper.get('metadata', {}).get('file_name', f'Paper {i+1}'),
                'word_count': paper.get('word_count', 0),
                'sections_found': list(paper.get('sections', {}).keys()),
                'top_keywords': self.extract_keywords(paper.get('cleaned_text', ''), top_n=10),
                'key_findings_count': len(paper.get('findings', []))
            }
            report['paper_summaries'].append(summary)

        return report

# ====================
# 5. STORAGE & SAVING
# ====================

def save_extracted_data(paper_data, output_dir="data/extracted"):
    """Write one paper's results to ``output_dir`` (created if missing).

    Two files are produced, both named after the source PDF without its
    extension (``unknown`` when ``metadata['file_name']`` is absent):
    ``<stem>_extracted.json`` holding all of ``paper_data`` and
    ``<stem>_cleaned.txt`` holding only ``paper_data['cleaned_text']``.

    Returns:
        ``(json_path, text_path)``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Derive the output stem from the original PDF's file name.
    source_name = paper_data.get('metadata', {}).get('file_name', 'unknown.pdf')
    stem, _extension = os.path.splitext(source_name)

    json_file = os.path.join(output_dir, stem + "_extracted.json")
    text_file = os.path.join(output_dir, stem + "_cleaned.txt")

    # Full structured record.
    with open(json_file, 'w', encoding='utf-8') as json_out:
        json.dump(paper_data, json_out, indent=2, ensure_ascii=False)

    # Cleaned text on its own, for easy reading.
    with open(text_file, 'w', encoding='utf-8') as text_out:
        text_out.write(paper_data.get('cleaned_text', ''))

    return json_file, text_file

def save_comparison_report(comparison_data, output_dir="data/reports"):
    """Write the cross-paper report as JSON into ``output_dir`` (created if missing).

    The file is named ``comparison_report_<YYYYmmdd_HHMMSS>.json`` using the
    current local time.

    Returns:
        The path of the written file.
    """
    os.makedirs(output_dir, exist_ok=True)

    file_name = f"comparison_report_{datetime.now():%Y%m%d_%H%M%S}.json"
    report_file = os.path.join(output_dir, file_name)

    with open(report_file, 'w', encoding='utf-8') as report_out:
        json.dump(comparison_data, report_out, indent=2, ensure_ascii=False)

    return report_file

# ====================
# 6. MAIN PROCESSING PIPELINE
# ====================

def process_single_paper(pdf_path):
    """Run text extraction, section detection and findings extraction for one PDF.

    Progress is printed along the way.

    Args:
        pdf_path: path of the PDF to process.

    Returns:
        A dict with ``metadata``, a ``raw_text`` preview (the first 1000
        characters, followed by ``...`` only when the text was actually cut),
        ``cleaned_text``, ``char_count``, ``word_count``, ``sections``,
        ``section_summary``, ``findings``, ``top_findings`` and
        ``processing_timestamp``; or None when the text could not be extracted.
    """
    print(f"\nProcessing: {os.path.basename(pdf_path)}")
    print("-" * 60)

    # 1. Extract text
    print("  [1/4] Extracting text...")
    extractor = PDFTextExtractor(pdf_path)
    try:
        extraction_result = extractor.extract_and_clean()
    finally:
        # Release the document even if extraction raises part-way through.
        extractor.close()

    if not extraction_result:
        print("  Failed to extract text!")
        return None

    print(f"    Extracted {extraction_result['word_count']:,} words")

    # 2. Detect sections
    print("  [2/4] Detecting sections...")
    section_extractor = SectionExtractor(extraction_result['cleaned_text'])
    sections = section_extractor.extract_sections()
    section_summary = section_extractor.get_section_summary()

    print(f"    Found {len(sections)} sections: {', '.join(sections.keys())}")

    # 3. Extract findings
    print("  [3/4] Extracting key findings...")
    findings_extractor = KeyFindingsExtractor(sections)
    findings = findings_extractor.extract_all_findings()
    top_findings = findings_extractor.get_top_findings(n=5)

    print(f"    Extracted {len(findings)} findings")

    # 4. Compile results
    print("  [4/4] Compiling results...")

    # Only a preview of the raw text is stored; mark it with an ellipsis only
    # when something was really dropped.
    raw_text = extraction_result['raw_text']
    raw_preview = raw_text[:1000] + '...' if len(raw_text) > 1000 else raw_text

    paper_data = {
        'metadata': extraction_result['metadata'],
        'raw_text': raw_preview,
        'cleaned_text': extraction_result['cleaned_text'],
        'char_count': extraction_result['char_count'],
        'word_count': extraction_result['word_count'],
        'sections': sections,
        'section_summary': section_summary,
        'findings': findings,
        'top_findings': top_findings,
        'processing_timestamp': datetime.now().isoformat()
    }

    print("  ✓ Complete!")
    return paper_data

def process_all_papers(download_dir="downloads", output_dir="data/extracted"):
    """Process every PDF in ``download_dir`` and save the results to ``output_dir``.

    Files are selected by a case-insensitive ``.pdf`` extension and handled in
    sorted name order, so runs are reproducible.

    Args:
        download_dir: directory containing the PDFs.
        output_dir: directory passed on to ``save_extracted_data``.

    Returns:
        The list of per-paper result dicts for papers that were processed
        successfully (possibly empty), or None when ``download_dir`` is not a
        directory or holds no PDF files.
    """
    print("\n" + "=" * 80)
    print("MODULE 3: TEXT EXTRACTION & ANALYSIS")
    print("=" * 80)

    # Find all PDFs. isdir (rather than exists) also rejects a plain file of
    # that name, which os.listdir could not handle.
    if not os.path.isdir(download_dir):
        print(f"\n Error: Directory '{download_dir}' not found!")
        print("  Please run Module 2 first to download papers.")
        return None

    pdf_files = sorted(
        f for f in os.listdir(download_dir) if f.lower().endswith('.pdf')
    )

    if not pdf_files:
        print(f"\n No PDF files found in '{download_dir}'")
        return None

    print(f"\n Found {len(pdf_files)} PDF files to process")

    # Process each paper
    processed_papers = []

    for i, pdf_file in enumerate(pdf_files):
        pdf_path = os.path.join(download_dir, pdf_file)

        print(f"\n{'='*60}")
        print(f"Paper {i+1}/{len(pdf_files)}")
        print(f"{'='*60}")

        paper_data = process_single_paper(pdf_path)

        if paper_data:
            # Save individual paper data
            json_file, text_file = save_extracted_data(paper_data, output_dir)
            print(f"\n  Saved to:")
            print(f"    JSON: {json_file}")
            print(f"    Text: {text_file}")

            processed_papers.append(paper_data)
        else:
            print(f"  Failed to process {pdf_file}")

    return processed_papers

def perform_cross_paper_analysis(processed_papers, output_dir="data/reports"):
    """Compare the processed papers, save the report and print a summary.

    Args:
        processed_papers: list of per-paper result dicts; at least two are needed.
        output_dir: directory passed on to ``save_comparison_report``.

    Returns:
        The comparison report dict, or None (after printing a notice) when
        fewer than two papers are supplied.
    """
    # Guard: a comparison needs at least two papers (None/empty count as zero).
    if len(processed_papers or ()) < 2:
        print("\n Need at least 2 papers for cross-paper analysis")
        return None

    heavy_rule = "=" * 80
    print("\n" + heavy_rule)
    print("CROSS-PAPER ANALYSIS")
    print(heavy_rule)

    print("\n Comparing {} papers...".format(len(processed_papers)))

    # Build the report, then persist it.
    comparison_report = CrossPaperAnalyzer(processed_papers).generate_comparison_report()
    report_file = save_comparison_report(comparison_report, output_dir)

    # Console summary: at most ten themes, then one entry per paper.
    print("\n COMPARISON SUMMARY")
    print("-" * 60)
    print("\n Common Themes:")
    for theme in comparison_report['common_themes'][:10]:
        print("   • {}".format(theme))

    print("\n Paper Statistics:")
    for paper_summary in comparison_report['paper_summaries']:
        print("\n   {}".format(paper_summary['file_name']))
        print("     Words: {:,}".format(paper_summary['word_count']))
        print("     Sections: {}".format(len(paper_summary['sections_found'])))
        print("     Key findings: {}".format(paper_summary['key_findings_count']))

    print("\n Report saved to: {}".format(report_file))

    return comparison_report

# ====================
# 7. MAIN EXECUTION
# ====================

def main():
    """Entry point for Module 3: process the downloaded PDFs, then compare them.

    Reads PDFs from ``downloads``, writes per-paper output to
    ``data/extracted`` and the comparison report to ``data/reports``.
    Returns None; stops early with a notice when no paper was processed.
    """
    rule = "=" * 80

    # Stage 1: per-paper extraction.
    papers = process_all_papers(
        download_dir="downloads",
        output_dir="data/extracted"
    )

    if not papers:
        print("\n No papers were successfully processed.")
        return

    for line in ("\n" + rule, f" Successfully processed {len(papers)} papers!", rule):
        print(line)

    # Stage 2: cross-paper comparison (its return value is not needed here).
    perform_cross_paper_analysis(
        papers,
        output_dir="data/reports"
    )

    # Closing banner listing where the outputs were written.
    closing_lines = [
        "\n" + rule,
        " MODULE 3 COMPLETE!",
        rule,
        "\n Outputs:",
        "   • Extracted texts: data/extracted/",
        "   • Comparison report: data/reports/",
        "\n" + rule,
    ]
    print("\n".join(closing_lines))

# Run the pipeline only when this code is executed directly (script or notebook
# cell, where __name__ is "__main__"); importing it as a module does not run it.
if __name__ == "__main__":
    main()

Installing PyMuPDF...
✓ PyMuPDF installed successfully

MODULE 3: TEXT EXTRACTION & ANALYSIS

 Error: Directory 'downloads' not found!
  Please run Module 2 first to download papers.

 No papers were successfully processed.


In [4]:
import os

# Make sure the folder the pipeline reads PDFs from exists
# (a no-op when it is already there).
download_dir = "downloads"
os.makedirs(download_dir, exist_ok=True)
print(
    f"Created directory: {os.path.abspath(download_dir)}",
    "Please place your PDF files into this directory for processing.",
    sep="\n",
)

Created directory: /content/downloads
Please place your PDF files into this directory for processing.


In [3]:
# Run the pipeline again; it looks for PDF files in the 'downloads' directory.
main()


MODULE 3: TEXT EXTRACTION & ANALYSIS

 No PDF files found in 'downloads'

 No papers were successfully processed.
