In [1]:
import os
import PyPDF2
import pdfplumber
import fitz  # PyMuPDF
from pathlib import Path
import pandas as pd
from datetime import datetime
import re
from PIL import Image
import pytesseract
import io
import json
import shutil

class PDFIngesterHierarchical:
    def __init__(self, root_folder_path, output_folder_path=None):
        self.root_folder_path = Path(root_folder_path)
        self.output_folder_path = Path(output_folder_path) if output_folder_path else Path(f"{root_folder_path}_extracted")
        self.root_pdfs = []  # PDFs directly in root folder
        self.college_pdfs = {}  # PDFs in college folders: {college_name: [pdfs]}
        self.year_pdfs = {}  # PDFs in year folders: {college_name: {year: [pdfs]}}
        self.ingestion_log = []
        
        # Create output directory structure
        self.create_output_structure()
    
    def create_output_structure(self):
        """Create the output directory structure"""
        print(f"Creating output directory structure at: {self.output_folder_path}")
        
        # Create main output directory
        self.output_folder_path.mkdir(parents=True, exist_ok=True)
        
        # Create subdirectories
        (self.output_folder_path / "text_files").mkdir(exist_ok=True)
        (self.output_folder_path / "metadata").mkdir(exist_ok=True)
        (self.output_folder_path / "reports").mkdir(exist_ok=True)
        (self.output_folder_path / "logs").mkdir(exist_ok=True)
        
        print(f"✓ Output structure created at: {self.output_folder_path}")
    
    def is_year_folder(self, folder_name):
        """Check if folder name represents a year (4 digits)"""
        try:
            year = int(folder_name)
            return 1900 <= year <= 2100  # Reasonable year range
        except ValueError:
            return False
    
    def safe_filename(self, filename):
        """Create a safe filename for output files"""
        # Remove or replace invalid characters
        safe_name = re.sub(r'[<>:"/\\|?*]', '_', filename)
        safe_name = re.sub(r'[^\w\s.-]', '_', safe_name)
        return safe_name
    
    def extract_pdf_text(self, pdf_path):
        """Extract text content from PDF file using multiple methods for best results"""
        print(f"    Extracting text from: {pdf_path.name}")
        
        # Method 1: Try pdfplumber (best for tables and complex layouts)
        text_pdfplumber = self._extract_with_pdfplumber(pdf_path)
        
        # Method 2: Try PyMuPDF (good for general text)
        text_pymupdf = self._extract_with_pymupdf(pdf_path)
        
        # Method 3: Try PyPDF2 (fallback)
        text_pypdf2 = self._extract_with_pypdf2(pdf_path)
        
        # Method 4: OCR for scanned PDFs (if other methods fail)
        text_ocr = None
        if not any([text_pdfplumber, text_pymupdf, text_pypdf2]) or \
           all(len(text.strip()) < 100 for text in [text_pdfplumber, text_pymupdf, text_pypdf2] if text):
            print(f"      Performing OCR (document appears to be scanned)...")
            text_ocr = self._extract_with_ocr(pdf_path)
        
        # Choose the best extraction result
        best_text = self._choose_best_text(text_pdfplumber, text_pymupdf, text_pypdf2, text_ocr)
        
        if best_text:
            # Clean and normalize the text
            cleaned_text = self._clean_text(best_text)
            print(f"      ✓ Extracted {len(cleaned_text)} characters")
            return cleaned_text
        else:
            print(f"      ✗ Failed to extract text")
            return None
    
    def _extract_with_pdfplumber(self, pdf_path):
        """Extract text using pdfplumber (best for tables and complex layouts)"""
        try:
            with pdfplumber.open(pdf_path) as pdf:
                text = ""
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                    
                    # Also extract tables separately
                    tables = page.extract_tables()
                    for table in tables:
                        for row in table:
                            if row:
                                text += " | ".join(str(cell) if cell else "" for cell in row) + "\n"
                
                return text.strip() if text.strip() else None
        except Exception as e:
            print(f"      pdfplumber failed: {str(e)}")
            return None
    
    def _extract_with_pymupdf(self, pdf_path):
        """Extract text using PyMuPDF (good for general text)"""
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                page_text = page.get_text()
                if page_text:
                    text += page_text + "\n"
            doc.close()
            return text.strip() if text.strip() else None
        except Exception as e:
            print(f"      PyMuPDF failed: {str(e)}")
            return None
    
    def _extract_with_pypdf2(self, pdf_path):
        """Extract text using PyPDF2 (fallback method)"""
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                return text.strip() if text.strip() else None
        except Exception as e:
            print(f"      PyPDF2 failed: {str(e)}")
            return None
    
    def _extract_with_ocr(self, pdf_path):
        """Extract text using OCR for scanned PDFs"""
        try:
            doc = fitz.open(pdf_path)
            text = ""
            
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                
                # Convert page to image
                mat = fitz.Matrix(2, 2)  # 2x zoom for better OCR
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("png")
                
                # Convert to PIL Image
                image = Image.open(io.BytesIO(img_data))
                
                # Perform OCR
                page_text = pytesseract.image_to_string(image, lang='eng')
                if page_text:
                    text += page_text + "\n"
            
            doc.close()
            return text.strip() if text.strip() else None
        except Exception as e:
            print(f"      OCR failed: {str(e)}")
            return None
    
    def _choose_best_text(self, text_pdfplumber, text_pymupdf, text_pypdf2, text_ocr):
        """Choose the best text extraction result"""
        texts = [
            ("pdfplumber", text_pdfplumber),
            ("PyMuPDF", text_pymupdf),
            ("PyPDF2", text_pypdf2),
            ("OCR", text_ocr)
        ]
        
        # Filter out None results
        valid_texts = [(method, text) for method, text in texts if text and text.strip()]
        
        if not valid_texts:
            return None
        
        # Choose the longest text (usually indicates better extraction)
        best_method, best_text = max(valid_texts, key=lambda x: len(x[1]))
        print(f"      Best method: {best_method}")
        
        return best_text
    
    def _clean_text(self, text):
        """Clean and normalize extracted text"""
        if not text:
            return ""
        
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        
        # Remove control characters
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
        
        # Fix common OCR errors
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Add space between lowercase and uppercase
        text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)  # Add space between digits and letters
        text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text)  # Add space between letters and digits
        
        # Normalize line breaks
        text = re.sub(r'\n+', '\n', text)
        
        return text.strip()
    
    def get_pdf_metadata(self, pdf_path):
        """Extract comprehensive metadata from PDF file"""
        try:
            metadata = {}
            
            # Try PyMuPDF first (more comprehensive)
            try:
                doc = fitz.open(pdf_path)
                pdf_metadata = doc.metadata
                metadata.update({
                    'title': pdf_metadata.get('title', ''),
                    'author': pdf_metadata.get('author', ''),
                    'creator': pdf_metadata.get('creator', ''),
                    'producer': pdf_metadata.get('producer', ''),
                    'subject': pdf_metadata.get('subject', ''),
                    'creation_date': pdf_metadata.get('creationDate', ''),
                    'modification_date': pdf_metadata.get('modDate', ''),
                    'page_count': len(doc),
                    'encrypted': doc.is_encrypted,
                    'pdf_version': doc.pdf_version()
                })
                doc.close()
            except:
                # Fallback to PyPDF2
                with open(pdf_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    pdf_metadata = pdf_reader.metadata if pdf_reader.metadata else {}
                    metadata.update({
                        'title': pdf_metadata.get('/Title', ''),
                        'author': pdf_metadata.get('/Author', ''),
                        'creator': pdf_metadata.get('/Creator', ''),
                        'producer': pdf_metadata.get('/Producer', ''),
                        'subject': pdf_metadata.get('/Subject', ''),
                        'creation_date': pdf_metadata.get('/CreationDate', ''),
                        'modification_date': pdf_metadata.get('/ModDate', ''),
                        'page_count': len(pdf_reader.pages),
                        'encrypted': pdf_reader.is_encrypted,
                        'pdf_version': getattr(pdf_reader, 'pdf_header', '')
                    })
            
            return metadata
        except Exception as e:
            print(f"Error extracting metadata from {pdf_path}: {str(e)}")
            return {'page_count': 0, 'encrypted': False}
    
    def save_individual_files(self, pdf_info, relative_path=""):
        """Save individual text and metadata files for each PDF"""
        safe_name = self.safe_filename(pdf_info['file_name'])
        base_name = safe_name.replace('.pdf', '')
        
        # Create directory structure in output folder
        text_output_dir = self.output_folder_path / "text_files" / relative_path
        metadata_output_dir = self.output_folder_path / "metadata" / relative_path
        
        text_output_dir.mkdir(parents=True, exist_ok=True)
        metadata_output_dir.mkdir(parents=True, exist_ok=True)
        
        # Initialize paths
        text_file_path = None
        metadata_file_path = None
        
        # Save text content
        if pdf_info.get('text_content'):
            text_file_path = text_output_dir / f"{base_name}.txt"
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(pdf_info['text_content'])
            print(f"      ✓ Text saved to: {text_file_path}")
        else:
            print(f"      ⚠ No text content to save for: {pdf_info['file_name']}")
        
        # Save metadata as JSON
        metadata_file_path = metadata_output_dir / f"{base_name}_metadata.json"
        metadata_to_save = {k: v for k, v in pdf_info.items() if k != 'text_content'}
        with open(metadata_file_path, 'w', encoding='utf-8') as f:
            json.dump(metadata_to_save, f, indent=2, ensure_ascii=False, default=str)
        print(f"      ✓ Metadata saved to: {metadata_file_path}")
        
        return text_file_path, metadata_file_path
    
    def ingest_root_pdfs(self):
        """Ingest PDF files directly in the root folder"""
        print("Ingesting PDFs from root folder...")
        
        for file_path in self.root_folder_path.glob("*.pdf"):
            pdf_info = {
                'file_path': str(file_path),
                'file_name': file_path.name,
                'location': 'root',
                'college': None,
                'year': None,
                'file_size': file_path.stat().st_size,
                'ingestion_time': datetime.now().isoformat()
            }
            
            # Extract text content
            text_content = self.extract_pdf_text(file_path)
            pdf_info['text_content'] = text_content
            
            # Extract metadata
            metadata = self.get_pdf_metadata(file_path)
            pdf_info.update(metadata)
            
            # Save individual files
            self.save_individual_files(pdf_info, "root")
            
            self.root_pdfs.append(pdf_info)
            self.ingestion_log.append(f"Ingested root PDF: {file_path.name}")
            print(f"  ✓ Processed: {file_path.name}")
    
    def ingest_college_and_year_pdfs(self):
        """Ingest PDFs from college folders and their year subfolders"""
        print("Ingesting PDFs from college folders and year subfolders...")
        
        for college_folder in self.root_folder_path.iterdir():
            if college_folder.is_dir():
                college_name = college_folder.name
                print(f"  Processing college: {college_name}")
                
                # Initialize college in dictionaries
                self.college_pdfs[college_name] = []
                self.year_pdfs[college_name] = {}
                
                # Process PDFs directly in college folder
                college_pdf_count = 0
                for pdf_file in college_folder.glob("*.pdf"):
                    pdf_info = {
                        'file_path': str(pdf_file),
                        'file_name': pdf_file.name,
                        'location': 'college_folder',
                        'college': college_name,
                        'year': None,
                        'file_size': pdf_file.stat().st_size,
                        'ingestion_time': datetime.now().isoformat()
                    }
                    
                    # Extract text content
                    text_content = self.extract_pdf_text(pdf_file)
                    pdf_info['text_content'] = text_content
                    
                    # Extract metadata
                    metadata = self.get_pdf_metadata(pdf_file)
                    pdf_info.update(metadata)
                    
                    # Save individual files
                    self.save_individual_files(pdf_info, f"colleges/{college_name}")
                    
                    self.college_pdfs[college_name].append(pdf_info)
                    self.ingestion_log.append(f"Ingested college PDF: {college_name}/{pdf_file.name}")
                    college_pdf_count += 1
                    print(f"    ✓ College PDF: {pdf_file.name}")
                
                # Process year subfolders within college folder
                year_folders_processed = 0
                for subfolder in college_folder.iterdir():
                    if subfolder.is_dir() and self.is_year_folder(subfolder.name):
                        year = subfolder.name
                        print(f"    Processing year folder: {year}")
                        
                        self.year_pdfs[college_name][year] = []
                        
                        # Process PDFs in year folder
                        for pdf_file in subfolder.glob("*.pdf"):
                            pdf_info = {
                                'file_path': str(pdf_file),
                                'file_name': pdf_file.name,
                                'location': 'year_folder',
                                'college': college_name,
                                'year': year,
                                'file_size': pdf_file.stat().st_size,
                                'ingestion_time': datetime.now().isoformat()
                            }
                            
                            # Extract text content
                            text_content = self.extract_pdf_text(pdf_file)
                            pdf_info['text_content'] = text_content
                            
                            # Extract metadata
                            metadata = self.get_pdf_metadata(pdf_file)
                            pdf_info.update(metadata)
                            
                            # Save individual files
                            self.save_individual_files(pdf_info, f"colleges/{college_name}/{year}")
                            
                            self.year_pdfs[college_name][year].append(pdf_info)
                            self.ingestion_log.append(f"Ingested year PDF: {college_name}/{year}/{pdf_file.name}")
                            print(f"      ✓ Year PDF: {pdf_file.name}")
                        
                        year_folders_processed += 1
                
                print(f"    College '{college_name}': {college_pdf_count} PDFs, {year_folders_processed} year folders")
                
                # Remove college from dictionaries if no PDFs found
                if not self.college_pdfs[college_name] and not self.year_pdfs[college_name]:
                    del self.college_pdfs[college_name]
                    del self.year_pdfs[college_name]
    
    def generate_summary_reports(self):
        """Generate comprehensive summary reports"""
        print("Generating summary reports...")
        
        # Generate master CSV
        all_pdfs = []
        all_pdfs.extend(self.root_pdfs)
        
        for college, pdfs in self.college_pdfs.items():
            all_pdfs.extend(pdfs)
        
        for college, years in self.year_pdfs.items():
            for year, pdfs in years.items():
                all_pdfs.extend(pdfs)
        
        if all_pdfs:
            # Create DataFrame without text content for CSV (too large)
            csv_data = []
            for pdf in all_pdfs:
                csv_row = {k: v for k, v in pdf.items() if k != 'text_content'}
                csv_row['text_length'] = len(pdf.get('text_content', '')) if pdf.get('text_content') else 0
                csv_data.append(csv_row)
            
            df = pd.DataFrame(csv_data)
            csv_path = self.output_folder_path / "reports" / "master_summary.csv"
            df.to_csv(csv_path, index=False)
            print(f"✓ Master CSV saved to: {csv_path}")
            
            # Generate detailed JSON report
            json_path = self.output_folder_path / "reports" / "detailed_report.json"
            detailed_report = {
                'summary': {
                    'total_pdfs': len(all_pdfs),
                    'root_pdfs': len(self.root_pdfs),
                    'college_pdfs': sum(len(pdfs) for pdfs in self.college_pdfs.values()),
                    'year_pdfs': sum(len(pdfs) for college in self.year_pdfs.values() for pdfs in college.values()),
                    'processing_time': datetime.now().isoformat(),
                    'input_folder': str(self.root_folder_path),
                    'output_folder': str(self.output_folder_path)
                },
                'root_pdfs': self.root_pdfs,
                'college_pdfs': self.college_pdfs,
                'year_pdfs': self.year_pdfs,
                'ingestion_log': self.ingestion_log
            }
            
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(detailed_report, f, indent=2, ensure_ascii=False, default=str)
            print(f"✓ Detailed JSON report saved to: {json_path}")
            
            # Generate processing log
            log_path = self.output_folder_path / "logs" / "processing_log.txt"
            with open(log_path, 'w', encoding='utf-8') as f:
                f.write(f"PDF Ingestion Log - {datetime.now().isoformat()}\n")
                f.write("=" * 60 + "\n\n")
                f.write(f"Input Folder: {self.root_folder_path}\n")
                f.write(f"Output Folder: {self.output_folder_path}\n\n")
                f.write(f"Total PDFs Processed: {len(all_pdfs)}\n")
                f.write(f"Root PDFs: {len(self.root_pdfs)}\n")
                f.write(f"College PDFs: {sum(len(pdfs) for pdfs in self.college_pdfs.values())}\n")
                f.write(f"Year PDFs: {sum(len(pdfs) for college in self.year_pdfs.values() for pdfs in college.values())}\n\n")
                f.write("Processing Log:\n")
                f.write("-" * 40 + "\n")
                for log_entry in self.ingestion_log:
                    f.write(f"{log_entry}\n")
            
            print(f"✓ Processing log saved to: {log_path}")
    
    def ingest_all(self):
        """Ingest all PDF files from root, college folders, and year subfolders"""
        print(f"Starting PDF ingestion from: {self.root_folder_path}")
        print(f"Output will be saved to: {self.output_folder_path}")
        print("="*70)
        
        self.ingest_root_pdfs()
        print()
        self.ingest_college_and_year_pdfs()
        print()
        self.generate_summary_reports()
        
        print("\n" + "="*70)
        print("Ingestion Summary:")
        print(f"📄 Root PDFs: {len(self.root_pdfs)}")
        print(f"🏫 Colleges processed: {len(self.college_pdfs)}")
        
        total_college_pdfs = sum(len(pdfs) for pdfs in self.college_pdfs.values())
        total_year_pdfs = 0
        total_year_folders = 0
        
        for college, years in self.year_pdfs.items():
            total_year_folders += len(years)
            for year, pdfs in years.items():
                total_year_pdfs += len(pdfs)
        
        print(f"📁 College-level PDFs: {total_college_pdfs}")
        print(f"📅 Year folders processed: {total_year_folders}")
        print(f"📄 Year-level PDFs: {total_year_pdfs}")
        print(f"📊 Total PDFs processed: {len(self.root_pdfs) + total_college_pdfs + total_year_pdfs}")
        print(f"💾 Output saved to: {self.output_folder_path}")
    
    def get_root_pdfs(self) -> list:
        """Return the list of records for PDFs found directly in the root folder.

        The stored list itself is returned (not a copy), so changes made by
        the caller are visible on the ingester.
        """
        return self.root_pdfs
    
    def get_college_pdfs(self, college_name=None):
        """Return PDFs from specific college or all colleges"""
        if college_name:
            return self.college_pdfs.get(college_name, [])
        return self.college_pdfs
    
    def get_year_pdfs(self, college_name=None, year=None):
        """Return PDFs from specific college/year combination or all"""
        if college_name and year:
            return self.year_pdfs.get(college_name, {}).get(str(year), [])
        elif college_name:
            return self.year_pdfs.get(college_name, {})
        return self.year_pdfs
    
    def search_content(self, search_term, case_sensitive=False):
        """Search for specific content across all ingested PDFs"""
        results = []
        
        # Search in root PDFs
        for pdf in self.root_pdfs:
            if pdf['text_content']:
                content = pdf['text_content'] if case_sensitive else pdf['text_content'].lower()
                term = search_term if case_sensitive else search_term.lower()
                
                if term in content:
                    results.append({
                        'file_name': pdf['file_name'],
                        'location': pdf['location'],
                        'college': None,
                        'year': pdf['year'],
                        'file_path': pdf['file_path']
                    })
        
        # Search in college PDFs
        for college, pdfs in self.college_pdfs.items():
            for pdf in pdfs:
                if pdf['text_content']:
                    content = pdf['text_content'] if case_sensitive else pdf['text_content'].lower()
                    term = search_term if case_sensitive else search_term.lower()
                    
                    if term in content:
                        results.append({
                            'file_name': pdf['file_name'],
                            'location': pdf['location'],
                            'college': pdf['college'],
                            'year': pdf['year'],
                            'file_path': pdf['file_path']
                        })
        
        # Search in year folder PDFs
        for college, years in self.year_pdfs.items():
            for year, pdfs in years.items():
                for pdf in pdfs:
                    if pdf['text_content']:
                        content = pdf['text_content'] if case_sensitive else pdf['text_content'].lower()
                        term = search_term if case_sensitive else search_term.lower()
                        
                        if term in content:
                            results.append({
                                'file_name': pdf['file_name'],
                                'location': pdf['location'],
                                'college': pdf['college'],
                                'year': pdf['year'],
                                'file_path': pdf['file_path']
                            })
        
        return results

# Usage Example
if __name__ == "__main__":
    # Interactive driver: ask for input/output folders, run the ingestion,
    # describe the output layout, then offer a simple search loop.
    # Names stay at module level so they remain available after the run
    # (e.g. in later notebook cells).
    print("PDF Ingestion Tool - Hierarchical Output Version")
    print("=" * 60)

    # Get input folder path
    root_folder = input("Enter the full path to your input folder: ").strip()

    # Remove quotes if user copied path with quotes (matching single or
    # double quotes around the whole value)
    if len(root_folder) >= 2 and root_folder[0] == root_folder[-1] and root_folder[0] in "\"'":
        root_folder = root_folder[1:-1]

    # Validate input path exists and is a folder. SystemExit is raised
    # directly because the exit() helper is installed by the site module /
    # interactive shell and is not guaranteed to exist.
    if not os.path.exists(root_folder):
        print(f"Error: Path '{root_folder}' does not exist!")
        raise SystemExit(1)
    if not os.path.isdir(root_folder):
        print(f"Error: Path '{root_folder}' is not a directory!")
        raise SystemExit(1)

    # Get output folder path (optional)
    output_folder = input("Enter output folder path (or press Enter for default): ").strip()
    if len(output_folder) >= 2 and output_folder[0] == output_folder[-1] and output_folder[0] in "\"'":
        output_folder = output_folder[1:-1]

    if not output_folder:
        # normpath drops a trailing separator so the default lands next to
        # the input folder instead of inside it.
        output_folder = f"{os.path.normpath(root_folder)}_extracted"

    print(f"Input folder: {root_folder}")
    print(f"Output folder: {output_folder}")
    print("=" * 60)

    # Initialize the ingester
    ingester = PDFIngesterHierarchical(root_folder, output_folder)

    # Ingest all PDFs
    ingester.ingest_all()

    print("\n" + "="*60)
    print("OUTPUT STRUCTURE:")
    print("="*60)
    print(f"📁 {output_folder}/")
    print("├── 📄 text_files/")
    print("│   ├── 📁 root/")
    print("│   └── 📁 colleges/")
    print("│       └── 📁 [college_name]/")
    print("│           └── 📁 [year]/")
    print("├── 📊 metadata/")
    print("│   ├── 📁 root/")
    print("│   └── 📁 colleges/")
    print("├── 📈 reports/")
    print("│   ├── 📄 master_summary.csv")
    print("│   └── 📄 detailed_report.json")
    print("└── 📝 logs/")
    print("    └── 📄 processing_log.txt")

    print("\n" + "="*60)
    print("PROCESSING COMPLETE!")
    print(f"✓ All extracted text files saved to: {output_folder}/text_files/")
    print(f"✓ All metadata files saved to: {output_folder}/metadata/")
    print(f"✓ Summary reports saved to: {output_folder}/reports/")
    print(f"✓ Processing logs saved to: {output_folder}/logs/")
    print("="*60)

    # Optional: Interactive search
    while True:
        search_term = input("\nEnter search term (or 'quit' to exit): ").strip()
        if search_term.lower() in ['quit', 'exit', 'q', '']:
            break

        search_results = ingester.search_content(search_term)
        if search_results:
            print(f"\n🔍 Found '{search_term}' in {len(search_results)} files:")
            for result in search_results:
                if result['college']:
                    if result['year']:
                        location = f"{result['college']}/{result['year']}"
                    else:
                        location = f"{result['college']} (college level)"
                else:
                    location = "Root folder"
                print(f"  ✓ {result['file_name']} ({location})")
        else:
            print(f"❌ No results found for '{search_term}'")

PDF Ingestion Tool - Hierarchical Output Version


Enter the full path to your input folder:  "C:\Users\siddhu\Desktop\NIRF(151 to 200)"
Enter output folder path (or press Enter for default):  


Input folder: C:\Users\siddhu\Desktop\NIRF(151 to 200)
Output folder: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted
Creating output directory structure at: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted
✓ Output structure created at: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted
Starting PDF ingestion from: C:\Users\siddhu\Desktop\NIRF(151 to 200)
Output will be saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted
Ingesting PDFs from root folder...

Ingesting PDFs from college folders and year subfolders...
  Processing college: Annamalai University
    Processing year folder: 2017
    Extracting text from: College 2017.pdf
      Best method: pdfplumber
      ✓ Extracted 164055 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Annamalai University\2017\College 2017.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Annamalai University\2017\College 2017_metadata

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value


      Best method: pdfplumber
      ✓ Extracted 57864 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Dayalbagh Educational Institute\2018\Engineering 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Dayalbagh Educational Institute\2018\Engineering 2018_metadata.json
      ✓ Year PDF: Engineering 2018.pdf
    Extracting text from: Overall 2018.pdf
      Best method: pdfplumber
      ✓ Extracted 30314 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Dayalbagh Educational Institute\2018\Overall 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Dayalbagh Educational Institute\2018\Overall 2018_metadata.json
      ✓ Year PDF: Overall 2018.pdf
    Processing year folder: 2019
    Extracting text from: Engineering 2019.pdf
      Best method: pdfplumber
      ✓ Extracted 3

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


      Best method: pdfplumber
      ✓ Extracted 133052 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Dayalbagh Educational Institute\2024\Overall 2024.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Dayalbagh Educational Institute\2024\Overall 2024_metadata.json
      ✓ Year PDF: Overall 2024.pdf
    Processing year folder: 2025
    Extracting text from: Engineering 2025.pdf
      Best method: pdfplumber
      ✓ Extracted 36083 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Dayalbagh Educational Institute\2025\Engineering 2025.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Dayalbagh Educational Institute\2025\Engineering 2025_metadata.json
      ✓ Year PDF: Engineering 2025.pdf
    Extracting text from: Management 2025.pdf
      Best method: pdfplumber
      ✓ Extract

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value


      Best method: pdfplumber
      ✓ Extracted 66016 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\J. C. Bose University of Science and Technology, YMCA\Engineering 2019.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\J. C. Bose University of Science and Technology, YMCA\Engineering 2019_metadata.json
    ✓ College PDF: Engineering 2019.pdf
    Extracting text from: Engineering 2023.pdf
      Best method: pdfplumber
      ✓ Extracted 47566 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\J. C. Bose University of Science and Technology, YMCA\Engineering 2023.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\J. C. Bose University of Science and Technology, YMCA\Engineering 2023_metadata.json
    ✓ College PDF: Engineering 2023.pdf
    Processing year folder: 2020
    Extrac

Cannot set gray non-stroke color because /'R18' is an invalid float value
Cannot set gray non-stroke color because /'R36' is an invalid float value
Cannot set gray non-stroke color because /'R39' is an invalid float value
Cannot set gray non-stroke color because /'R42' is an invalid float value
Cannot set gray non-stroke color because /'R44' is an invalid float value
Cannot set gray non-stroke color because /'R90' is an invalid float value


      Best method: pdfplumber
      ✓ Extracted 5327 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\National Engineering College\2018\Engineering 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\National Engineering College\2018\Engineering 2018_metadata.json
      ✓ Year PDF: Engineering 2018.pdf
    Extracting text from: Overall 2018.pdf
      Best method: PyPDF2
      ✓ Extracted 142 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\National Engineering College\2018\Overall 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\National Engineering College\2018\Overall 2018_metadata.json
      ✓ Year PDF: Overall 2018.pdf
    Processing year folder: 2019
    Extracting text from: Engineering 2019.pdf
      Best method: pdfplumber
      ✓ Extracted 62175 characters
   

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value


      Best method: pdfplumber
      ✓ Extracted 106938 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\NMAM Institute of Technology\2018\Engineering 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\NMAM Institute of Technology\2018\Engineering 2018_metadata.json
      ✓ Year PDF: Engineering 2018.pdf
    Extracting text from: Management 2018.pdf


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value


      Best method: pdfplumber
      ✓ Extracted 24171 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\NMAM Institute of Technology\2018\Management 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\NMAM Institute of Technology\2018\Management 2018_metadata.json
      ✓ Year PDF: Management 2018.pdf
    Extracting text from: Overall  2018.pdf
      Best method: pdfplumber
      ✓ Extracted 128402 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\NMAM Institute of Technology\2018\Overall  2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\NMAM Institute of Technology\2018\Overall  2018_metadata.json
      ✓ Year PDF: Overall  2018.pdf
    Processing year folder: 2019
    Extracting text from: Engineering 2019.pdf
      Best method: pdfplumber
      ✓ Extracted 107718 char

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


      Best method: pdfplumber
      ✓ Extracted 114251 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Sri Krishna College of Technology\2025\Innovation 2025.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Sri Krishna College of Technology\2025\Innovation 2025_metadata.json
      ✓ Year PDF: Innovation 2025.pdf
    Extracting text from: Management 2025.pdf


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


      Best method: pdfplumber
      ✓ Extracted 86131 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Sri Krishna College of Technology\2025\Management 2025.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Sri Krishna College of Technology\2025\Management 2025_metadata.json
      ✓ Year PDF: Management 2025.pdf
    Extracting text from: Overall 2025.pdf


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


      Best method: pdfplumber
      ✓ Extracted 80199 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Sri Krishna College of Technology\2025\Overall 2025.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Sri Krishna College of Technology\2025\Overall 2025_metadata.json
      ✓ Year PDF: Overall 2025.pdf
    Extracting text from: SDG 2025.pdf
      Best method: pdfplumber
      ✓ Extracted 82584 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Sri Krishna College of Technology\2025\SDG 2025.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Sri Krishna College of Technology\2025\SDG 2025_metadata.json
      ✓ Year PDF: SDG 2025.pdf
    College 'Sri Krishna College of Technology': 0 PDFs, 1 year folders
  Processing college: Sri Ramakrishna Engineering College
    Extracting text

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value


      Best method: pdfplumber
      ✓ Extracted 152597 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Vallurupalli Nageswara Rao Vignana Jyothi Institute of Engineering and Technology\2018\Engineering 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Vallurupalli Nageswara Rao Vignana Jyothi Institute of Engineering and Technology\2018\Engineering 2018_metadata.json
      ✓ Year PDF: Engineering 2018.pdf
    Extracting text from: Overall 2018.pdf
      Best method: pdfplumber
      ✓ Extracted 154375 characters
      ✓ Text saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\text_files\colleges\Vallurupalli Nageswara Rao Vignana Jyothi Institute of Engineering and Technology\2018\Overall 2018.txt
      ✓ Metadata saved to: C:\Users\siddhu\Desktop\NIRF(151 to 200)_extracted\metadata\colleges\Vallurupalli Nageswara Rao Vignana Jyothi Institute of Engineering and Techn


Enter search term (or 'quit' to exit):  exit
