In [27]:
import os
import re
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import hashlib

class EfficientTextProcessor:
    """Clean, de-duplicate and chunk every ``.txt`` file found under a folder.

    For each input file the processor writes a ``<stem>_cleaned.txt`` file and a
    ``chunks/`` sub-folder into an output tree that mirrors the input folder
    layout, then writes CSV/JSON summary reports into ``<output>/reports``.

    Attributes:
        input_path: Root folder that is scanned recursively for ``*.txt``.
        output_path: Root folder that receives cleaned files, chunks, reports.
        stats: Running counters with keys ``files``, ``duplicates``, ``chunks``.
        results: One dict per successfully processed file (see process_file).
        section_patterns: ``(regex, section name)`` pairs; the first regex that
            matches anywhere in a line starts that section.
        cleanup_patterns: ``(regex, replacement)`` pairs applied in order by
            clean_text (case-insensitively).
    """

    def __init__(self, input_folder, output_folder=None):
        """Set up paths, counters and the regex tables.

        Args:
            input_folder: Folder to scan for ``.txt`` files.
            output_folder: Destination root. When falsy, defaults to the input
                folder path with ``_processed`` appended.
        """
        self.input_path = Path(input_folder)
        # Build the default from the normalised Path, not the raw string, so a
        # trailing separator ("data/") yields the sibling "data_processed"
        # rather than "data/_processed" inside the input tree.
        self.output_path = Path(output_folder) if output_folder else Path(f"{self.input_path}_processed")
        self.stats = {'files': 0, 'duplicates': 0, 'chunks': 0}
        self.results = []

        self.word_nums = {'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
                          'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10'}

        # Keyword tables: a line containing any keyword (substring match,
        # case-insensitive) is treated as the start of that section.
        self.section_patterns = [
            (r'(?i)(student|admission|enrollment)', 'Student Information'),
            (r'(?i)(placement|job|career)', 'Placement & Career'),
            (r'(?i)(faculty|staff|teacher)', 'Faculty Information'),
            (r'(?i)(infrastructure|facility|building)', 'Infrastructure'),
            (r'(?i)(financial|fee|budget)', 'Financial Information'),
            (r'(?i)(academic|curriculum|course)', 'Academic Information'),
            (r'(?i)(research|publication)', 'Research'),
            (r'(?i)(library|books)', 'Library'),
            (r'(?i)(laboratory|lab)', 'Laboratory'),
            (r'(?i)(examination|exam|result)', 'Examination'),
            (r'(?i)(quality|accreditation)', 'Quality Assurance')
        ]

        self.cleanup_patterns = [
            (r'file\s+content\s+(begin|end)', ''),
            # \b keeps words that merely end in "page" (e.g. "homepage 5") intact.
            (r'\bpage\s+\d+(\s+of\s+\d+)?', ''),
            (r'generated\s+on.*?\d{4}', ''),
            (r'copyright.*?\d{4}', ''),
            # (?m) makes ^/$ match per line; without it this rule could only
            # fire when the whole text was a single separator line.
            (r'(?m)^[ \t]*[|\-=]+[ \t]*$', ''),
            # Drops any parenthesised text that directly follows a number,
            # e.g. "120 (One Hundred Twenty)" -> "120".
            (r'(\d+)\s*\(\s*([^)]+)\s*\)', r'\1'),
            (r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b', lambda m: str(self.word_to_num(m.group()))),
        ]

    def word_to_num(self, word):
        """Return the digit string for a number word one..ten, else the word unchanged."""
        return self.word_nums.get(word.lower(), word)

    def clean_text(self, text):
        """Apply every cleanup pattern, then drop blank and repeated lines.

        Lines are stripped of surrounding whitespace before comparison; only
        the first occurrence of each distinct line is kept. Every dropped
        repeat increments ``stats['duplicates']``.

        Args:
            text: Raw file content.

        Returns:
            The cleaned text with lines joined by ``\\n``.
        """
        for pattern, replacement in self.cleanup_patterns:
            # re.sub accepts both string and callable replacements.
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        seen_lines = set()
        unique_lines = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            if line in seen_lines:
                self.stats['duplicates'] += 1
            else:
                seen_lines.add(line)
                unique_lines.append(line)

        return '\n'.join(unique_lines)

    @staticmethod
    def _make_chunk(filename, section, content, word_count=None):
        """Build one chunk record; the word count is derived when not supplied."""
        return {
            'file': filename,
            'section': section,
            'content': content,
            'words': len(content.split()) if word_count is None else word_count,
            'chars': len(content)
        }

    def _match_section(self, line):
        """Return the name of the first section pattern found in line, or None."""
        for pattern, section_name in self.section_patterns:
            if re.search(pattern, line):
                return section_name
        return None

    def create_chunks(self, text, filename):
        """Split text into section chunks, falling back to size-based chunks.

        A line matching a section pattern closes the chunk collected so far
        and starts a new one named after that section. Text before the first
        matching line is filed under ``'General'``. When no section is
        detected at all, size_based_chunks is used instead.

        Args:
            text: Cleaned text to split.
            filename: Name recorded in each chunk's ``'file'`` field.

        Returns:
            A list of chunk dicts with keys ``file``, ``section``, ``content``,
            ``words`` and ``chars``. ``stats['chunks']`` grows by the list length.
        """
        chunks = []
        current_section = 'General'
        current_content = []

        for line in text.split('\n'):
            section_found = self._match_section(line)
            if section_found:
                if current_content:
                    chunks.append(self._make_chunk(filename, current_section, '\n'.join(current_content)))
                    current_content = []
                # Switch sections even when nothing was collected yet, so a
                # header on the very first line is not mislabelled 'General'.
                current_section = section_found
            current_content.append(line)

        if current_content:
            chunks.append(self._make_chunk(filename, current_section, '\n'.join(current_content)))

        # A single 'General' chunk means no section header was detected.
        if len(chunks) == 1 and chunks[0]['section'] == 'General':
            chunks = self.size_based_chunks(text, filename)

        self.stats['chunks'] += len(chunks)
        return chunks

    def size_based_chunks(self, text, filename, max_size=2000):
        """Split text into whitespace-joined chunks of at most max_size characters.

        Words are never split: a single word longer than max_size becomes a
        chunk of its own. Chunks are named ``Part 1``, ``Part 2``, ...

        Args:
            text: Text to split on whitespace.
            filename: Name recorded in each chunk's ``'file'`` field.
            max_size: Maximum chunk length in characters.

        Returns:
            A list of chunk dicts (empty when text has no words).
        """
        chunks = []
        pending_words = []
        pending_len = 0  # length of ' '.join(pending_words), tracked incrementally

        for word in text.split():
            added_len = len(word) + (1 if pending_words else 0)
            # Flush only a non-empty buffer, so an oversized first word does
            # not produce an empty chunk.
            if pending_words and pending_len + added_len > max_size:
                chunks.append(self._make_chunk(
                    filename, f'Part {len(chunks) + 1}', ' '.join(pending_words), len(pending_words)))
                pending_words = [word]
                pending_len = len(word)
            else:
                pending_words.append(word)
                pending_len += added_len

        if pending_words:
            chunks.append(self._make_chunk(
                filename, f'Part {len(chunks) + 1}', ' '.join(pending_words), len(pending_words)))

        return chunks

    def preserve_structure(self, file_path):
        """Create and return the output folder mirroring file_path's parent folder.

        Args:
            file_path: Path of an input file located under ``input_path``.

        Returns:
            The (now existing) output directory for that file.
        """
        relative_path = file_path.relative_to(self.input_path)
        output_file_path = self.output_path / relative_path.parent
        output_file_path.mkdir(parents=True, exist_ok=True)
        return output_file_path

    def process_file(self, file_path):
        """Clean and chunk one file and write the results to the output tree.

        Args:
            file_path: Path of a UTF-8 text file under ``input_path``.

        Returns:
            A dict with keys ``file``, ``path``, ``original_size``,
            ``cleaned_size``, ``chunks`` and ``chunks_data``; or ``None`` when
            the file is blank or any error occurred (the error is printed).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            if not text.strip():
                return None

            cleaned_text = self.clean_text(text)

            output_dir = self.preserve_structure(file_path)

            cleaned_file = output_dir / f"{file_path.stem}_cleaned.txt"
            with open(cleaned_file, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)

            chunks = self.create_chunks(cleaned_text, file_path.name)

            chunks_dir = output_dir / "chunks"
            chunks_dir.mkdir(exist_ok=True)

            for i, chunk in enumerate(chunks, start=1):
                chunk_file = chunks_dir / f"{file_path.stem}_chunk_{i}_{chunk['section'].replace(' ', '_')}.txt"
                with open(chunk_file, 'w', encoding='utf-8') as f:
                    f.write(f"File: {chunk['file']}\nSection: {chunk['section']}\n")
                    f.write(f"Words: {chunk['words']} | Chars: {chunk['chars']}\n")
                    f.write("-" * 50 + "\n")
                    f.write(chunk['content'])

            self.stats['files'] += 1
            return {
                'file': file_path.name,
                'path': str(file_path.relative_to(self.input_path)),
                'original_size': len(text),
                'cleaned_size': len(cleaned_text),
                'chunks': len(chunks),
                'chunks_data': chunks
            }

        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole batch.
            print(f"Error processing {file_path}: {e}")
            return None

    def process_all(self):
        """Process every ``.txt`` file under ``input_path`` and write reports.

        Files located inside ``output_path`` are skipped, so an output folder
        nested in the input folder is not re-ingested on a later run.
        """
        print(f"Processing files from: {self.input_path}")
        print(f"Output to: {self.output_path}")

        output_root = self.output_path.resolve()
        txt_files = [
            candidate for candidate in self.input_path.rglob("*.txt")
            if output_root not in candidate.resolve().parents
        ]
        if not txt_files:
            print("No .txt files found!")
            return

        print(f"Found {len(txt_files)} files")

        for file_path in txt_files:
            print(f"Processing: {file_path.relative_to(self.input_path)}")
            result = self.process_file(file_path)
            if result:
                self.results.append(result)

        self.generate_reports()

        print("\nCompleted!")
        print(f"Files processed: {self.stats['files']}")
        print(f"Duplicates removed: {self.stats['duplicates']}")
        print(f"Chunks created: {self.stats['chunks']}")

    def generate_reports(self):
        """Write ``summary.csv``, ``chunks.csv`` and ``report.json`` to ``<output>/reports``.

        Does nothing when no file has been processed successfully.
        """
        if not self.results:
            return

        reports_dir = self.output_path / "reports"
        # parents=True so the reports can be written even if the output root
        # has not been created yet.
        reports_dir.mkdir(parents=True, exist_ok=True)

        summary_data = [{
            'file': r['file'],
            'path': r['path'],
            'original_size': r['original_size'],
            'cleaned_size': r['cleaned_size'],
            # Guard the ratio against a zero original size.
            'reduction': f"{((1 - r['cleaned_size']/r['original_size']) if r['original_size'] else 0.0)*100:.1f}%",
            'chunks': r['chunks']
        } for r in self.results]

        pd.DataFrame(summary_data).to_csv(reports_dir / "summary.csv", index=False)

        all_chunks = [{
            'file_path': r['path'],
            'file_name': chunk['file'],
            'section': chunk['section'],
            'word_count': chunk['words'],
            'char_count': chunk['chars']
        } for r in self.results for chunk in r['chunks_data']]

        pd.DataFrame(all_chunks).to_csv(reports_dir / "chunks.csv", index=False)

        report = {
            'timestamp': datetime.now().isoformat(),
            'statistics': self.stats,
            'files': self.results
        }

        with open(reports_dir / "report.json", 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, default=str)

        print(f"Reports saved to: {reports_dir}")

    def search(self, term, case_sensitive=False):
        """Find chunks containing term and return a snippet around the first hit.

        Args:
            term: Literal text to look for (not a regex).
            case_sensitive: Match case exactly when True.

        Returns:
            One dict per matching chunk with keys ``file``, ``section`` and
            ``context``; the context is taken from the original chunk text
            (up to 50 characters on each side of the first match).
        """
        # Escape the term so it is matched literally, and compile once for
        # all chunks. Matching on the original text keeps its casing in the
        # returned context.
        needle = re.compile(re.escape(term), 0 if case_sensitive else re.IGNORECASE)

        results = []
        for file_result in self.results:
            for chunk in file_result['chunks_data']:
                content = chunk['content']
                match = needle.search(content)
                if match is None:
                    continue

                start = max(0, match.start() - 50)
                end = min(len(content), match.end() + 50)
                results.append({
                    'file': file_result['path'],
                    'section': chunk['section'],
                    'context': content[start:end]
                })

        return results


# Simple usage
def main():
    """Prompt for folders, process every .txt file, then run an interactive search.

    Exits with status 1 when the input path is missing or is not a folder.
    The search loop ends on 'quit', 'q', an empty entry, Ctrl-C or end of input.
    """
    input_folder = input("Enter input folder path: ").strip().strip('"')
    output_folder = input("Enter output folder path (optional): ").strip().strip('"')

    if not os.path.exists(input_folder):
        print(f"Path '{input_folder}' does not exist!")
        # SystemExit works everywhere; the bare exit() helper is only
        # injected by the site module.
        raise SystemExit(1)
    if not os.path.isdir(input_folder):
        # A regular file passes the existence check but cannot be scanned.
        print(f"Path '{input_folder}' is not a folder!")
        raise SystemExit(1)

    processor = EfficientTextProcessor(input_folder, output_folder if output_folder else None)
    processor.process_all()

    # Interactive search
    while True:
        try:
            search_term = input("\nSearch term (or 'quit'): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave the loop quietly instead of dumping a traceback.
            break
        if search_term.lower() in ['quit', 'q', '']:
            break

        results = processor.search(search_term)
        if results:
            print(f"Found {len(results)} matches:")
            for r in results[:3]:  # Show first 3
                print(f"  {r['file']} - {r['section']}")
                print(f"  ...{r['context'][:100]}...")
        else:
            print("No matches found")


if __name__ == "__main__":
    main()

Enter input folder path:  "C:\Users\siddhu\Desktop\text_files (201 to 300)"
Enter output folder path (optional):  


Processing files from: C:\Users\siddhu\Desktop\text_files (201 to 300)
Output to: C:\Users\siddhu\Desktop\text_files (201 to 300)_processed
Found 777 files
Processing: colleges\ABES Engineering College\ENGINEERING 2020.txt
Processing: colleges\ABES Engineering College\ENGINEERING 2021.txt
Processing: colleges\ABES Engineering College\ENGINEERING 2023.txt
Processing: colleges\ABES Engineering College\ENGINEERING 2024.txt
Processing: colleges\Army Institute of Technology\Engineering 2018.txt
Processing: colleges\Army Institute of Technology\Engineering 2019.txt
Processing: colleges\Army Institute of Technology\Engineering 2020.txt
Processing: colleges\Army Institute of Technology\Engineering 2021.txt
Processing: colleges\Army Institute of Technology\Engineering 2022.txt
Processing: colleges\Army Institute of Technology\Engineering 2023.txt
Processing: colleges\Bharati Vidyapeeths College of Engineering New delhi\Engineering 2023.txt
Processing: colleges\Bharati Vidyapeeths College of Eng


Search term (or 'quit'):  quit
