# Organising the files

In [1]:
import os
import re
import shutil
from pathlib import Path
from collections import defaultdict

class GerManCOrganizer:
    """Copy GerManC corpus XML files into a ``<period>/<genre>`` folder tree.

    File names are expected to look like
    ``GENRE_PERIOD_REGION_YEAR_TITLE.xml`` (for example
    ``DRAM_P1_NoD_1673_Leonilda.xml``).  The genre and period codes found in
    the name select the destination folder; files are copied, never moved.
    """

    # Applied with fullmatch() so that nothing may precede the genre code or
    # follow the ".xml" suffix.
    _FILENAME_RE = re.compile(r'([A-Z]{4})_([P][1-3])_([A-Za-z]+)_(\d{4})_(.+)\.xml')

    def __init__(self, source_dir, output_dir):
        """Remember the source/output locations and set up the code tables.

        Args:
            source_dir: Directory holding the flat list of corpus XML files.
            output_dir: Root directory of the organised tree to create.
        """
        self.source_dir = Path(source_dir)
        self.output_dir = Path(output_dir)

        # Genre mapping from filename prefixes
        self.genres = {
            'DRAM': 'Drama',
            'HUMA': 'Humanities',
            'LEGA': 'Legal',
            'NARR': 'Narrative',
            'NEWS': 'Newspapers',
            'SCIE': 'Scientific',
            'SERM': 'Sermons'
        }

        # Time period bins
        self.periods = {
            'P1': '1650-1700',
            'P2': '1700-1750',
            'P3': '1750-1800'
        }

    def extract_file_info(self, filename):
        """Parse genre, period, region, year and title out of a file name.

        Args:
            filename: Bare file name, e.g. ``DRAM_P1_NoD_1673_Leonilda.xml``.

        Returns:
            dict | None: The parsed fields, or ``None`` when the whole name
            does not follow the expected pattern.  Genre codes that are not
            in ``self.genres`` are reported as ``'Unknown'``.
        """
        match = self._FILENAME_RE.fullmatch(filename)
        if match is None:
            return None

        genre_code, period_code, region, year, title = match.groups()
        return {
            'genre': self.genres.get(genre_code, 'Unknown'),
            'genre_code': genre_code,
            'period': self.periods.get(period_code, 'Unknown'),
            'period_code': period_code,
            'region': region,
            'year': int(year),
            'title': title,
            'filename': filename
        }

    def organize_files(self):
        """Copy every ``*.xml`` file of the source directory into the tree.

        Returns:
            tuple: ``(stats, processed_files, error_files)`` where ``stats``
            maps period -> genre -> file count, ``processed_files`` is the
            list of parsed-info dicts of the copied files and ``error_files``
            lists the names that could not be parsed or copied.
        """
        self.create_directory_structure()

        stats = defaultdict(lambda: defaultdict(int))
        processed_files = []
        error_files = []

        # glob() yields entries in filesystem order; sorting keeps the console
        # output, the result lists and the report reproducible.
        for xml_file in sorted(self.source_dir.glob("*.xml")):
            file_info = self.extract_file_info(xml_file.name)
            if file_info is None:
                error_files.append(xml_file.name)
                print(f"✗ Could not parse: {xml_file.name}")
                continue

            period, genre = file_info['period'], file_info['genre']
            dest_dir = self.output_dir / period / genre
            try:
                # Only the filesystem work can fail here, so only it is guarded;
                # a failed copy is therefore never counted as processed.
                dest_dir.mkdir(parents=True, exist_ok=True)
                shutil.copy2(xml_file, dest_dir / xml_file.name)
            except OSError as e:
                error_files.append(xml_file.name)
                print(f"✗ Error processing {xml_file.name}: {e}")
                continue

            stats[period][genre] += 1
            processed_files.append(file_info)
            print(f"✓ {xml_file.name} → {period}/{genre}")

        self.generate_report(stats, processed_files, error_files)

        return stats, processed_files, error_files

    def create_directory_structure(self):
        """Create one folder per known period/genre combination."""
        for period in self.periods.values():
            for genre in self.genres.values():
                (self.output_dir / period / genre).mkdir(parents=True, exist_ok=True)

    def generate_report(self, stats, processed_files, error_files):
        """Write ``organization_report.txt`` into the output directory.

        Args:
            stats: Mapping period -> genre -> number of copied files.
            processed_files: Parsed-info dicts of the copied files.
            error_files: Names of the files that were skipped.
        """
        report_path = self.output_dir / "organization_report.txt"
        # Allows the report to be written even if the tree was not created yet.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        lines = [
            "GerManC Corpus Organization Report\n",
            "=" * 40 + "\n\n",
            f"Total files processed: {len(processed_files)}\n",
            f"Total files with errors: {len(error_files)}\n\n",
            "Files by Period and Genre:\n",
            "-" * 30 + "\n",
        ]

        for period in sorted(stats):
            lines.append(f"\n{period}:\n")
            for genre in sorted(stats[period]):
                lines.append(f"  {genre}: {stats[period][genre]} files\n")

        lines.append("\n\nYear Distribution:\n")
        lines.append("-" * 20 + "\n")
        years = [info['year'] for info in processed_files]
        if years:
            earliest, latest = min(years), max(years)
            lines.append(f"Earliest: {earliest}\n")
            lines.append(f"Latest: {latest}\n")
            lines.append(f"Range: {latest - earliest} years\n")

        if error_files:
            lines.append("\n\nFiles with errors:\n")
            lines.append("-" * 20 + "\n")
            lines.extend(f"  {error_file}\n" for error_file in error_files)

        with open(report_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)

        print(f"\n📊 Report saved to: {report_path}")

def main():
    """Organise the corpus found at the paths configured below and print a summary."""
    # Adjust these two locations to your machine.
    source_directory = "/Users/rohan/Downloads/2544/LING-GATE/"  # flat folder with the XML files
    output_directory = "/Users/rohan/Downloads/2544/organized_germanc"  # root of the organised tree

    result = GerManCOrganizer(source_directory, output_directory).organize_files()
    _, processed, errors = result

    # Console summary of the run.
    print("\n🎉 Organization complete!")
    print(f"📁 Organized {len(processed)} files")
    print(f"❌ {len(errors)} errors")
    print("📊 Check organization_report.txt for details")


if __name__ == "__main__":
    main()

✓ SERM_P1_WOD_1660_LeichPredigt.xml → 1650-1700/Sermons
✓ SERM_P3_OOD_1792_Sonntagen.xml → 1750-1800/Sermons
✓ LEGA_P1_WOD_1654_HoffgerichtsOrdnung.xml → 1650-1700/Legal
✓ SCIE_P2_WOD_1741_Erden.xml → 1700-1750/Scientific
✓ HUMA_P1_NoD_1667_Ratseburg.xml → 1650-1700/Humanities
✓ LEGA_P3_NoD_1751_FeuerOrdnung.xml → 1750-1800/Legal
✓ DRAM_P1_OOD_1675_Pirrus.xml → 1650-1700/Drama
✓ SCIE_P1_OOD_1681_CometenGespoetts.xml → 1650-1700/Scientific
✓ NARR_P1_OMD_1671_Ruebezahl.xml → 1650-1700/Narrative
✓ HUMA_P1_WMD_1692_Christus.xml → 1650-1700/Humanities
✓ HUMA_P3_WMD_1772_Baukunst.xml → 1750-1800/Humanities
✓ SERM_P2_OMD_1715_Beerdigung.xml → 1700-1750/Sermons
✓ SCIE_P3_NoD_1799_Gasarten.xml → 1750-1800/Scientific
✓ SCIE_P2_WMD_1744_SelbstArtzt.xml → 1700-1750/Scientific
✓ SERM_P3_OMD_1760_Folgen.xml → 1750-1800/Sermons
✓ SERM_P3_WOD_1792_Hegel.xml → 1750-1800/Sermons
✓ NEWS_P2_OOD_1702_muenchen2.xml → 1700-1750/Newspapers
✓ DRAM_P2_WOD_1748_Hoelle.xml → 1700-1750/Drama
✓ NARR_P2_NoD_1715_Afr

# Preprocessing 

In [1]:
#!/usr/bin/env python3
"""
GerManC GATE XML Preprocessor - Fixed Version
Properly handles GATE XML format with TextWithNodes and Annotation elements.
"""

import xml.etree.ElementTree as ET
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict
import re
from tqdm import tqdm
import logging
from datetime import datetime

class GateXMLPreprocessor:
    def __init__(self, organized_dir, output_dir):
        self.organized_dir = Path(organized_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        
        # Initialize data containers
        self.documents = []
        self.tokens = []
        self.linguistic_features = defaultdict(list)
        
        # Initialize logging
        self.setup_logging()
        self.stats = {
            'total_files': 0,
            'processed': 0,
            'xml_errors': 0,
            'processing_errors': 0,
            'empty_files': 0
        }
    
    def setup_logging(self):
        """Setup logging for error tracking"""
        log_file = self.output_dir / f"processing_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        
    def process_all_files(self):
        """Process every XML file below ``organized_dir`` and save the results.

        The tree is expected to be ``<period>/<genre>/*.xml``; the two folder
        names are handed on as each document's period and genre.  Directories
        and files are visited in sorted order so that the order of documents
        and tokens in the saved output does not depend on the filesystem.

        Returns:
            dict: The ``self.stats`` counters after the run.
        """
        # Collect all XML files together with the period/genre they sit in.
        xml_files = []
        for period_dir in sorted(self.organized_dir.iterdir()):
            if not period_dir.is_dir():
                continue
            for genre_dir in sorted(period_dir.iterdir()):
                if not genre_dir.is_dir():
                    continue
                xml_files.extend(
                    (xml_file, period_dir.name, genre_dir.name)
                    for xml_file in sorted(genre_dir.glob("*.xml"))
                )

        self.stats['total_files'] = len(xml_files)

        # Process with progress bar
        with tqdm(xml_files, desc="Processing GATE XML files", unit="file") as pbar:
            for xml_file, period, genre in pbar:
                pbar.set_postfix(file=xml_file.name[:30])

                try:
                    doc_data = self.process_gate_xml(xml_file, period, genre)
                except ET.ParseError as e:
                    # Malformed XML is counted separately from other failures.
                    self.stats['xml_errors'] += 1
                    self.logger.error(f"XML Error in {xml_file.name}: {e}")
                except Exception as e:
                    # Best effort: one broken file must not stop the whole run.
                    self.stats['processing_errors'] += 1
                    self.logger.error(f"Processing Error in {xml_file.name}: {e}")
                else:
                    if doc_data and doc_data['tokens']:
                        self.documents.append(doc_data)
                        self.stats['processed'] += 1
                        self.logger.info(f"✓ Processed: {xml_file.name} ({len(doc_data['tokens'])} tokens)")
                    else:
                        self.stats['empty_files'] += 1
                        self.logger.warning(f"No tokens found in: {xml_file.name}")

        # Save processed data
        self.save_processed_data()

        # Print summary
        self.print_summary()

        return self.stats
    
    def process_gate_xml(self, xml_path, period, genre):
        """Process GATE XML file with TextWithNodes structure"""
        try:
            tree = ET.parse(xml_path)
            root = tree.getroot()
            
            # Extract document metadata
            doc_id = xml_path.stem
            doc_data = {
                'doc_id': doc_id,
                'period': period,
                'genre': genre,
                'filename': xml_path.name,
                'year': self.extract_year(xml_path.name),
                'region': self.extract_region(xml_path.name),
                'text': '',
                'tokens': [],
                'sentences': [],
                'token_count': 0,
                'unique_words': set()
            }
            
            # Extract text with nodes
            text_with_nodes = root.find('.//TextWithNodes')
            if text_with_nodes is None:
                self.logger.warning(f"No TextWithNodes found in {xml_path.name}")
                return None
            
            # Parse text and node positions
            text_content, node_positions = self.parse_text_with_nodes(text_with_nodes)
            doc_data['text'] = text_content
            
            # Extract annotations
            annotation_set = root.find('.//AnnotationSet')
            annotations = {}
            if annotation_set is not None:
                annotations = self.parse_annotations(annotation_set)
            
            # Process tokens using annotations and text
            tokens, sentences = self.extract_tokens_and_sentences(
                text_content, node_positions, annotations, doc_id, period, genre
            )
            
            doc_data['tokens'] = tokens
            doc_data['sentences'] = sentences
            doc_data['token_count'] = len(tokens)
            doc_data['unique_words'] = len(set(token['normalized'].lower() for token in tokens if token['normalized']))
            
            # Add tokens to global list
            self.tokens.extend(tokens)
            
            return doc_data
            
        except Exception as e:
            self.logger.error(f"Error processing {xml_path.name}: {e}")
            raise
    
    def parse_text_with_nodes(self, text_with_nodes_elem):
        """Parse TextWithNodes element to extract text and node positions"""
        text_parts = []
        node_positions = {}
        
        # Process all text and node elements
        for elem in text_with_nodes_elem:
            if elem.tag == 'Node':
                node_id = elem.get('id')
                if node_id:
                    current_pos = len(''.join(text_parts))
                    node_positions[int(node_id)] = current_pos
            elif elem.text:
                text_parts.append(elem.text)
            
            # Also handle tail text after nodes
            if elem.tail:
                text_parts.append(elem.tail)
        
        # Handle text directly in TextWithNodes
        if text_with_nodes_elem.text:
            text_parts.insert(0, text_with_nodes_elem.text)
        
        full_text = ''.join(text_parts)
        return full_text, node_positions
    
    def parse_annotations(self, annotation_set):
        """Parse annotation elements to extract linguistic features"""
        annotations = {}
        
        for annotation in annotation_set.findall('Annotation'):
            ann_id = annotation.get('Id')
            ann_type = annotation.get('Type')
            start_node = annotation.get('StartNode')
            end_node = annotation.get('EndNode')
            
            if not all([ann_id, ann_type, start_node, end_node]):
                continue
            
            # Extract features
            features = {}
            for feature in annotation.findall('Feature'):
                name_elem = feature.find('Name')
                value_elem = feature.find('Value')
                
                if name_elem is not None and value_elem is not None:
                    feature_name = name_elem.text
                    feature_value = value_elem.text
                    features[feature_name] = feature_value
            
            annotations[ann_id] = {
                'type': ann_type,
                'start_node': int(start_node),
                'end_node': int(end_node),
                'features': features
            }
        
        return annotations
    
    def extract_tokens_and_sentences(self, text, node_positions, annotations, doc_id, period, genre):
        """Extract tokens and sentences from text and annotations"""
        tokens = []
        sentences = []
        
        # Sort annotations by start position
        token_annotations = []
        for ann_id, ann in annotations.items():
            if ann['type'] == 'Token':
                token_annotations.append((ann['start_node'], ann['end_node'], ann['features'], ann_id))
        
        token_annotations.sort(key=lambda x: x[0])  # Sort by start node
        
        current_sentence = []
        sentence_id = 0
        token_id = 0
        
        for start_node, end_node, features, ann_id in token_annotations:
            # Get text span using node positions
            start_pos = node_positions.get(start_node, 0)
            end_pos = node_positions.get(end_node, len(text))
            
            if start_pos < len(text) and end_pos <= len(text) and start_pos < end_pos:
                token_text = text[start_pos:end_pos].strip()
                
                if not token_text:
                    continue
                
                # Extract token features
                token_data = self.create_token_data(
                    token_text, features, doc_id, sentence_id, token_id, period, genre
                )
                
                if token_data:
                    tokens.append(token_data)
                    current_sentence.append(token_data)
                    token_id += 1
                    
                    # Check for sentence boundaries (simple heuristic)
                    if self.is_sentence_end(token_text):
                        if current_sentence:
                            sentence_text = ' '.join(t['original'] for t in current_sentence)
                            sentences.append({
                                'sentence_id': sentence_id,
                                'text': sentence_text,
                                'tokens': current_sentence.copy(),
                                'token_count': len(current_sentence)
                            })
                            current_sentence = []
                            sentence_id += 1
        
        # Handle remaining tokens as final sentence
        if current_sentence:
            sentence_text = ' '.join(t['original'] for t in current_sentence)
            sentences.append({
                'sentence_id': sentence_id,
                'text': sentence_text,
                'tokens': current_sentence,
                'token_count': len(current_sentence)
            })
        
        return tokens, sentences
    
    def create_token_data(self, token_text, features, doc_id, sent_id, token_id, period, genre):
        """Create token data structure from text and features"""
        
        # Get linguistic features
        original = features.get('string', token_text)
        normalized = features.get('norm', original)
        lemma = features.get('lemma', normalized)
        pos = features.get('pos', '')
        morph = features.get('morph', '')
        
        # Skip very short or empty tokens
        if len(original.strip()) == 0:
            return None
        
        token_data = {
            'doc_id': doc_id,
            'sentence_id': sent_id,
            'token_id': token_id,
            'period': period,
            'genre': genre,
            'original': original.strip(),
            'normalized': normalized.strip() if normalized else original.strip(),
            'lemma': lemma.strip() if lemma else normalized.strip() if normalized else original.strip(),
            'pos': pos,
            'morphology': morph,
            'is_spelling_variant': original.lower() != normalized.lower() if normalized else False,
            'word_length': len(original.strip()),
            'has_archaic_spelling': self.is_archaic_spelling(original),
            'is_punctuation': self.is_punctuation(original)
        }
        
        # Track linguistic changes
        if token_data['is_spelling_variant']:
            self.linguistic_features['spelling_variants'].append({
                'original': original,
                'normalized': normalized,
                'lemma': lemma,
                'period': period,
                'genre': genre,
                'pos': pos
            })
        
        return token_data
    
    def is_sentence_end(self, token):
        """Simple sentence boundary detection"""
        return token.strip() in '.!?;'
    
    def is_punctuation(self, token):
        """Check if token is punctuation"""
        return all(c in '.,!?;:()[]{}"\'-' for c in token.strip())
    
    def is_archaic_spelling(self, word):
        """Identify archaic spelling patterns in Early New High German"""
        if not word or len(word) < 2:
            return False
            
        word_lower = word.lower()
        
        # Common archaic patterns in Early New High German
        archaic_patterns = [
            r'.*uo.*',      # uo diphthong (guot -> gut)
            r'.*ie.*',      # ie for long i (liebe)
            r'.*ey.*',      # ey/ei variations
            r'.*ck$',       # archaic endings
            r'^v[aeiou]',   # v- beginnings (vmb -> um)
            r'.*th.*',      # th spellings (thun -> tun)
            r'.*umb$',      # umb endings (vmb -> um)
            r'.*tz$',       # tz endings
            r'.*ff.*',      # double f
            r'.*ss.*',      # double s patterns
            r'.*ů.*',       # archaic u with circle
            r'.*ä.*',       # archaic a-umlaut forms
            r'.*ö.*',       # archaic o-umlaut forms
            r'.*ü.*',       # archaic u-umlaut forms
        ]
        
        return any(re.search(pattern, word_lower) for pattern in archaic_patterns)
    
    def extract_year(self, filename):
        """Extract year from filename"""
        match = re.search(r'_(\d{4})_', filename)
        return int(match.group(1)) if match else None
    
    def extract_region(self, filename):
        """Extract region code from filename"""
        match = re.search(r'_([A-Za-z]+)_\d{4}_', filename)
        return match.group(1) if match else None
    
    def save_processed_data(self):
        """Save all processed data in multiple formats"""
        
        # 1. Save documents metadata as JSON
        docs_file = self.output_dir / "documents.json"
        with open(docs_file, 'w', encoding='utf-8') as f:
            # Convert sets to lists for JSON serialization
            docs_for_json = []
            for doc in self.documents:
                doc_copy = doc.copy()
                if 'unique_words' in doc_copy and isinstance(doc_copy['unique_words'], set):
                    doc_copy['unique_words'] = list(doc_copy['unique_words'])
                docs_for_json.append(doc_copy)
            json.dump(docs_for_json, f, indent=2, ensure_ascii=False)
        
        # 2. Save tokens as CSV for analysis
        if self.tokens:
            tokens_df = pd.DataFrame(self.tokens)
            tokens_file = self.output_dir / "tokens.csv"
            tokens_df.to_csv(tokens_file, index=False, encoding='utf-8')
        
        # 3. Save linguistic features
        features_file = self.output_dir / "linguistic_features.json"
        with open(features_file, 'w', encoding='utf-8') as f:
            json.dump(dict(self.linguistic_features), f, indent=2, ensure_ascii=False)
        
        # 4. Save summary statistics
        self.save_statistics()
        
        print(f"📁 Saved to {self.output_dir}/")
        print(f"   - documents.json ({len(self.documents)} docs)")
        print(f"   - tokens.csv ({len(self.tokens)} tokens)")
        print(f"   - linguistic_features.json")
        print(f"   - statistics.json")
    
    def save_statistics(self):
        """Generate and save corpus statistics"""
        
        if not self.documents:
            return
        
        # Period distribution
        period_stats = defaultdict(int)
        for doc in self.documents:
            period_stats[doc['period']] += 1
        
        # Genre distribution
        genre_stats = defaultdict(int)
        for doc in self.documents:
            genre_stats[doc['genre']] += 1
        
        # Year distribution
        year_stats = defaultdict(int)
        for doc in self.documents:
            if doc.get('year'):
                year_stats[doc['year']] += 1
        
        # Spelling variants by period
        variants_by_period = defaultdict(int)
        for variant in self.linguistic_features['spelling_variants']:
            variants_by_period[variant['period']] += 1
        
        # Token statistics
        pos_distribution = defaultdict(int)
        archaic_count = 0
        
        for token in self.tokens:
            if token.get('pos'):
                pos_distribution[token['pos']] += 1
            if token.get('has_archaic_spelling'):
                archaic_count += 1
        
        stats = {
            'processing_summary': self.stats,
            'corpus_stats': {
                'total_documents': len(self.documents),
                'total_tokens': len(self.tokens),
                'total_sentences': sum(len(doc.get('sentences', [])) for doc in self.documents),
                'average_tokens_per_doc': len(self.tokens) / len(self.documents) if self.documents else 0,
                'average_sentences_per_doc': sum(len(doc.get('sentences', [])) for doc in self.documents) / len(self.documents) if self.documents else 0
            },
            'temporal_distribution': {
                'period_distribution': dict(period_stats),
                'year_distribution': dict(sorted(year_stats.items()))
            },
            'genre_distribution': dict(genre_stats),
            'linguistic_features': {
                'pos_distribution': dict(pos_distribution),
                'spelling_variants_total': len(self.linguistic_features['spelling_variants']),
                'variants_by_period': dict(variants_by_period),
                'archaic_spelling_count': archaic_count,
                'spelling_variant_rate': len(self.linguistic_features['spelling_variants']) / len(self.tokens) if self.tokens else 0,
                'archaic_spelling_rate': archaic_count / len(self.tokens) if self.tokens else 0
            }
        }
        
        stats_file = self.output_dir / "statistics.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)
    
    def print_summary(self):
        """Print processing summary"""
        print(f"\n🎉 GATE XML Processing complete!")
        print(f"📄 Total files: {self.stats['total_files']}")
        print(f"✅ Processed: {self.stats['processed']}")
        print(f"❌ XML errors: {self.stats['xml_errors']}")
        print(f"⚠️ Processing errors: {self.stats['processing_errors']}")
        print(f"📭 Empty files: {self.stats['empty_files']}")
        print(f"🔤 Total tokens: {len(self.tokens)}")
        print(f"📚 Total documents: {len(self.documents)}")
        
        if self.linguistic_features['spelling_variants']:
            print(f"🔄 Spelling variants: {len(self.linguistic_features['spelling_variants'])}")
            
        if self.tokens:
            archaic_count = sum(1 for t in self.tokens if t.get('has_archaic_spelling'))
            print(f"📜 Archaic spellings: {archaic_count} ({archaic_count/len(self.tokens)*100:.1f}%)")

def main():
    """Run the GATE XML preprocessing with the paths configured below.

    Returns:
        dict: The counters returned by ``process_all_files``.
    """
    # Adjust these two locations to your machine.
    organized_dir = "/Users/rohan/Downloads/2544/organized_germanc"  # tree produced by the organising step
    output_dir = "/Users/rohan/Downloads/2544/processed_germanc"     # destination of the processed data

    stats = GateXMLPreprocessor(organized_dir, output_dir).process_all_files()

    print("\n📊 GATE XML Preprocessing complete!")
    print("Ready for Phase 3: Database creation")

    return stats


if __name__ == "__main__":
    main()

Processing GATE XML files:   0%| | 0/336 [00:00<?, ?file/s, file=HUMA_P2_NoD_1732025-06-15 23:50:24,998 - INFO - ✓ Processed: HUMA_P2_NoD_1737_Koenigstein.xml (2339 tokens)
Processing GATE XML files:   0%| | 1/336 [00:00<01:54,  2.92file/s, file=HUMA_P22025-06-15 23:50:25,333 - INFO - ✓ Processed: HUMA_P2_OMD_1725_Hass.xml (2305 tokens)
Processing GATE XML files:   1%| | 2/336 [00:00<01:52,  2.96file/s, file=HUMA_P22025-06-15 23:50:25,742 - INFO - ✓ Processed: HUMA_P2_WOD_1744_Pfaltz.xml (2331 tokens)
Processing GATE XML files:   1%| | 3/336 [00:01<02:03,  2.70file/s, file=HUMA_P22025-06-15 23:50:26,233 - INFO - ✓ Processed: HUMA_P2_OOD_1707_HundertNarren.xml (2483 tokens)
Processing GATE XML files:   1%| | 4/336 [00:01<02:18,  2.39file/s, file=HUMA_P22025-06-15 23:50:26,626 - INFO - ✓ Processed: HUMA_P2_WMD_1737_Curiositaeten.xml (2449 tokens)
Processing GATE XML files:   1%| | 5/336 [00:01<02:15,  2.44file/s, file=HUMA_P22025-06-15 23:50:26,989 - INFO - ✓ Processed: HUMA_P2_OOD_1704_

📁 Saved to /Users/rohan/Downloads/2544/processed_germanc/
   - documents.json (336 docs)
   - tokens.csv (774373 tokens)
   - linguistic_features.json
   - statistics.json

🎉 GATE XML Processing complete!
📄 Total files: 336
✅ Processed: 336
❌ XML errors: 0
⚠️ Processing errors: 0
📭 Empty files: 0
🔤 Total tokens: 774373
📚 Total documents: 336
🔄 Spelling variants: 115929
📜 Archaic spellings: 117413 (15.2%)

📊 GATE XML Preprocessing complete!
Ready for Phase 3: Database creation


# Preprocessing Validations

In [5]:
#!/usr/bin/env python3
"""
GerManC Preprocessing Validation Suite
Ensures preprocessing extracts correct data for RAG pipeline
"""

import json
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
import re
from typing import Dict, List, Tuple
import logging

class GerManCValidator:
    def __init__(self, processed_dir):
        self.processed_dir = Path(processed_dir)
        self.validation_results = {}
        self.critical_errors = []
        self.warnings = []
        
        # Load processed data
        self.load_processed_data()
        
        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
    
    def load_processed_data(self):
        """Load all processed data files"""
        try:
            # Load documents
            with open(self.processed_dir / "documents.json", 'r', encoding='utf-8') as f:
                self.documents = json.load(f)
            
            # Load tokens
            self.tokens_df = pd.read_csv(self.processed_dir / "tokens.csv")
            
            # Load linguistic features
            with open(self.processed_dir / "linguistic_features.json", 'r', encoding='utf-8') as f:
                self.linguistic_features = json.load(f)
            
            # Load statistics
            with open(self.processed_dir / "statistics.json", 'r', encoding='utf-8') as f:
                self.statistics = json.load(f)
                
        except Exception as e:
            self.critical_errors.append(f"Failed to load processed data: {e}")
            raise
    
    def run_full_validation(self):
        """Run complete validation suite"""
        print("🔍 Running GerManC Preprocessing Validation Suite...")
        print("=" * 60)
        
        # Critical validations for RAG pipeline
        self.validate_temporal_data()
        self.validate_spelling_variants()
        self.validate_linguistic_features()
        self.validate_text_quality()
        self.validate_data_completeness()
        self.validate_rag_readiness()
        
        # Generate validation report
        self.generate_validation_report()
        
        # Create sample queries to test data
        self.test_sample_queries()
        
        return self.validation_results
    
    def validate_temporal_data(self):
        """Validate temporal distribution and dating"""
        print("\n📅 Validating Temporal Data...")
        
        temporal_issues = []
        
        # Check if we have documents across time periods
        periods = set()
        years = []
        
        for doc in self.documents:
            if doc.get('period'):
                periods.add(doc['period'])
            if doc.get('year') and doc['year'] > 1500 and doc['year'] < 2000:
                years.append(doc['year'])
            elif doc.get('year'):
                temporal_issues.append(f"Suspicious year: {doc['year']} in {doc['filename']}")
        
        # Validate temporal coverage
        if len(periods) < 2:
            self.critical_errors.append("Insufficient temporal periods - need multiple periods for evolution tracking")
        
        if len(years) < len(self.documents) * 0.8:
            self.warnings.append("Many documents missing year information")
        
        # Check temporal distribution
        year_range = max(years) - min(years) if years else 0
        
        self.validation_results['temporal'] = {
            'periods_found': list(periods),
            'year_range': year_range,
            'docs_with_years': len(years),
            'temporal_coverage': year_range > 200,  # Should span centuries
            'issues': temporal_issues
        }
        
        print(f"   ✓ Periods found: {list(periods)}")
        print(f"   ✓ Year range: {min(years) if years else 'N/A'} - {max(years) if years else 'N/A'}")
        print(f"   ✓ Documents with years: {len(years)}/{len(self.documents)}")
    
    def validate_spelling_variants(self):
        """Validate spelling variant extraction - CRITICAL for RAG"""
        print("\n🔤 Validating Spelling Variants...")
        
        spelling_issues = []
        variants = self.linguistic_features.get('spelling_variants', [])
        
        if not variants:
            self.critical_errors.append("NO SPELLING VARIANTS FOUND - Critical for language evolution tracking!")
            return
        
        # Check for meaningful variants
        valid_variants = 0
        archaic_patterns = 0
        
        for variant in variants:
            original = variant.get('original', '')
            normalized = variant.get('normalized', '')
            
            if original and normalized and original != normalized:
                valid_variants += 1
                
                # Check for known archaic patterns
                if self.has_archaic_patterns(original):
                    archaic_patterns += 1
            else:
                spelling_issues.append(f"Invalid variant: {original} -> {normalized}")
        
        # Validate variant quality
        variant_rate = len(variants) / len(self.tokens_df) if not self.tokens_df.empty else 0
        
        if variant_rate < 0.05:  # Less than 5% variants seems low for historical texts
            self.warnings.append(f"Low spelling variant rate ({variant_rate:.2%}) - expected higher for historical texts")
        
        # Test specific known changes
        self.test_known_spelling_changes(variants)
        
        self.validation_results['spelling_variants'] = {
            'total_variants': len(variants),
            'valid_variants': valid_variants,
            'archaic_patterns': archaic_patterns,
            'variant_rate': variant_rate,
            'issues': spelling_issues
        }
        
        print(f"   ✓ Total variants: {len(variants)}")
        print(f"   ✓ Valid variants: {valid_variants}")
        print(f"   ✓ Variant rate: {variant_rate:.2%}")
        print(f"   ✓ Archaic patterns found: {archaic_patterns}")
    
    def has_archaic_patterns(self, word):
        """Check for known archaic German patterns"""
        archaic_patterns = [
            r'.*th.*',      # thun -> tun
            r'.*uo.*',      # guot -> gut
            r'.*ey.*',      # archaic diphthongs
            r'.*ck$',       # archaic endings
            r'^v[aeiou]',   # vmb -> um
        ]
        
        return any(re.search(pattern, word.lower()) for pattern in archaic_patterns)
    
    def test_known_spelling_changes(self, variants):
        """Test for specific known German spelling changes"""
        print("   📝 Testing known spelling changes...")
        
        known_changes = {
            'th_to_t': (r'.*th.*', r'.*t.*'),      # thun -> tun
            'uo_to_u': (r'.*uo.*', r'.*u.*'),      # guot -> gut  
            'v_to_u': (r'^v.*', r'^u.*'),          # vmb -> um
            'ck_changes': (r'.*ck.*', r'.*k.*'),   # various ck changes
        }
        
        found_changes = defaultdict(int)
        
        for variant in variants:
            original = variant.get('original', '').lower()
            normalized = variant.get('normalized', '').lower()
            
            for change_type, (old_pattern, new_pattern) in known_changes.items():
                if re.search(old_pattern, original) and re.search(new_pattern, normalized):
                    found_changes[change_type] += 1
        
        for change_type, count in found_changes.items():
            print(f"      - {change_type}: {count} examples")
        
        if not found_changes:
            self.warnings.append("No known spelling change patterns detected")
    
    def validate_linguistic_features(self):
        """Validate POS tags and morphological features"""
        print("\n🏷️ Validating Linguistic Features...")
        
        linguistic_issues = []
        
        # Check POS tag distribution
        pos_tags = self.tokens_df['pos'].value_counts() if 'pos' in self.tokens_df.columns else pd.Series()
        
        # Expected major POS categories for German
        expected_pos = ['NN', 'ART', 'VVFIN', 'ADV', 'ADJA', 'APPR', 'PRON']
        found_major_pos = 0
        
        if not pos_tags.empty:
            found_major_pos = sum(1 for pos in expected_pos if any(pos in tag for tag in pos_tags.index))
        
        if found_major_pos < 4:
            self.warnings.append("Few major POS categories found - check POS tagging")
        
        # Check morphological features
        morph_features = self.tokens_df['morphology'].dropna() if 'morphology' in self.tokens_df.columns else pd.Series()
        
        self.validation_results['linguistic_features'] = {
            'pos_tag_count': len(pos_tags),
            'major_pos_found': found_major_pos,
            'morph_features_present': len(morph_features) > 0,
            'pos_distribution': dict(pos_tags.head(10)) if not pos_tags.empty else {},
            'issues': linguistic_issues
        }
        
        print(f"   ✓ POS tags found: {len(pos_tags)}")
        print(f"   ✓ Major POS categories: {found_major_pos}/7")
        print(f"   ✓ Morphological features: {'Yes' if len(morph_features) > 0 else 'No'}")
    
    def validate_text_quality(self):
        """Validate text extraction quality"""
        print("\n📝 Validating Text Quality...")
        
        quality_issues = []
        avg_token_length = 0
        
        # Check for empty or very short texts
        short_docs = 0
        empty_docs = 0
        
        for doc in self.documents:
            token_count = doc.get('token_count', 0)
            if token_count == 0:
                empty_docs += 1
            elif token_count < 50:  # Very short documents
                short_docs += 1
        
        # Check for reasonable sentence lengths
        if not self.tokens_df.empty:
            avg_token_length = self.tokens_df['word_length'].mean() if 'word_length' in self.tokens_df.columns else 0
            
            if avg_token_length > 0 and (avg_token_length < 3 or avg_token_length > 15):
                quality_issues.append(f"Unusual average word length: {avg_token_length:.1f}")
        
        self.validation_results['text_quality'] = {
            'empty_documents': empty_docs,
            'short_documents': short_docs,
            'average_token_length': avg_token_length,
            'issues': quality_issues
        }
        
        print(f"   ✓ Empty documents: {empty_docs}")
        print(f"   ✓ Short documents: {short_docs}")
        if avg_token_length > 0:
            print(f"   ✓ Average token length: {avg_token_length:.1f}")
        else:
            print("   ⚠ No word length data available")
    
    def validate_data_completeness(self):
        """Validate data completeness for RAG pipeline"""
        print("\n📊 Validating Data Completeness...")
        
        completeness_issues = []
        
        # Check essential fields
        essential_fields = ['doc_id', 'period', 'genre', 'original', 'normalized']
        missing_fields = []
        
        if self.tokens_df.empty:
            self.critical_errors.append("No token data found!")
            return
        
        for field in essential_fields:
            if field not in self.tokens_df.columns:
                missing_fields.append(field)
            elif self.tokens_df[field].isna().sum() > len(self.tokens_df) * 0.1:  # >10% missing
                completeness_issues.append(f"Many missing values in {field}")
        
        if missing_fields:
            self.critical_errors.append(f"Missing essential fields: {missing_fields}")
        
        # Check genre distribution
        genres = self.tokens_df['genre'].value_counts() if 'genre' in self.tokens_df.columns else pd.Series()
        
        self.validation_results['completeness'] = {
            'total_tokens': len(self.tokens_df),
            'missing_fields': missing_fields,
            'genre_distribution': dict(genres) if not genres.empty else {},
            'issues': completeness_issues
        }
        
        print(f"   ✓ Total tokens: {len(self.tokens_df)}")
        print(f"   ✓ Missing essential fields: {missing_fields}")
        print(f"   ✓ Genres found: {list(genres.keys()) if not genres.empty else 'None'}")
    
    def validate_rag_readiness(self):
        """Validate data is ready for RAG pipeline"""
        print("\n🤖 Validating RAG Readiness...")
        
        rag_issues = []
        readiness_score = 0
        
        # Check 1: Temporal evolution data
        if len(self.validation_results.get('temporal', {}).get('periods_found', [])) >= 2:
            readiness_score += 20
        else:
            rag_issues.append("Need multiple time periods for evolution tracking")
        
        # Check 2: Spelling variants
        if self.validation_results.get('spelling_variants', {}).get('total_variants', 0) > 100:
            readiness_score += 25
        else:
            rag_issues.append("Need more spelling variants for language change analysis")
        
        # Check 3: Sufficient data volume
        if len(self.tokens_df) > 10000:
            readiness_score += 20
        else:
            rag_issues.append("Need more tokens for robust analysis")
        
        # Check 4: Linguistic features
        if self.validation_results.get('linguistic_features', {}).get('pos_tag_count', 0) > 10:
            readiness_score += 15
        else:
            rag_issues.append("Need more linguistic features")
        
        # Check 5: Data quality
        if self.validation_results.get('text_quality', {}).get('empty_documents', 0) < len(self.documents) * 0.1:
            readiness_score += 20
        else:
            rag_issues.append("Too many empty/poor quality documents")
        
        is_ready = readiness_score >= 80
        
        self.validation_results['rag_readiness'] = {
            'readiness_score': readiness_score,
            'is_ready': is_ready,
            'issues': rag_issues
        }
        
        print(f"   📊 RAG Readiness Score: {readiness_score}/100")
        print(f"   {'✅ READY for RAG pipeline' if is_ready else '❌ NOT READY - fix issues first'}")
    
    def test_sample_queries(self):
        """Test sample queries that the RAG system should handle"""
        print("\n🔍 Testing Sample RAG Queries...")
        
        sample_queries = [
            "How did 'thun' become 'tun'?",
            "When did German spelling change?",
            "What words changed between 1600-1800?",
            "Show evolution of verb endings",
            "Compare religious vs scientific language"
        ]
        
        query_results = {}
        
        for query in sample_queries:
            # Simulate query processing
            relevant_data = self.simulate_query_processing(query)
            query_results[query] = relevant_data
            
            if relevant_data['can_answer']:
                print(f"   ✅ '{query}' - Can answer with {relevant_data['evidence_count']} examples")
            else:
                print(f"   ❌ '{query}' - Insufficient data")
                self.warnings.append(f"Cannot answer query: {query}")
        
        self.validation_results['sample_queries'] = query_results
    
    def simulate_query_processing(self, query):
        """Simulate how well we can answer a query with current data"""
        # Simple simulation - check if we have relevant data
        
        can_answer = True
        evidence_count = 0
        
        # Check for spelling evolution queries
        if any(word in query.lower() for word in ['thun', 'tun', 'spelling', 'change']):
            variants = self.linguistic_features.get('spelling_variants', [])
            evidence_count = len([v for v in variants if 'th' in v.get('original', '').lower()])
            can_answer = evidence_count > 5
        
        # Check for temporal queries
        elif any(word in query.lower() for word in ['when', 'between', 'evolution']):
            periods = self.validation_results.get('temporal', {}).get('periods_found', [])
            can_answer = len(periods) >= 2
            evidence_count = len(periods) * 100  # Estimate
        
        # Check for genre comparison queries
        elif any(word in query.lower() for word in ['religious', 'scientific', 'compare']):
            genres = self.validation_results.get('completeness', {}).get('genre_distribution', {})
            can_answer = len(genres) >= 2
            evidence_count = sum(genres.values()) if isinstance(genres, dict) and genres else 0
        
        else:
            # General query
            evidence_count = len(self.tokens_df) // 10
            can_answer = len(self.tokens_df) > 1000
        
        return {
            'can_answer': can_answer,
            'evidence_count': evidence_count
        }
    
    def convert_to_json_serializable(self, obj):
        """Convert pandas types to JSON serializable types"""
        if hasattr(obj, 'item'):  # pandas scalar
            return obj.item()
        elif hasattr(obj, 'tolist'):  # pandas array/series
            return obj.tolist()
        elif isinstance(obj, dict):
            return {k: self.convert_to_json_serializable(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.convert_to_json_serializable(item) for item in obj]
        else:
            return obj
    
    def generate_validation_report(self):
        """Generate comprehensive validation report"""
        print("\n📋 Generating Validation Report...")
        
        report_path = self.processed_dir / "validation_report.json"
        
        report = {
            'validation_summary': {
                'critical_errors': len(self.critical_errors),
                'warnings': len(self.warnings),
                'overall_status': 'READY' if not self.critical_errors else 'NEEDS_FIXES'
            },
            'critical_errors': self.critical_errors,
            'warnings': self.warnings,
            'detailed_results': self.validation_results,
            'recommendations': self.generate_recommendations()
        }
        
        # Convert pandas types to JSON serializable
        report = self.convert_to_json_serializable(report)
        
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        
        print(f"   📄 Report saved to: {report_path}")
        
        # Print summary
        print("\n" + "="*60)
        print("🎯 VALIDATION SUMMARY")
        print("="*60)
        
        if self.critical_errors:
            print("❌ CRITICAL ERRORS:")
            for error in self.critical_errors:
                print(f"   • {error}")
        
        if self.warnings:
            print("\n⚠️ WARNINGS:")
            for warning in self.warnings:
                print(f"   • {warning}")
        
        if not self.critical_errors:
            print("✅ NO CRITICAL ERRORS - Data ready for RAG pipeline!")
        else:
            print(f"\n❌ Fix {len(self.critical_errors)} critical errors before proceeding")
        
        return report
    
    def generate_recommendations(self):
        """Generate recommendations for fixing issues"""
        recommendations = []
        
        if self.critical_errors:
            recommendations.append("Fix all critical errors before building RAG system")
        
        if self.validation_results.get('spelling_variants', {}).get('total_variants', 0) < 100:
            recommendations.append("Improve spelling variant extraction - check normalization rules")
        
        if len(self.validation_results.get('temporal', {}).get('periods_found', [])) < 3:
            recommendations.append("Add more temporal periods for better evolution tracking")
        
        if self.validation_results.get('rag_readiness', {}).get('readiness_score', 0) < 80:
            recommendations.append("Address RAG readiness issues before deployment")
        
        return recommendations

def main(processed_dir="/Users/rohan/Downloads/2544/processed_germanc"):
    """Run validation on processed GerManC data.

    Args:
        processed_dir: Directory containing the preprocessing outputs to
            validate. Defaults to the path this notebook was originally run
            with, so a bare ``main()`` call behaves as before.

    Returns:
        dict: The results returned by
        ``GerManCValidator.run_full_validation``.
    """
    print("🔍 GerManC Preprocessing Validation")
    print("==================================")

    validator = GerManCValidator(processed_dir)
    results = validator.run_full_validation()

    print("\n🎉 Validation Complete!")
    print("Check validation_report.json for detailed results")

    return results

# Script entry point: run the validation only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

🔍 GerManC Preprocessing Validation
🔍 Running GerManC Preprocessing Validation Suite...

📅 Validating Temporal Data...
   ✓ Periods found: ['1750-1800', '1650-1700', '1700-1750']
   ✓ Year range: 1654 - 1799
   ✓ Documents with years: 336/336

🔤 Validating Spelling Variants...
   📝 Testing known spelling changes...
      - ck_changes: 4900 examples
      - th_to_t: 7573 examples
      - uo_to_u: 10 examples
      - v_to_u: 1884 examples
   ✓ Total variants: 115929
   ✓ Valid variants: 115929
   ✓ Variant rate: 14.97%
   ✓ Archaic patterns found: 22508

🏷️ Validating Linguistic Features...
   ✓ POS tags found: 62
   ✓ Major POS categories: 6/7
   ✓ Morphological features: Yes

📝 Validating Text Quality...
   ✓ Empty documents: 0
   ✓ Short documents: 0
   ✓ Average token length: 4.8

📊 Validating Data Completeness...
   ✓ Total tokens: 774373
   ✓ Missing essential fields: []
   ✓ Genres found: ['Drama', 'Newspapers', 'Humanities', 'Narrative', 'Legal', 'Scientific', 'Sermons']

🤖 Valida

In [7]:
#!/usr/bin/env python3
"""
Phase 2: PREPARE - Pre-ACCESS Pipeline
Transform GerManC tokens into structured temporal chunks ready for PostgreSQL
"""

import pandas as pd
import json
import numpy as np
from pathlib import Path
from collections import defaultdict, Counter
from typing import List, Dict, Any
import xml.etree.ElementTree as ET
from xml.dom import minidom
import logging
from tqdm import tqdm

class GerManCPrepareProcessor:
    def __init__(self, input_dir: str, output_dir: str):
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        
        self.load_data()
        
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
    
    def load_data(self):
        """Load processed GerManC data"""
        print("📚 Loading data...")
        
        self.tokens_df = pd.read_csv(self.input_dir / "tokens.csv")
        
        with open(self.input_dir / "documents.json", 'r', encoding='utf-8') as f:
            self.documents = json.load(f)
        
        with open(self.input_dir / "linguistic_features.json", 'r', encoding='utf-8') as f:
            self.linguistic_features = json.load(f)
        
        print(f"✓ {len(self.tokens_df)} tokens, {len(self.documents)} documents")
    
    def create_temporal_chunks(self, chunk_size: int = 800) -> List[Dict]:
        """Create chunks for every document that has tokens.

        Args:
            chunk_size: Maximum buffer size passed on to ``chunk_document``.

        Returns:
            List[Dict]: The chunks of all documents, in document order.
            Documents without any rows in the token table are skipped.
        """
        print(f"\n🔄 Creating temporal chunks (size: {chunk_size})...")

        # Split the token table by document once. Filtering the whole table
        # with a boolean mask per document re-scans every token each time.
        # Row order inside each group is the original row order.
        tokens_by_doc = {
            doc_id: frame
            for doc_id, frame in self.tokens_df.groupby('doc_id', sort=False)
        }

        chunks = []

        for doc in tqdm(self.documents, desc="Processing documents"):
            doc_tokens = tokens_by_doc.get(doc['doc_id'])

            if doc_tokens is None or len(doc_tokens) == 0:
                continue

            chunks.extend(self.chunk_document(doc, doc_tokens, chunk_size))

        print(f"✓ Created {len(chunks)} chunks")
        return chunks
    
    def chunk_document(self, doc: Dict, tokens: pd.DataFrame, chunk_size: int) -> List[Dict]:
        """Split document into chunks"""
        chunks = []
        
        if 'position' in tokens.columns:
            tokens = tokens.sort_values('position')
        
        current_chunk = []
        current_size = 0
        
        for _, token in tokens.iterrows():
            current_chunk.append(token)
            current_size += 1
            
            if current_size >= chunk_size or self.is_sentence_end(token):
                if len(current_chunk) > 50:
                    chunk = self.create_chunk(doc, current_chunk)
                    chunks.append(chunk)
                
                current_chunk = []
                current_size = 0
        
        if len(current_chunk) > 50:
            chunk = self.create_chunk(doc, current_chunk)
            chunks.append(chunk)
        
        return chunks
    
    def is_sentence_end(self, token) -> bool:
        """Check sentence boundary"""
        original = str(token.get('original', ''))
        return original.endswith(('.', '!', '?'))
    
    def create_chunk(self, doc: Dict, tokens: List) -> Dict:
        """Create chunk with metadata"""
        original_text = ' '.join([str(token.get('original', '')) for token in tokens])
        normalized_text = ' '.join([str(token.get('normalized', '')) for token in tokens])
        
        spelling_variants = self.extract_spelling_variants(tokens)
        linguistic_features = self.calculate_linguistic_features(tokens)
        
        return {
            'chunk_id': f"{doc['doc_id']}_chunk_{len(tokens)}",
            'doc_id': doc['doc_id'],
            'original_text': original_text,
            'normalized_text': normalized_text,
            'period': doc.get('period', 'unknown'),
            'genre': doc.get('genre', 'unknown'),
            'year': doc.get('year'),
            'filename': doc.get('filename', ''),
            'token_count': len(tokens),
            'spelling_variants': spelling_variants,
            'linguistic_features': linguistic_features
        }
    
    def extract_spelling_variants(self, tokens: List) -> List[Dict]:
        """Extract spelling variants from tokens"""
        variants = []
        
        for i, token in enumerate(tokens):
            original = str(token.get('original', ''))
            normalized = str(token.get('normalized', ''))
            
            if original != normalized and len(original) > 2:
                variants.append({
                    'original': original,
                    'normalized': normalized,
                    'pos': token.get('pos', ''),
                    'position_in_chunk': i
                })
        
        return variants
    
    def calculate_linguistic_features(self, tokens: List) -> Dict:
        """Calculate chunk-level linguistic features"""
        pos_counts = Counter()
        word_lengths = []
        
        for token in tokens:
            pos = token.get('pos', 'UNKNOWN')
            pos_counts[pos] += 1
            
            original = str(token.get('original', ''))
            if original:
                word_lengths.append(len(original))
        
        return {
            'pos_distribution': dict(pos_counts),
            'avg_word_length': np.mean(word_lengths) if word_lengths else 0,
            'total_tokens': len(tokens),
            'unique_pos_count': len(pos_counts),
            'spelling_variant_count': sum(1 for token in tokens 
                                        if str(token.get('original', '')) != str(token.get('normalized', '')))
        }
    
    def create_postgresql_tables(self, chunks: List[Dict]):
        """Create PostgreSQL-ready data structures"""
        print("\n💾 Creating PostgreSQL tables...")
        
        # Chunks table
        chunks_data = []
        for chunk in chunks:
            chunks_data.append({
                'chunk_id': chunk['chunk_id'],
                'doc_id': chunk['doc_id'],
                'period': chunk['period'],
                'genre': chunk['genre'],
                'year': chunk['year'],
                'filename': chunk['filename'],
                'normalized_text': chunk['normalized_text'],
                'original_text': chunk['original_text'],
                'token_count': chunk['token_count']
            })
        
        chunks_df = pd.DataFrame(chunks_data)
        chunks_path = self.output_dir / "chunks_table.csv"
        chunks_df.to_csv(chunks_path, index=False, encoding='utf-8')
        
        # Spelling variants table
        variants_data = []
        for chunk in chunks:
            for variant in chunk['spelling_variants']:
                variants_data.append({
                    'chunk_id': chunk['chunk_id'],
                    'period': chunk['period'],
                    'genre': chunk['genre'],
                    'original': variant['original'],
                    'normalized': variant['normalized'],
                    'pos': variant['pos'],
                    'position_in_chunk': variant['position_in_chunk']
                })
        
        if variants_data:
            variants_df = pd.DataFrame(variants_data)
            variants_path = self.output_dir / "spelling_variants_table.csv"
            variants_df.to_csv(variants_path, index=False, encoding='utf-8')
        
        print(f"✓ PostgreSQL tables saved")
        return chunks_df
    
    def create_word_frequencies_table(self, chunks: List[Dict]):
        """Create word frequencies table for PostgreSQL"""
        print("\n📊 Creating word frequencies table...")
        
        word_freq_data = []
        
        for chunk in chunks:
            # Count words in normalized text
            words = chunk['normalized_text'].lower().split()
            word_counts = Counter(words)
            
            for word, count in word_counts.items():
                if len(word) > 2:  # Filter short words
                    word_freq_data.append({
                        'word': word,
                        'period': chunk['period'],
                        'genre': chunk['genre'],
                        'frequency': count,
                        'chunk_id': chunk['chunk_id']
                    })
        
        if word_freq_data:
            word_freq_df = pd.DataFrame(word_freq_data)
            word_freq_path = self.output_dir / "word_frequencies_table.csv"
            word_freq_df.to_csv(word_freq_path, index=False, encoding='utf-8')
            print(f"✓ Word frequencies table saved")
            return word_freq_df
        
        return pd.DataFrame()
    
    def create_linguistic_features_database(self, chunks: List[Dict]):
        """Create linguistic features database for PostgreSQL"""
        print("\n📊 Creating features database...")
        
        features_data = []
        
        for chunk in chunks:
            chunk_id = chunk['chunk_id']
            period = chunk['period']
            genre = chunk['genre']
            
            # POS features
            for pos, count in chunk['linguistic_features']['pos_distribution'].items():
                features_data.append({
                    'chunk_id': chunk_id,
                    'period': period,
                    'genre': genre,
                    'feature_type': 'pos',
                    'feature_name': pos,
                    'frequency': count,
                    'relative_frequency': count / chunk['linguistic_features']['total_tokens']
                })
            
            # Spelling variants
            for variant in chunk['spelling_variants']:
                features_data.append({
                    'chunk_id': chunk_id,
                    'period': period,
                    'genre': genre,
                    'feature_type': 'spelling_variant',
                    'feature_name': f"{variant['original']}→{variant['normalized']}",
                    'frequency': 1,
                    'relative_frequency': 1 / chunk['token_count']
                })
        
        # Save as CSV for PostgreSQL import
        features_df = pd.DataFrame(features_data)
        features_path = self.output_dir / "linguistic_features_db.csv"
        features_df.to_csv(features_path, index=False, encoding='utf-8')
        
        print(f"✓ Features database saved to {features_path}")
        return features_df
    
    def calculate_statistics(self, chunks: List[Dict]) -> Dict:
        """Calculate corpus statistics"""
        periods = defaultdict(int)
        genres = defaultdict(int)
        token_counts = []
        variant_counts = []
        
        for chunk in chunks:
            periods[chunk['period']] += 1
            genres[chunk['genre']] += 1
            token_counts.append(chunk['token_count'])
            variant_counts.append(len(chunk['spelling_variants']))
        
        return {
            'total_chunks': len(chunks),
            'period_distribution': dict(periods),
            'genre_distribution': dict(genres),
            'token_statistics': {
                'mean': float(np.mean(token_counts)),
                'median': float(np.median(token_counts)),
                'total': int(np.sum(token_counts))
            },
            'variant_statistics': {
                'mean_per_chunk': float(np.mean(variant_counts)),
                'total_variants': int(np.sum(variant_counts))
            }
        }
    
    def run_prepare_phase(self, chunk_size: int = 800):
        """Run the whole PREPARE phase and write every output file.

        Steps: build the chunks, export the CSV tables (chunks, spelling
        variants, word frequencies, linguistic features), dump the chunks to
        temporal_chunks.json and the statistics to prepare_statistics.json.

        Args:
            chunk_size: Passed through to ``create_temporal_chunks``.

        Returns:
            tuple: ``(chunks, features_df, stats)``.
        """
        print("🚀 Phase 2: PREPARE")
        print("=" * 40)

        out_dir = self.output_dir

        # 1. Chunks
        chunks = self.create_temporal_chunks(chunk_size)

        # 2-4. CSV exports; only the features table is returned to the caller.
        self.create_postgresql_tables(chunks)
        self.create_word_frequencies_table(chunks)
        features_df = self.create_linguistic_features_database(chunks)

        # 5. Chunks as JSON; default=str stringifies any value json cannot
        # encode natively.
        with open(out_dir / "temporal_chunks.json", 'w', encoding='utf-8') as handle:
            json.dump(chunks, handle, indent=2, ensure_ascii=False, default=str)

        # 6. Statistics as JSON
        stats = self.calculate_statistics(chunks)
        with open(out_dir / "prepare_statistics.json", 'w', encoding='utf-8') as handle:
            json.dump(stats, handle, indent=2, ensure_ascii=False)

        print(f"\n✅ PREPARE phase completed!")
        print(f"📊 {len(chunks)} chunks created")
        print(f"📁 Output: {out_dir}")
        print(f"📋 Ready for Phase 3: ACCESS")

        return chunks, features_df, stats

def main(input_dir: str = "/Users/rohan/Downloads/2544/processed_germanc",
         output_dir: str = "/Users/rohan/Downloads/2544/prepare_output"):
    """Run the PREPARE phase.

    Args:
        input_dir: Directory passed to ``GerManCPrepareProcessor`` as its
            input location. Defaults to the path this notebook was run with.
        output_dir: Directory passed to ``GerManCPrepareProcessor`` as its
            output location. Defaults to the path this notebook was run with.

    Returns:
        The ``(chunks, features_df, stats)`` tuple produced by
        ``run_prepare_phase``.
    """
    processor = GerManCPrepareProcessor(input_dir, output_dir)
    chunks, features_df, stats = processor.run_prepare_phase()

    return chunks, features_df, stats

# Entry point: run the PREPARE phase and bind its three results at module level
# so they stay available after the cell/script finishes.
if __name__ == "__main__":
    chunks, features_df, stats = main()

📚 Loading data...
✓ 774373 tokens, 336 documents
🚀 Phase 2: PREPARE

🔄 Creating temporal chunks (size: 800)...


Processing documents: 100%|███████████████████| 336/336 [01:09<00:00,  4.87it/s]


✓ Created 3607 chunks

💾 Creating PostgreSQL tables...
✓ PostgreSQL tables saved

📊 Creating word frequencies table...
✓ Word frequencies table saved

📊 Creating features database...
✓ Features database saved to /Users/rohan/Downloads/2544/prepare_output/linguistic_features_db.csv

✅ PREPARE phase completed!
📊 3607 chunks created
📁 Output: /Users/rohan/Downloads/2544/prepare_output
📋 Ready for Phase 3: ACCESS


In [13]:
#!/usr/bin/env python3
"""
Phase 3: ACCESS - Database Setup & Query Interface
Import PREPARE data into PostgreSQL and create REST API
"""

import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text
from pathlib import Path
import json
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from typing import List, Dict, Optional
from pydantic import BaseModel
import logging

class AccessPhaseSetup:
    """Create the corpus schema in PostgreSQL and load the PREPARE-phase CSVs.

    Instance attributes (set in ``__init__``): ``prepare_dir`` (directory with
    the CSV exports), ``db_config`` (connection settings), ``engine``
    (SQLAlchemy engine) and ``logger``.
    """

    # (target table, CSV file inside prepare_dir, label used in the progress message)
    _CSV_IMPORTS = (
        ('chunks', 'chunks_table.csv', 'chunks'),
        ('spelling_variants', 'spelling_variants_table.csv', 'spelling variants'),
        ('word_frequencies', 'word_frequencies_table.csv', 'word frequencies'),
        ('linguistic_features', 'linguistic_features_db.csv', 'linguistic features'),
    )

    def __init__(self, prepare_output_dir: str, db_config: Dict):
        """Store the configuration and open the database engine.

        Args:
            prepare_output_dir: Directory containing the PREPARE-phase CSV files.
            db_config: Mapping with 'user', 'password', 'host', 'port' and
                'database' keys.
        """
        self.prepare_dir = Path(prepare_output_dir)
        self.db_config = db_config
        self.engine = self.create_db_connection()

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _connection_url(db_config: Dict) -> str:
        """Build the PostgreSQL URL, percent-encoding user name and password.

        Without the encoding, a credential containing '@', ':' or '/' would be
        parsed as part of the URL structure instead of the credential.
        """
        from urllib.parse import quote

        user = quote(str(db_config['user']), safe='')
        password = quote(str(db_config['password']), safe='')
        return (
            f"postgresql://{user}:{password}"
            f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
        )

    def create_db_connection(self):
        """Create the SQLAlchemy engine described by ``self.db_config``."""
        return create_engine(self._connection_url(self.db_config))

    def _run_sql(self, sql: str) -> None:
        """Execute a (possibly multi-statement) SQL script and commit it."""
        with self.engine.connect() as conn:
            conn.execute(text(sql))
            conn.commit()

    def create_database_schema(self):
        """Create the four corpus tables if they do not exist yet."""
        print("📊 Creating database schema...")

        schema_sql = """
        -- Chunks table
        CREATE TABLE IF NOT EXISTS chunks (
            chunk_id VARCHAR(255) PRIMARY KEY,
            doc_id VARCHAR(255),
            period VARCHAR(10),
            genre VARCHAR(100),
            year INTEGER,
            filename VARCHAR(255),
            normalized_text TEXT,
            original_text TEXT,
            token_count INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        
        -- Spelling variants table
        CREATE TABLE IF NOT EXISTS spelling_variants (
            id SERIAL PRIMARY KEY,
            chunk_id VARCHAR(255) REFERENCES chunks(chunk_id),
            period VARCHAR(10),
            genre VARCHAR(100),
            original VARCHAR(255),
            normalized VARCHAR(255),
            pos VARCHAR(50),
            position_in_chunk INTEGER
        );
        
        -- Word frequencies table
        CREATE TABLE IF NOT EXISTS word_frequencies (
            id SERIAL PRIMARY KEY,
            word VARCHAR(255),
            period VARCHAR(10),
            genre VARCHAR(100),
            frequency INTEGER,
            chunk_id VARCHAR(255) REFERENCES chunks(chunk_id)
        );
        
        -- Linguistic features table
        CREATE TABLE IF NOT EXISTS linguistic_features (
            id SERIAL PRIMARY KEY,
            chunk_id VARCHAR(255) REFERENCES chunks(chunk_id),
            period VARCHAR(10),
            genre VARCHAR(100),
            feature_type VARCHAR(50),
            feature_name VARCHAR(255),
            frequency INTEGER,
            relative_frequency FLOAT
        );
        """

        self._run_sql(schema_sql)

        print("✓ Database schema created")

    def create_indexes(self):
        """Create the lookup and full-text indexes on the imported tables."""
        print("🚀 Creating indexes...")

        indexes_sql = """
        -- Temporal indexes
        CREATE INDEX IF NOT EXISTS idx_chunks_period ON chunks(period);
        CREATE INDEX IF NOT EXISTS idx_chunks_genre ON chunks(period, genre);
        CREATE INDEX IF NOT EXISTS idx_chunks_year ON chunks(year);
        
        -- Full-text search
        CREATE INDEX IF NOT EXISTS idx_chunks_text_gin ON chunks USING gin(to_tsvector('german', normalized_text));
        
        -- Spelling variants indexes
        CREATE INDEX IF NOT EXISTS idx_variants_period ON spelling_variants(period);
        CREATE INDEX IF NOT EXISTS idx_variants_original ON spelling_variants(original);
        CREATE INDEX IF NOT EXISTS idx_variants_normalized ON spelling_variants(normalized);
        
        -- Word frequency indexes
        CREATE INDEX IF NOT EXISTS idx_word_freq_word ON word_frequencies(word);
        CREATE INDEX IF NOT EXISTS idx_word_freq_period ON word_frequencies(word, period);
        
        -- Features indexes
        CREATE INDEX IF NOT EXISTS idx_features_type ON linguistic_features(feature_type);
        CREATE INDEX IF NOT EXISTS idx_features_period ON linguistic_features(period, feature_type);
        """

        self._run_sql(indexes_sql)

        print("✓ Indexes created")

    def drop_tables_cascade(self):
        """Drop the four corpus tables (dependents first) with CASCADE."""
        drop_sql = """
        DROP TABLE IF EXISTS linguistic_features CASCADE;
        DROP TABLE IF EXISTS word_frequencies CASCADE;
        DROP TABLE IF EXISTS spelling_variants CASCADE;
        DROP TABLE IF EXISTS chunks CASCADE;
        """

        self._run_sql(drop_sql)

    def import_data_tables(self):
        """Load each PREPARE-phase CSV into its table, replacing old contents.

        A CSV that is missing from ``prepare_dir`` is reported and skipped; its
        table is then absent, because all four tables are dropped first.

        NOTE(review): the tables are dropped and then re-created by
        ``to_sql(if_exists='replace')`` from the DataFrame's inferred dtypes,
        so the column types, primary keys and foreign keys declared in
        ``create_database_schema`` do not survive the import. Switching to
        ``if_exists='append'`` would keep them, but requires the CSV columns
        to match the declared schema — confirm before changing.
        """
        print("📥 Importing data...")

        # Drop existing tables first
        self.drop_tables_cascade()

        for table_name, csv_name, label in self._CSV_IMPORTS:
            csv_path = self.prepare_dir / csv_name
            if not csv_path.exists():
                # Surface the gap instead of silently leaving the table out.
                print(f"⚠️  Skipping {table_name}: {csv_path} not found")
                continue

            frame = pd.read_csv(csv_path)
            frame.to_sql(table_name, self.engine, if_exists='replace', index=False)
            print(f"✓ Imported {len(frame)} {label}")

    def setup_database(self):
        """Run schema creation, CSV import and index creation, in that order."""
        print("🗃️ Phase 3: ACCESS - Database Setup")
        print("=" * 40)

        self.create_database_schema()
        self.import_data_tables()
        self.create_indexes()

        print("✅ Database setup completed")

# API Models
# Request body accepted by POST /linguistic_analysis.
# (Documented with comments rather than a docstring so the generated schema is unchanged.)
class QueryRequest(BaseModel):
    query: str  # required field; the /linguistic_analysis handler in this file does not read it
    period: Optional[str] = None  # optional exact-match filter on the "period" column
    genre: Optional[str] = None  # optional exact-match filter on the "genre" column
    limit: Optional[int] = 100  # bound to the SQL LIMIT of the analysis query

# Word-evolution query parameters as a model.
# NOTE(review): no route visible in this section uses this model; the
# /evolution route takes the same three values as path parameters instead.
class EvolutionQuery(BaseModel):
    word: str
    start_period: str
    end_period: str

# FastAPI Application
class GerManCAPI:
    """FastAPI application exposing read-only queries over the corpus tables.

    Routes registered by ``setup_routes``:
      GET  /                                       service banner
      GET  /evolution/{word}/{start_period}/{end_period}
      POST /linguistic_analysis
      GET  /temporal_patterns
      GET  /search/{query}
    """

    def __init__(self, db_config: Dict):
        """Create the engine and the FastAPI app, then register routes and CORS.

        Args:
            db_config: Mapping with 'user', 'password', 'host', 'port' and
                'database' keys.
        """
        self.db_config = db_config
        self.engine = create_engine(self._connection_url(db_config))
        self.app = FastAPI(title="GerManC Historical Linguistics API", version="1.0.0")
        self.setup_routes()
        self.setup_cors()

    @staticmethod
    def _connection_url(db_config: Dict) -> str:
        """Build the PostgreSQL URL, percent-encoding user name and password.

        Without the encoding, a credential containing '@', ':' or '/' would be
        parsed as part of the URL structure instead of the credential.
        """
        from urllib.parse import quote

        user = quote(str(db_config['user']), safe='')
        password = quote(str(db_config['password']), safe='')
        return (
            f"postgresql://{user}:{password}"
            f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
        )

    @staticmethod
    def _optional_float(value):
        """Convert a numeric DB value to float; SQL NULL (None) stays None."""
        return None if value is None else float(value)

    def setup_cors(self):
        """Allow cross-origin requests from any origin, without credentials.

        Wildcard origins must not be combined with ``allow_credentials=True``:
        that combination lets any website issue credentialed requests. The API
        defines no cookie/auth handling, so credentials are switched off.
        """
        self.app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=False,
            allow_methods=["*"],
            allow_headers=["*"],
        )

    def setup_routes(self):
        """Register all API routes on ``self.app``.

        Every handler converts any exception into an HTTP 500 whose detail is
        the exception text.
        """

        @self.app.get("/")
        def root():
            return {"message": "GerManC Historical Linguistics API", "version": "1.0.0"}

        @self.app.get("/evolution/{word}/{start_period}/{end_period}")
        def query_word_evolution(word: str, start_period: str, end_period: str):
            """Track word evolution across periods"""
            try:
                # BETWEEN compares the period labels as strings.
                query = text("""
                    SELECT period, COUNT(*) as frequency, 
                           ARRAY_AGG(DISTINCT original) as variants
                    FROM spelling_variants 
                    WHERE (original = :word OR normalized = :word)
                    AND period BETWEEN :start_period AND :end_period
                    GROUP BY period 
                    ORDER BY period
                """)

                with self.engine.connect() as conn:
                    result = conn.execute(query, {
                        'word': word,
                        'start_period': start_period,
                        'end_period': end_period
                    })

                    evolution_data = [
                        {
                            'period': row.period,
                            'frequency': row.frequency,
                            'variants': row.variants
                        }
                        for row in result
                    ]

                    return {
                        'word': word,
                        'period_range': f"{start_period}-{end_period}",
                        'evolution': evolution_data
                    }

            except Exception as e:
                raise HTTPException(status_code=500, detail=str(e)) from e

        @self.app.post("/linguistic_analysis")
        def linguistic_analysis(request: QueryRequest):
            """Perform linguistic pattern analysis"""
            try:
                conditions = []
                params = {}

                if request.period:
                    conditions.append("period = :period")
                    params['period'] = request.period

                if request.genre:
                    conditions.append("genre = :genre")
                    params['genre'] = request.genre

                # Only fixed condition strings are interpolated; values are bound.
                where_clause = " AND ".join(conditions) if conditions else "1=1"

                query = text(f"""
                    SELECT feature_type, feature_name, 
                           SUM(frequency) as total_frequency,
                           AVG(relative_frequency) as avg_relative_frequency
                    FROM linguistic_features 
                    WHERE {where_clause}
                    GROUP BY feature_type, feature_name
                    ORDER BY total_frequency DESC
                    LIMIT :limit
                """)

                params['limit'] = request.limit

                with self.engine.connect() as conn:
                    result = conn.execute(query, params)

                    analysis_data = [
                        {
                            'feature_type': row.feature_type,
                            'feature_name': row.feature_name,
                            'total_frequency': row.total_frequency,
                            # AVG() is NULL when every input value is NULL
                            'avg_relative_frequency': self._optional_float(row.avg_relative_frequency)
                        }
                        for row in result
                    ]

                    return {
                        'filters': {
                            'period': request.period,
                            'genre': request.genre
                        },
                        'results': analysis_data
                    }

            except Exception as e:
                raise HTTPException(status_code=500, detail=str(e)) from e

        @self.app.get("/temporal_patterns")
        def temporal_patterns():
            """Get temporal distribution patterns"""
            try:
                query = text("""
                    SELECT period, genre, COUNT(*) as chunk_count,
                           AVG(token_count) as avg_token_count
                    FROM chunks
                    GROUP BY period, genre
                    ORDER BY period, genre
                """)

                with self.engine.connect() as conn:
                    result = conn.execute(query)

                    patterns = [
                        {
                            'period': row.period,
                            'genre': row.genre,
                            'chunk_count': row.chunk_count,
                            # AVG() is NULL when every input value is NULL
                            'avg_token_count': self._optional_float(row.avg_token_count)
                        }
                        for row in result
                    ]

                    return {'temporal_patterns': patterns}

            except Exception as e:
                raise HTTPException(status_code=500, detail=str(e)) from e

        @self.app.get("/search/{query}")
        def full_text_search(query: str, period: Optional[str] = None, limit: int = 50):
            """Full-text search in historical texts"""
            try:
                conditions = ["to_tsvector('german', normalized_text) @@ plainto_tsquery('german', :query)"]
                params = {'query': query, 'limit': limit}

                if period:
                    conditions.append("period = :period")
                    params['period'] = period

                where_clause = " AND ".join(conditions)

                search_query = text(f"""
                    SELECT chunk_id, period, genre, 
                           ts_headline('german', normalized_text, plainto_tsquery('german', :query)) as highlighted_text,
                           ts_rank(to_tsvector('german', normalized_text), plainto_tsquery('german', :query)) as rank
                    FROM chunks
                    WHERE {where_clause}
                    ORDER BY rank DESC
                    LIMIT :limit
                """)

                with self.engine.connect() as conn:
                    result = conn.execute(search_query, params)

                    search_results = [
                        {
                            'chunk_id': row.chunk_id,
                            'period': row.period,
                            'genre': row.genre,
                            'highlighted_text': row.highlighted_text,
                            'relevance_score': float(row.rank)
                        }
                        for row in result
                    ]

                    return {
                        'query': query,
                        'period_filter': period,
                        'results': search_results
                    }

            except Exception as e:
                raise HTTPException(status_code=500, detail=str(e)) from e

    def run(self, host: str = "127.0.0.1", port: int = 8000):
        """Start the uvicorn server for ``self.app``.

        ``nest_asyncio`` is applied first so the server can be started from an
        environment that already runs an event loop (e.g. a notebook).
        """
        import nest_asyncio
        nest_asyncio.apply()
        uvicorn.run(self.app, host=host, port=port)

def main(start_api: bool = False):
    """Run Phase 3: ACCESS setup, optionally followed by the API server.

    Connection settings default to the values this notebook was run with and
    can be overridden through the GERMANC_DB_HOST, GERMANC_DB_PORT,
    GERMANC_DB_NAME, GERMANC_DB_USER and GERMANC_DB_PASSWORD environment
    variables, so credentials do not have to be edited into the source.

    Args:
        start_api: When True, start the REST API after the database is set
            up (this call blocks while the server runs). Defaults to False,
            which only sets up the database.
    """
    import os  # local import: this cell's import block does not bring in os

    # Database configuration (environment overrides, original values as defaults)
    db_config = {
        'host': os.environ.get('GERMANC_DB_HOST', 'localhost'),
        'port': int(os.environ.get('GERMANC_DB_PORT', 5432)),
        'database': os.environ.get('GERMANC_DB_NAME', 'germanc_corpus'),
        'user': os.environ.get('GERMANC_DB_USER', 'rohan'),
        'password': os.environ.get('GERMANC_DB_PASSWORD', 1996),
    }

    prepare_output_dir = "/Users/rohan/Downloads/2544/prepare_output"

    # Setup database
    setup = AccessPhaseSetup(prepare_output_dir, db_config)
    setup.setup_database()

    print("\n✅ Phase 3: ACCESS completed!")
    print("📊 Database ready with tables:")
    print("  - chunks")
    print("  - spelling_variants")
    print("  - word_frequencies")
    print("  - linguistic_features")
    print("\n🔄 Ready for Phase 4: RAG")

    if not start_api:
        return

    # Optional: start the API server
    print("\n🚀 Starting API server...")
    api = GerManCAPI(db_config)

    print("📡 API Endpoints:")
    print("  GET  /evolution/{word}/{start_period}/{end_period}")
    print("  POST /linguistic_analysis")
    print("  GET  /temporal_patterns")
    print("  GET  /search/{query}")
    print(f"\n🌐 Server running at: http://localhost:8000")
    print("📖 API docs at: http://localhost:8000/docs")

    api.run()

# Entry point: run the ACCESS-phase database setup when executed as a script/cell.
if __name__ == "__main__":
    main()

🗃️ Phase 3: ACCESS - Database Setup
📊 Creating database schema...
✓ Database schema created
📥 Importing data...
✓ Imported 3607 chunks


  key: (


✓ Imported 48390 spelling variants
✓ Imported 193019 word frequencies
✓ Imported 131405 linguistic features
🚀 Creating indexes...
✓ Indexes created
✅ Database setup completed

✅ Phase 3: ACCESS completed!
📊 Database ready with tables:
  - chunks
  - spelling_variants
  - word_frequencies
  - linguistic_features

🔄 Ready for Phase 4: RAG


In [8]:
! pip3 install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.0-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.0


In [1]:
! pip install sentence-transformers chromadb langchain transformers torch

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting chromadb
  Downloading chromadb-1.0.13-cp39-abi3-macosx_10_12_x86_64.whl.metadata (7.0 kB)
Collecting langchain
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch
  Downloading torch-2.2.2-cp312-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (8.4 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-5.3.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chroma

In [14]:
#!/usr/bin/env python3
"""
Phase 4: RAG Pipeline for German Historical Language Evolution
================================================================

This script builds a Retrieval-Augmented Generation (RAG) system that:
1. Loads chunks from PostgreSQL database
2. Creates embeddings using sentence-transformers
3. Stores embeddings in ChromaDB vector database
4. Provides semantic search and question-answering capabilities

Requirements:
- PostgreSQL with German corpus data (from Phase 3)
- sentence-transformers, chromadb, langchain
"""

import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
import json
import logging
from datetime import datetime

# Database and vector store
import psycopg2
from sqlalchemy import create_engine, text
import chromadb
from chromadb.config import Settings

# Embeddings and LLM
from sentence_transformers import SentenceTransformer
import openai
from transformers import pipeline

# LangChain components
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class GermanRAGPipeline:
    """RAG Pipeline for German Historical Corpus"""
    
    def __init__(self, db_config: Dict[str, Any], vector_db_path: str = "./chroma_db",
                 embedding_model_name: str = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
        """
        Initialize RAG Pipeline

        Args:
            db_config: PostgreSQL database configuration with 'user',
                'password', 'host', 'port' and 'database' keys
            vector_db_path: Path to ChromaDB vector database (created,
                including missing parent directories, if absent)
            embedding_model_name: sentence-transformers model used both for
                the direct encoder and for the LangChain embeddings wrapper,
                so the two always agree
        """
        from urllib.parse import quote

        self.db_config = db_config
        self.vector_db_path = Path(vector_db_path)
        # parents=True so a nested path such as "data/vectors/chroma" works too
        self.vector_db_path.mkdir(parents=True, exist_ok=True)

        # Database connection; credentials are percent-encoded so characters
        # such as '@', ':' or '/' in a password cannot break the URL.
        db_user = quote(str(db_config['user']), safe='')
        db_password = quote(str(db_config['password']), safe='')
        self.engine = create_engine(
            f"postgresql://{db_user}:{db_password}@"
            f"{db_config['host']}:{db_config['port']}/{db_config['database']}"
        )

        # Initialize embedding model (the default is a multilingual model)
        self.embedding_model_name = embedding_model_name
        print("🔄 Loading German sentence transformer model...")
        self.embedding_model = SentenceTransformer(embedding_model_name)

        # Initialize ChromaDB
        self.chroma_client = chromadb.PersistentClient(path=str(self.vector_db_path))
        self.collection_name = "german_corpus_chunks"

        # LangChain embeddings wrapper around the same model
        self.langchain_embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model_name
        )

        # Populated later by setup_langchain_vectorstore / setup_qa_chain
        self.vectorstore = None
        self.qa_chain = None
        
    def load_chunks_from_db(self, limit: Optional[int] = None) -> pd.DataFrame:
        """Load text chunks from PostgreSQL database

        Only chunks whose normalized text is non-NULL and longer than 50
        characters are returned, ordered by period, document and chunk id.

        Args:
            limit: Maximum number of rows to load. A falsy value (None or 0)
                loads every matching row.

        Returns:
            DataFrame with columns chunk_id, text, period, word_count,
            char_count, document_id, chunk_index, year, genre, filename.

        Raises:
            ValueError/TypeError: if ``limit`` is truthy but not convertible
                to an integer.
        """

        print("📚 Loading chunks from PostgreSQL...")

        # Based on the actual table structure:
        # ['token_count', 'year', 'period', 'genre', 'chunk_id', 'filename', 'normalized_text', 'original_text', 'doc_id']

        query = """
        SELECT 
            c.chunk_id,
            c.normalized_text as text,
            c.period,
            c.token_count as word_count,
            LENGTH(c.normalized_text) as char_count,
            c.doc_id as document_id,
            ROW_NUMBER() OVER (PARTITION BY c.doc_id ORDER BY c.chunk_id) as chunk_index,
            c.year,
            c.genre,
            c.filename
        FROM chunks c
        WHERE c.normalized_text IS NOT NULL 
        AND LENGTH(c.normalized_text) > 50
        ORDER BY c.period, c.doc_id, c.chunk_id
        """

        # The limit is passed as a bound parameter instead of being formatted
        # into the SQL text.
        params = None
        if limit:
            query += " LIMIT :limit"
            params = {'limit': int(limit)}

        with self.engine.connect() as conn:
            df = pd.read_sql(text(query), conn, params=params)

        print(f"✅ Loaded {len(df)} chunks from database")
        print(f"📊 Periods: {df['period'].unique()}")
        print(f"📊 Genres: {df['genre'].unique()}")
        return df
    
    def create_embeddings(self, chunks_df: pd.DataFrame, batch_size: int = 32) -> None:
        """Create embeddings for text chunks and store in ChromaDB"""
        
        print("🧠 Creating embeddings for text chunks...")
        
        # Remove duplicates based on chunk_id
        original_len = len(chunks_df)
        chunks_df = chunks_df.drop_duplicates(subset=['chunk_id'], keep='first')
        deduped_len = len(chunks_df)
        
        if original_len != deduped_len:
            print(f"⚠️  Removed {original_len - deduped_len} duplicate chunk IDs")
        
        # Create or get collection
        try:
            collection = self.chroma_client.get_collection(self.collection_name)
            print(f"📦 Found existing collection with {collection.count()} documents")
            
            # Check if we need to add new chunks
            existing_ids = set(collection.get()['ids'])
            new_chunks = chunks_df[~chunks_df['chunk_id'].astype(str).isin(existing_ids)]
            
            if len(new_chunks) == 0:
                print("✅ All chunks already embedded")
                return
            else:
                print(f"🔄 Adding {len(new_chunks)} new chunks")
                chunks_df = new_chunks
                
        except Exception:
            collection = self.chroma_client.create_collection(
                name=self.collection_name,
                metadata={"description": "German historical corpus text chunks"}
            )
            print("📦 Created new ChromaDB collection")
        
        # Process chunks in batches
        total_chunks = len(chunks_df)
        
        for i in range(0, total_chunks, batch_size):
            batch = chunks_df.iloc[i:i+batch_size]
            
            print(f"Processing batch {i//batch_size + 1}/{(total_chunks-1)//batch_size + 1}")
            
            # Prepare texts and metadata
            texts = batch['text'].tolist()
            
            # Ensure unique IDs by adding row index if needed
            chunk_ids = []
            for idx, row in batch.iterrows():
                base_id = str(row['chunk_id'])
                # Add row index to ensure uniqueness
                unique_id = f"{base_id}_{idx}"
                chunk_ids.append(unique_id)
            
            # Create embeddings
            embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
            
            # Prepare metadata
            metadatas = []
            for _, row in batch.iterrows():
                metadata = {
                    'period': str(row['period']),
                    'document_id': str(row['document_id']),
                    'chunk_index': int(row['chunk_index']),
                    'word_count': int(row['word_count']),
                    'char_count': int(row['char_count']),
                    'year': str(row.get('year', '')),
                    'genre': str(row.get('genre', '')),
                    'filename': str(row.get('filename', '')),
                    'original_chunk_id': str(row['chunk_id'])  # Keep original ID in metadata
                }
                metadatas.append(metadata)
            
            # Add to ChromaDB
            collection.add(
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas,
                ids=chunk_ids
            )
        
        print(f"✅ Created embeddings for {total_chunks} chunks")
        print(f"📊 Total documents in collection: {collection.count()}")
    
    def setup_langchain_vectorstore(self) -> None:
        """Create the LangChain ``Chroma`` wrapper and store it on ``self.vectorstore``.

        The wrapper is pointed at the same persist directory and collection
        name this pipeline uses, with the LangChain embeddings object as its
        embedding function.
        """

        print("🔗 Setting up LangChain vectorstore...")

        store_options = {
            'persist_directory': str(self.vector_db_path),
            'embedding_function': self.langchain_embeddings,
            'collection_name': self.collection_name,
        }
        self.vectorstore = Chroma(**store_options)

        print("✅ LangChain vectorstore ready")
    
    def setup_qa_chain(self, llm_provider: str = "simple") -> None:
        """Setup QA chain with LLM"""
        
        print(f"🤖 Setting up QA chain with {llm_provider}...")
        
        if llm_provider == "openai":
            # OpenAI GPT (requires API key)
            if not os.getenv("OPENAI_API_KEY"):
                raise ValueError("OPENAI_API_KEY environment variable required")
            llm = OpenAI(temperature=0.1, max_tokens=1000)
            
        elif llm_provider == "simple":
            # Simple text-based retrieval without LLM generation
            print("⚠️  Using simple retrieval mode (no LLM generation)")
            self.qa_chain = None  # We'll handle this differently
            return
            
        elif llm_provider == "huggingface":
            # Try a smaller, safer model
            from langchain_community.llms import HuggingFacePipeline
            
            try:
                # Use a smaller model that should work
                hf_pipeline = pipeline(
                    "text-generation",
                    model="distilgpt2",  # Smaller, more compatible model
                    max_length=256,
                    temperature=0.1,
                    do_sample=True,
                    pad_token_id=50256
                )
                llm = HuggingFacePipeline(pipeline=hf_pipeline)
            except Exception as e:
                print(f"⚠️  HuggingFace model loading failed: {e}")
                print("Falling back to simple retrieval mode")
                self.qa_chain = None
                return
        
        else:
            raise ValueError(f"Unsupported LLM provider: {llm_provider}")
        
        # Create retrieval QA chain
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True
        )
        
        print("✅ QA chain ready")
    
    def semantic_search(self, query: str, k: int = 5, period_filter: Optional[str] = None) -> List[Dict]:
        """Perform semantic search on the corpus"""
        
        collection = self.chroma_client.get_collection(self.collection_name)
        
        # Build where clause for filtering
        where_clause = None
        if period_filter:
            where_clause = {"period": period_filter}
        
        # Perform search
        results = collection.query(
            query_texts=[query],
            n_results=k,
            where=where_clause
        )
        
        # Format results
        formatted_results = []
        for i in range(len(results['documents'][0])):
            result = {
                'text': results['documents'][0][i],
                'metadata': results['metadatas'][0][i],
                'distance': results['distances'][0][i] if 'distances' in results else None,
                'chunk_id': results['ids'][0][i]
            }
            formatted_results.append(result)
        
        return formatted_results
    
    def ask_question(self, question: str, period_filter: Optional[str] = None, k: int = 5) -> Dict[str, Any]:
        """Answer a question against the corpus, with or without an LLM.

        When ``self.qa_chain`` is None the method falls back to plain
        retrieval: it runs ``semantic_search`` and joins previews of the top
        three hits into the answer. Otherwise the question is handed to the
        QA chain; if ``period_filter`` is given, a temporary ``RetrievalQA``
        chain with a period-filtered retriever is built for this one call.

        Args:
            question: The question text; also used as the retrieval query.
            period_filter: Optional period label. When set, retrieval is
                restricted with the metadata filter ``{"period": period_filter}``.
            k: Number of chunks to retrieve. Applies to the retrieval
                fallback and to the period-filtered retriever. The unfiltered
                LLM path uses ``self.qa_chain`` as already configured, so
                ``k`` has no effect there.

        Returns:
            A dict with the keys ``question``, ``answer``,
            ``source_documents`` (a list of ``{'content', 'metadata'}``
            dicts) and ``mode`` (``'simple_retrieval'`` or
            ``'llm_generation'``).
        """

        if self.qa_chain is None:
            # Simple retrieval mode - return relevant chunks without LLM generation
            print("Using simple retrieval mode (no LLM generation)")

            results = self.semantic_search(question, k=k, period_filter=period_filter)

            # Preview the three best hits. The ellipsis is a truncation
            # marker, so it is only added when text was actually cut off.
            preview_limit = 200
            answer_parts = []
            for position, hit in enumerate(results[:3], start=1):
                chunk_text = hit['text']
                preview = chunk_text[:preview_limit]
                if len(chunk_text) > preview_limit:
                    preview += "..."
                answer_parts.append(f"[Chunk {position}]: {preview}")

            return {
                'question': question,
                'answer': "\n\n".join(answer_parts),
                'source_documents': [
                    {
                        'content': hit['text'],
                        'metadata': hit['metadata']
                    }
                    for hit in results
                ],
                'mode': 'simple_retrieval'
            }

        if period_filter:
            # The configured chain's retriever is not period-aware, so build
            # a one-off chain around a filtered retriever for this question.
            filtered_retriever = self.vectorstore.as_retriever(
                search_kwargs={
                    "k": k,
                    "filter": {"period": period_filter}
                }
            )

            from langchain.chains import RetrievalQA
            temp_qa_chain = RetrievalQA.from_chain_type(
                # Reuse the LLM already attached to the configured chain.
                llm=self.qa_chain.combine_documents_chain.llm_chain.llm,
                chain_type="stuff",
                retriever=filtered_retriever,
                return_source_documents=True
            )
            result = temp_qa_chain({"query": question})
        else:
            result = self.qa_chain({"query": question})

        return {
            'question': question,
            'answer': result['result'],
            'source_documents': [
                {
                    'content': doc.page_content,
                    'metadata': doc.metadata
                }
                for doc in result['source_documents']
            ],
            'mode': 'llm_generation'
        }
    
    def analyze_language_evolution(self, word: str, periods: List[str] = None) -> Dict[str, Any]:
        """Collect usage examples of a word per time period and summarise them.

        For every period one semantic search (top 3 hits) is run with a fixed
        German query about the word's usage. The summary comes from
        ``ask_question`` when a QA chain is configured; otherwise it is a
        plain count of the retrieved contexts.

        Args:
            word: The word or concept to trace.
            periods: Period labels to inspect. A missing or empty value falls
                back to a built-in list of five periods.
                NOTE(review): each label is passed verbatim as the search's
                period filter - confirm the defaults match the period values
                actually stored in the collection.

        Returns:
            A dict with ``word``, ``periods`` (label -> ``examples`` and
            ``context_count``) and ``summary``.
        """

        periods = periods if periods else [
            '1050-1350', '1350-1650', '1650-1800', '1800-1900', '1900-2000'
        ]

        # The retrieval query is the same for every period; only the filter varies.
        usage_query = f"Verwendung des Wortes '{word}'"

        findings_by_period = {}
        for label in periods:
            hits = self.semantic_search(usage_query, k=3, period_filter=label)
            findings_by_period[label] = {
                'examples': hits,
                'context_count': len(hits)
            }

        if self.qa_chain:
            # Let the LLM-backed chain describe the development over time.
            summary_question = f"Wie hat sich das Wort '{word}' in der deutschen Sprache über die Zeit entwickelt?"
            summary_text = self.ask_question(summary_question)['answer']
        else:
            # No LLM available: report how many contexts were found.
            total_contexts = sum(len(entry['examples']) for entry in findings_by_period.values())
            summary_text = f"Found {total_contexts} contexts for '{word}' across {len(periods)} time periods."

        return {
            'word': word,
            'periods': findings_by_period,
            'summary': summary_text
        }
    
    def get_statistics(self) -> Dict[str, Any]:
        """Gather summary figures from PostgreSQL and the ChromaDB collection.

        Three queries run against the ``chunks`` table on one connection:
        overall aggregates, chunk counts per period, and chunk counts per
        genre. The embedding count is then read from the Chroma collection;
        if that lookup raises, a zero count with the name ``'Not created'``
        is reported instead of propagating the error.

        Returns:
            A dict with ``database_stats`` (``chunks``,
            ``period_distribution``, ``genre_distribution``),
            ``vector_stats`` and ``model_info``.
        """

        with self.engine.connect() as connection:
            # Overall aggregates (the averaged column is token_count).
            totals_sql = text("""
                SELECT 
                    COUNT(*) as total_chunks,
                    AVG(token_count) as avg_word_count,
                    MIN(period) as earliest_period,
                    MAX(period) as latest_period,
                    COUNT(DISTINCT period) as period_count,
                    COUNT(DISTINCT genre) as genre_count
                FROM chunks
            """)
            totals_row = connection.execute(totals_sql).fetchone()
            database_summary = {'chunks': dict(totals_row._mapping)}

            # Chunks per period, ordered by period label.
            per_period_sql = text("""
                SELECT period, COUNT(*) as chunk_count
                FROM chunks
                GROUP BY period
                ORDER BY period
            """)
            per_period_rows = connection.execute(per_period_sql).fetchall()
            database_summary['period_distribution'] = [
                dict(entry._mapping) for entry in per_period_rows
            ]

            # Chunks per genre, largest genre first.
            per_genre_sql = text("""
                SELECT genre, COUNT(*) as chunk_count
                FROM chunks
                GROUP BY genre
                ORDER BY chunk_count DESC
            """)
            per_genre_rows = connection.execute(per_genre_sql).fetchall()
            database_summary['genre_distribution'] = [
                dict(entry._mapping) for entry in per_genre_rows
            ]

        # Vector store figures are best-effort: any failure yields placeholders.
        try:
            chroma_collection = self.chroma_client.get_collection(self.collection_name)
            vector_summary = dict(
                total_embeddings=chroma_collection.count(),
                collection_name=self.collection_name,
            )
        except Exception:
            vector_summary = dict(total_embeddings=0, collection_name='Not created')

        model_summary = {
            'embedding_model': 'paraphrase-multilingual-MiniLM-L12-v2',
            'vector_db_path': str(self.vector_db_path)
        }

        return {
            'database_stats': database_summary,
            'vector_stats': vector_summary,
            'model_info': model_summary
        }

def main():
    """Build the Phase 4 RAG pipeline end to end and smoke-test it.

    Loads up to 1000 chunks from PostgreSQL, embeds them into ChromaDB, wires
    up the LangChain vector store and the QA chain in "simple" mode, then
    runs a semantic search, a question and a word-evolution analysis and
    prints the results together with corpus statistics.

    Returns:
        The initialised ``GermanRAGPipeline`` instance.

    Raises:
        Exception: any error from the pipeline steps is logged and re-raised.
    """

    print("🗃️ Phase 4: RAG Pipeline for German Historical Corpus")
    print("=" * 60)

    # Connection settings for the local PostgreSQL instance (adjust as needed)
    connection_settings = dict(
        host='localhost',
        port=5432,
        database='germanc_corpus',
        user='rohan',
        password='',
    )

    rag = GermanRAGPipeline(connection_settings, vector_db_path="./german_corpus_vectordb")

    try:
        # Steps 1-2: pull a 1000-chunk sample and embed it into ChromaDB
        sample_chunks = rag.load_chunks_from_db(limit=1000)
        rag.create_embeddings(sample_chunks)

        # Steps 3-4: LangChain vector store, then the QA chain in simple
        # mode (avoids the PyTorch issues of the LLM-backed providers)
        rag.setup_langchain_vectorstore()
        rag.setup_qa_chain(llm_provider="simple")

        # Step 5: exercise the system
        print("\n🧪 Testing RAG system...")

        # 5.1 Semantic search
        print("\n1. Semantic Search Test:")
        search_results = rag.semantic_search("deutsche Sprache mittelalter", k=3)
        for i, result in enumerate(search_results, start=1):
            print(f"   {i}. Period: {result['metadata']['period']}")
            print(f"      Text: {result['text'][:100]}...")
            print()

        # 5.2 Question answering
        print("\n2. Question Answering Test:")
        qa_result = rag.ask_question("Wie entwickelte sich die deutsche Sprache im Mittelalter?")
        print(f"   Question: {qa_result['question']}")
        print(f"   Answer: {qa_result['answer'][:200]}...")
        print()

        # 5.3 Word evolution across periods
        print("\n3. Language Evolution Test:")
        evolution = rag.analyze_language_evolution("deutsch")
        print(f"   Analyzing word: {evolution['word']}")
        for period, data in evolution['periods'].items():
            print(f"   {period}: {data['context_count']} contexts found")

        # Corpus and vector store figures
        print("\n📊 System Statistics:")
        stats = rag.get_statistics()
        print(f"   Total chunks in DB: {stats['database_stats']['chunks']['total_chunks']}")
        print(f"   Total embeddings: {stats['vector_stats']['total_embeddings']}")
        print(f"   Time periods: {stats['database_stats']['chunks']['period_count']}")

        print("\n✅ Phase 4: RAG Pipeline completed successfully!")
        print("\n🎯 Available capabilities:")
        capability_lines = (
            "   - Semantic search across historical German texts",
            "   - Question answering about language evolution",
            "   - Period-specific analysis",
            "   - Language change tracking",
        )
        for capability_line in capability_lines:
            print(capability_line)

        return rag

    except Exception as e:
        # Record the failure, then let the caller see the original exception.
        logger.error(f"Error in RAG pipeline: {e}")
        raise

if __name__ == "__main__":
    # Required packages - install these before running:
    #   pip install sentence-transformers chromadb langchain langchain-community langchain-huggingface openai transformers torch

    # Run the full Phase 4 setup and keep the pipeline for interactive use.
    rag_system = main()

2025-06-21 00:14:34,645 - INFO - Use pytorch device_name: mps
2025-06-21 00:14:34,646 - INFO - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


🗃️ Phase 4: RAG Pipeline for German Historical Corpus
🔄 Loading German sentence transformer model...


2025-06-21 00:14:38,493 - INFO - Use pytorch device_name: mps
2025-06-21 00:14:38,493 - INFO - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


📚 Loading chunks from PostgreSQL...
✅ Loaded 1000 chunks from database
📊 Periods: ['1650-1700']
📊 Genres: ['Drama' 'Humanities' 'Legal' 'Narrative' 'Newspapers' 'Scientific']
🧠 Creating embeddings for text chunks...
⚠️  Removed 98 duplicate chunk IDs
📦 Found existing collection with 902 documents
🔄 Adding 902 new chunks
Processing batch 1/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 2/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 3/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 4/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 5/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 6/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 7/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 8/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 9/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 10/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 11/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 12/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 13/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 14/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 15/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 16/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 17/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 18/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 19/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 20/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 21/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 22/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 23/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 24/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 25/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 26/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 27/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 28/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing batch 29/29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings for 902 chunks
📊 Total documents in collection: 902
🔗 Setting up LangChain vectorstore...
✅ LangChain vectorstore ready
🤖 Setting up QA chain with simple...
⚠️  Using simple retrieval mode (no LLM generation)

🧪 Testing RAG system...

1. Semantic Search Test:
   1. Period: 1650-1700
      Text: Römischen Reichs Müntz-Ordnung / Anno Fünfzehen hundert Fünfzig neun auffgericht / und hernacher auf...

   2. Period: 1650-1700
      Text: DAs freimachen soll mit Vorwissen des BergVoigts geschehen / und soll der Freimacher mit ein oder zw...

   3. Period: 1650-1700
      Text: eine Pfeif von dem Stengel hellebori oder Christwurtz gemacht / Kuriere mit ihrem sono die lymphatic...


2. Question Answering Test:
Using simple retrieval mode (no LLM generation)
   Question: Wie entwickelte sich die deutsche Sprache im Mittelalter?
   Answer: [Chunk 1]: Römischen Reichs Müntz-Ordnung / Anno Fünfzehen hundert Fünfzig neun auffgericht / und hernacher auf etlichen Reichs-Tägen / s