# Organising the files

In [1]:
import os
import re
import shutil
from pathlib import Path
from collections import defaultdict

class GerManCOrganizer:
    def __init__(self, source_dir, output_dir):
        self.source_dir = Path(source_dir)
        self.output_dir = Path(output_dir)
        
        # Genre mapping from filename prefixes
        self.genres = {
            'DRAM': 'Drama',
            'HUMA': 'Humanities', 
            'LEGA': 'Legal',
            'NARR': 'Narrative',
            'NEWS': 'Newspapers',
            'SCIE': 'Scientific',
            'SERM': 'Sermons'
        }
        
        # Time period bins
        self.periods = {
            'P1': '1650-1700',
            'P2': '1700-1750', 
            'P3': '1750-1800'
        }
        
    def extract_file_info(self, filename):
        """Extract genre, period, and year from filename"""
        # Pattern: GENRE_PERIOD_REGION_YEAR_TITLE.xml
        # Example: DRAM_P1_NoD_1673_Leonilda.xml
        
        pattern = r'([A-Z]{4})_([P][1-3])_([A-Za-z]+)_(\d{4})_(.+)\.xml'
        match = re.match(pattern, filename)
        
        if not match:
            return None
            
        genre_code, period_code, region, year, title = match.groups()
        
        return {
            'genre': self.genres.get(genre_code, 'Unknown'),
            'genre_code': genre_code,
            'period': self.periods.get(period_code, 'Unknown'),
            'period_code': period_code,
            'region': region,
            'year': int(year),
            'title': title,
            'filename': filename
        }
    
    def organize_files(self):
        """Main function to organize all LING-GATE files"""
        
        # Create output directory structure
        self.create_directory_structure()
        
        # Stats tracking
        stats = defaultdict(lambda: defaultdict(int))
        processed_files = []
        error_files = []
        
        # Process each XML file
        for xml_file in self.source_dir.glob("*.xml"):
            try:
                file_info = self.extract_file_info(xml_file.name)
                
                if file_info:
                    # Create destination path
                    dest_dir = self.output_dir / file_info['period'] / file_info['genre']
                    dest_dir.mkdir(parents=True, exist_ok=True)
                    
                    # Copy file to organized location
                    dest_path = dest_dir / xml_file.name
                    shutil.copy2(xml_file, dest_path)
                    
                    # Update stats
                    stats[file_info['period']][file_info['genre']] += 1
                    processed_files.append(file_info)
                    
                    print(f"✓ {xml_file.name} → {file_info['period']}/{file_info['genre']}")
                    
                else:
                    error_files.append(xml_file.name)
                    print(f"✗ Could not parse: {xml_file.name}")
                    
            except Exception as e:
                error_files.append(xml_file.name)
                print(f"✗ Error processing {xml_file.name}: {e}")
        
        # Generate summary report
        self.generate_report(stats, processed_files, error_files)
        
        return stats, processed_files, error_files
    
    def create_directory_structure(self):
        """Create organized directory structure"""
        for period in self.periods.values():
            for genre in self.genres.values():
                dir_path = self.output_dir / period / genre
                dir_path.mkdir(parents=True, exist_ok=True)
                
    def generate_report(self, stats, processed_files, error_files):
        """Generate organization summary report"""
        
        report_path = self.output_dir / "organization_report.txt"
        
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("GerManC Corpus Organization Report\n")
            f.write("=" * 40 + "\n\n")
            
            # Summary statistics
            f.write(f"Total files processed: {len(processed_files)}\n")
            f.write(f"Total files with errors: {len(error_files)}\n\n")
            
            # Files by period and genre
            f.write("Files by Period and Genre:\n")
            f.write("-" * 30 + "\n")
            
            for period in sorted(stats.keys()):
                f.write(f"\n{period}:\n")
                for genre in sorted(stats[period].keys()):
                    count = stats[period][genre]
                    f.write(f"  {genre}: {count} files\n")
            
            # Year distribution
            f.write("\n\nYear Distribution:\n")
            f.write("-" * 20 + "\n")
            years = [info['year'] for info in processed_files]
            if years:
                f.write(f"Earliest: {min(years)}\n")
                f.write(f"Latest: {max(years)}\n")
                f.write(f"Range: {max(years) - min(years)} years\n")
            
            # Error files
            if error_files:
                f.write("\n\nFiles with errors:\n")
                f.write("-" * 20 + "\n")
                for error_file in error_files:
                    f.write(f"  {error_file}\n")
        
        print(f"\n📊 Report saved to: {report_path}")

def main():
    """Example usage"""
    
    # Set your paths here
    source_directory = "/Users/rohan/Downloads/2544/LING-GATE/"  # Where your XML files are
    output_directory = "/Users/rohan/Downloads/2544/organized_germanc"                # Where to create organized structure
    
    # Create organizer and run
    organizer = GerManCOrganizer(source_directory, output_directory)
    stats, processed, errors = organizer.organize_files()
    
    # Print summary
    print(f"\n🎉 Organization complete!")
    print(f"📁 Organized {len(processed)} files")
    print(f"❌ {len(errors)} errors")
    print(f"📊 Check organization_report.txt for details")

if __name__ == "__main__":
    main()

✓ SERM_P1_WOD_1660_LeichPredigt.xml → 1650-1700/Sermons
✓ SERM_P3_OOD_1792_Sonntagen.xml → 1750-1800/Sermons
✓ LEGA_P1_WOD_1654_HoffgerichtsOrdnung.xml → 1650-1700/Legal
✓ SCIE_P2_WOD_1741_Erden.xml → 1700-1750/Scientific
✓ HUMA_P1_NoD_1667_Ratseburg.xml → 1650-1700/Humanities
✓ LEGA_P3_NoD_1751_FeuerOrdnung.xml → 1750-1800/Legal
✓ DRAM_P1_OOD_1675_Pirrus.xml → 1650-1700/Drama
✓ SCIE_P1_OOD_1681_CometenGespoetts.xml → 1650-1700/Scientific
✓ NARR_P1_OMD_1671_Ruebezahl.xml → 1650-1700/Narrative
✓ HUMA_P1_WMD_1692_Christus.xml → 1650-1700/Humanities
✓ HUMA_P3_WMD_1772_Baukunst.xml → 1750-1800/Humanities
✓ SERM_P2_OMD_1715_Beerdigung.xml → 1700-1750/Sermons
✓ SCIE_P3_NoD_1799_Gasarten.xml → 1750-1800/Scientific
✓ SCIE_P2_WMD_1744_SelbstArtzt.xml → 1700-1750/Scientific
✓ SERM_P3_OMD_1760_Folgen.xml → 1750-1800/Sermons
✓ SERM_P3_WOD_1792_Hegel.xml → 1750-1800/Sermons
✓ NEWS_P2_OOD_1702_muenchen2.xml → 1700-1750/Newspapers
✓ DRAM_P2_WOD_1748_Hoelle.xml → 1700-1750/Drama
✓ NARR_P2_NoD_1715_Afr

# Preprocessing 

In [3]:
#!/usr/bin/env python3
"""
GerManC LING-GATE XML Preprocessor - Phase 2: PREPARE
Extracts linguistic features from LING-GATE XML files and prepares data for RAG system.
"""

import xml.etree.ElementTree as ET
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict
import re

class GerManCPreprocessor:
    def __init__(self, organized_dir, output_dir):
        self.organized_dir = Path(organized_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        
        # Initialize data containers
        self.documents = []
        self.tokens = []
        self.linguistic_features = defaultdict(list)
        
    def process_all_files(self):
        """Process all organized XML files"""
        stats = defaultdict(int)
        
        # Process each period
        for period_dir in self.organized_dir.iterdir():
            if period_dir.is_dir():
                period = period_dir.name
                
                # Process each genre
                for genre_dir in period_dir.iterdir():
                    if genre_dir.is_dir():
                        genre = genre_dir.name
                        
                        # Process XML files
                        for xml_file in genre_dir.glob("*.xml"):
                            try:
                                doc_data = self.process_xml_file(xml_file, period, genre)
                                if doc_data:
                                    self.documents.append(doc_data)
                                    stats['processed'] += 1
                                    print(f"✓ Processed: {xml_file.name}")
                                    
                            except Exception as e:
                                stats['errors'] += 1
                                print(f"✗ Error processing {xml_file.name}: {e}")
        
        # Save processed data
        self.save_processed_data()
        
        print(f"\n🎉 Processing complete!")
        print(f"📄 Documents: {stats['processed']}")
        print(f"🔤 Tokens: {len(self.tokens)}")
        print(f"❌ Errors: {stats['errors']}")
        
        return stats
    
    def process_xml_file(self, xml_path, period, genre):
        """Process single LING-GATE XML file"""
        try:
            tree = ET.parse(xml_path)
            root = tree.getroot()
            
            # Extract document metadata
            doc_id = xml_path.stem
            doc_data = {
                'doc_id': doc_id,
                'period': period,
                'genre': genre,
                'filename': xml_path.name,
                'year': self.extract_year(xml_path.name),
                'region': self.extract_region(xml_path.name),
                'sentences': [],
                'token_count': 0,
                'unique_words': set()
            }
            
            # Process sentences and tokens
            sentence_id = 0
            
            # Find all sentence-like structures
            for sentence_elem in root.iter():
                if self.is_sentence_element(sentence_elem):
                    sentence_data = self.process_sentence(
                        sentence_elem, doc_id, sentence_id, period, genre
                    )
                    if sentence_data['tokens']:
                        doc_data['sentences'].append(sentence_data)
                        sentence_id += 1
            
            # Update document stats
            doc_data['sentence_count'] = len(doc_data['sentences'])
            doc_data['unique_words'] = len(doc_data['unique_words'])
            
            return doc_data
            
        except ET.ParseError as e:
            print(f"XML Parse Error in {xml_path}: {e}")
            return None
    
    def process_sentence(self, sentence_elem, doc_id, sent_id, period, genre):
        """Process sentence and extract tokens"""
        sentence_data = {
            'doc_id': doc_id,
            'sentence_id': sent_id,
            'text': '',
            'tokens': []
        }
        
        token_id = 0
        
        # Extract tokens from sentence
        for token_elem in sentence_elem.iter():
            if self.is_token_element(token_elem):
                token_data = self.extract_token_features(
                    token_elem, doc_id, sent_id, token_id, period, genre
                )
                if token_data:
                    sentence_data['tokens'].append(token_data)
                    self.tokens.append(token_data)
                    token_id += 1
        
        # Build sentence text
        sentence_data['text'] = ' '.join([t['original'] for t in sentence_data['tokens']])
        
        return sentence_data
    
    def extract_token_features(self, token_elem, doc_id, sent_id, token_id, period, genre):
        """Extract linguistic features from token element"""
        
        # Get token attributes
        original = token_elem.get('norm', token_elem.text or '')
        modern = token_elem.get('lemma', original)
        pos = token_elem.get('pos', '')
        morph = token_elem.get('morph', '')
        
        # Skip empty tokens
        if not original.strip():
            return None
        
        token_data = {
            'doc_id': doc_id,
            'sentence_id': sent_id,
            'token_id': token_id,
            'period': period,
            'genre': genre,
            'original': original.strip(),
            'normalized': modern.strip(),
            'pos': pos,
            'morphology': morph,
            'is_spelling_variant': original.lower() != modern.lower(),
            'word_length': len(original),
            'has_archaic_spelling': self.is_archaic_spelling(original)
        }
        
        # Track linguistic changes
        if token_data['is_spelling_variant']:
            self.linguistic_features['spelling_variants'].append({
                'original': original,
                'modern': modern,
                'period': period,
                'genre': genre,
                'pos': pos
            })
        
        return token_data
    
    def is_sentence_element(self, elem):
        """Check if element represents a sentence"""
        # Common sentence tags in LING-GATE
        sentence_tags = ['s', 'sentence', 'seg']
        return elem.tag.lower() in sentence_tags
    
    def is_token_element(self, elem):
        """Check if element represents a token/word"""
        # Common token tags in LING-GATE
        token_tags = ['w', 'word', 'token', 'pc']  # pc for punctuation
        return elem.tag.lower() in token_tags
    
    def is_archaic_spelling(self, word):
        """Identify archaic spelling patterns"""
        archaic_patterns = [
            r'.*th.*',  # thun, rath
            r'.*umb$',  # vmb -> um
            r'.*ey.*',  # archaic diphthongs
            r'.*ck$',   # certain archaic endings
            r'^v[aeiou]', # v- beginnings (vmb)
        ]
        
        word_lower = word.lower()
        return any(re.match(pattern, word_lower) for pattern in archaic_patterns)
    
    def extract_year(self, filename):
        """Extract year from filename"""
        match = re.search(r'_(\d{4})_', filename)
        return int(match.group(1)) if match else None
    
    def extract_region(self, filename):
        """Extract region code from filename"""
        match = re.search(r'_([A-Za-z]+)_\d{4}_', filename)
        return match.group(1) if match else None
    
    def save_processed_data(self):
        """Save all processed data in multiple formats"""
        
        # 1. Save documents metadata as JSON
        docs_file = self.output_dir / "documents.json"
        with open(docs_file, 'w', encoding='utf-8') as f:
            # Convert sets to lists for JSON serialization
            docs_for_json = []
            for doc in self.documents:
                doc_copy = doc.copy()
                doc_copy['unique_words'] = list(doc_copy['unique_words'])
                docs_for_json.append(doc_copy)
            json.dump(docs_for_json, f, indent=2, ensure_ascii=False)
        
        # 2. Save tokens as CSV for analysis
        tokens_df = pd.DataFrame(self.tokens)
        tokens_file = self.output_dir / "tokens.csv"
        tokens_df.to_csv(tokens_file, index=False, encoding='utf-8')
        
        # 3. Save linguistic features
        features_file = self.output_dir / "linguistic_features.json"
        with open(features_file, 'w', encoding='utf-8') as f:
            json.dump(dict(self.linguistic_features), f, indent=2, ensure_ascii=False)
        
        # 4. Save summary statistics
        self.save_statistics()
        
        print(f"📁 Saved to {self.output_dir}/")
        print(f"   - documents.json ({len(self.documents)} docs)")
        print(f"   - tokens.csv ({len(self.tokens)} tokens)")
        print(f"   - linguistic_features.json")
        print(f"   - statistics.json")
    
    def save_statistics(self):
        """Generate and save corpus statistics"""
        
        # Period distribution
        period_stats = defaultdict(int)
        for doc in self.documents:
            period_stats[doc['period']] += 1
        
        # Genre distribution
        genre_stats = defaultdict(int)
        for doc in self.documents:
            genre_stats[doc['genre']] += 1
        
        # Spelling variants by period
        variants_by_period = defaultdict(int)
        for variant in self.linguistic_features['spelling_variants']:
            variants_by_period[variant['period']] += 1
        
        # Token statistics
        token_df = pd.DataFrame(self.tokens)
        
        stats = {
            'total_documents': len(self.documents),
            'total_tokens': len(self.tokens),
            'period_distribution': dict(period_stats),
            'genre_distribution': dict(genre_stats),
            'spelling_variants_by_period': dict(variants_by_period),
            'average_tokens_per_doc': len(self.tokens) / len(self.documents) if self.documents else 0,
            'unique_pos_tags': token_df['pos'].nunique() if not token_df.empty else 0,
            'spelling_variant_rate': len(self.linguistic_features['spelling_variants']) / len(self.tokens) if self.tokens else 0
        }
        
        stats_file = self.output_dir / "statistics.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

def main():
    """Run preprocessing"""
    organized_dir = "/Users/rohan/Downloads/2544/organized_germanc"  # From Phase 1
    output_dir = "/Users/rohan/Downloads/2544/processed_germanc"     # Phase 2 output
    
    preprocessor = GerManCPreprocessor(organized_dir, output_dir)
    stats = preprocessor.process_all_files()
    
    print(f"\n📊 Preprocessing complete!")
    print(f"Ready for Phase 3: Database creation")

if __name__ == "__main__":
    main()

✓ Processed: HUMA_P2_NoD_1737_Koenigstein.xml
✓ Processed: HUMA_P2_OMD_1725_Hass.xml
✓ Processed: HUMA_P2_WOD_1744_Pfaltz.xml
✓ Processed: HUMA_P2_OOD_1707_HundertNarren.xml
✓ Processed: HUMA_P2_WMD_1737_Curiositaeten.xml
✓ Processed: HUMA_P2_OOD_1704_WasserKunst.xml
✓ Processed: HUMA_P2_OOD_1731_AntiquitaetenSchatz.xml
✓ Processed: HUMA_P2_WMD_1739_Stollberg.xml
✓ Processed: HUMA_P2_NoD_1720_Remarques.xml
✓ Processed: HUMA_P2_WOD_1740_Poesie.xml
✓ Processed: HUMA_P2_WMD_1748_Samuel.xml
✓ Processed: HUMA_P2_WOD_1741_Antiquitaeten.xml
✓ Processed: HUMA_P2_NoD_1739_MusicalischInterval.xml
✓ Processed: HUMA_P2_OMD_1717_DienstMaegde.xml
✓ Processed: HUMA_P2_OMD_1729_Biedermann.xml
✓ Processed: SERM_P2_OMD_1715_Beerdigung.xml
✓ Processed: SERM_P2_WMD_1702_Leben.xml
✓ Processed: SERM_P2_WMD_1721_HeilBronnen.xml
✓ Processed: SERM_P2_NoD_1715_Klugheit.xml
✓ Processed: SERM_P2_WOD_1739_Kranckentrost.xml
✓ Processed: SERM_P2_NoD_1730_JubelFeste.xml
✓ Processed: SERM_P2_OMD_1734_Evangelisch.xml
✓

TypeError: 'int' object is not iterable