In [7]:
from collections import Counter
import re
from typing import List, Set, Dict, Tuple

class EnhancedSpellingCorrector:
    """Spelling corrector trained from misspelling corpora.

    Two input formats are understood:

    * "Dollar" format (used here for Birkbeck/Aspell/Wikipedia): a line
      starting with ``$`` names a correct word and the lines that follow,
      up to the next ``$`` line, are misspellings of it.
    * Holbrook tagged format: running text in which each error is marked as
      ``<ERR targ=correct> misspelling </ERR>``.

    In both formats words are lower-cased and ``_`` is read as a space.
    """

    # One Holbrook error tag: group 1 is the intended word, group 2 is the
    # text as written. Surrounding whitespace is stripped after matching,
    # because a greedy group would otherwise keep the blank before </ERR>.
    _HOLBROOK_ERR = re.compile(r'<ERR targ=([^>]+)>([^<]+)</ERR>')

    def __init__(self, debug_mode: bool = True):
        """Create an untrained corrector.

        Args:
            debug_mode: When true (the default), print a line for every
                training item and every lookup decision.
        """
        # How many times each correct word was seen during training.
        self.word_counts = Counter()
        # Known misspelling -> correct word (the last one trained wins).
        self.misspelling_dict = {}
        self.debug_mode = debug_mode

    @staticmethod
    def _normalize(token: str) -> str:
        """Strip, lower-case and turn underscores into spaces."""
        return token.strip().lower().replace('_', ' ')

    @classmethod
    def _iter_birkbeck(cls, text: str):
        """Yield ``(correct, misspelling)`` items from dollar-format text.

        A ``$`` header yields ``(correct, None)`` so callers can count the
        word itself; each following line yields ``(correct, misspelling)``.
        Empty headers (a bare ``$``) are ignored, as are the lines under them.
        """
        current_word = None
        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line.startswith('$'):
                current_word = cls._normalize(line[1:]) or None
                if current_word:
                    yield current_word, None
            elif current_word:
                yield current_word, cls._normalize(line)

    @classmethod
    def _iter_holbrook(cls, text: str):
        """Yield ``(correct, misspelling)`` for each usable ``<ERR>`` tag.

        Tags whose target is ``?`` (no known correction) are skipped.
        """
        for match in cls._HOLBROOK_ERR.finditer(text):
            correct = cls._normalize(match.group(1))
            misspelling = cls._normalize(match.group(2))
            if correct and misspelling and correct != '?':
                yield correct, misspelling

    def train_from_birkbeck(self, text: str):
        """Train from dollar-format text (``$correct`` then its misspellings)."""
        for correct, misspelling in self._iter_birkbeck(text):
            if misspelling is None:
                self.word_counts[correct] += 1
                if self.debug_mode:
                    print(f"Added correct word: {correct}")
            else:
                self.misspelling_dict[misspelling] = correct
                if self.debug_mode:
                    print(f"Added misspelling: {misspelling} -> {correct}")

    def train_from_holbrook(self, text: str):
        """Train from Holbrook tagged format."""
        for correct, misspelling in self._iter_holbrook(text):
            self.word_counts[correct] += 1
            self.misspelling_dict[misspelling] = correct
            if self.debug_mode:
                print(f"Added Holbrook pair: {misspelling} -> {correct}")

    def correct(self, word: str) -> str:
        """Return the most probable spelling correction for the word.

        Lookup order: known correct word, then known misspelling, then the
        most frequent known word one edit away. If nothing matches, the
        lower-cased input is returned unchanged.
        """
        word = word.lower()

        # If word is known, return it
        if word in self.word_counts:
            return word

        # If word is a known misspelling, return correction
        if word in self.misspelling_dict:
            correction = self.misspelling_dict[word]
            if self.debug_mode:
                print(f"Found correction for {word}: {correction}")
            return correction

        # Known words at edit distance 1
        known = [w for w in self.get_edits(word) if w in self.word_counts]
        if known:
            # Highest count wins; ties go to the alphabetically first word so
            # the answer does not depend on set iteration order.
            best_candidate = min(known, key=lambda w: (-self.word_counts[w], w))
            if self.debug_mode:
                print(f"Found edit distance correction for {word}: {best_candidate}")
            return best_candidate

        if self.debug_mode:
            print(f"No correction found for {word}, returning as is")
        return word

    def get_edits(self, word: str) -> Set[str]:
        """Get all strings that are one edit distance away from word.

        An edit is a single deletion, adjacent transposition, replacement or
        insertion, using the lowercase letters a-z.
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]

        return set(deletes + transposes + replaces + inserts)

    def evaluate_all_corpora(self, texts: Dict[str, str]) -> Dict[str, Tuple[float, List[Tuple[str, str, str]]]]:
        """Evaluate corrector on all corpora.

        Returns a dict mapping each corpus name to the ``(accuracy, errors)``
        pair produced by :meth:`evaluate`.
        """
        return {corpus_name: self.evaluate(text) for corpus_name, text in texts.items()}

    def evaluate(self, text: str) -> Tuple[float, List[Tuple[str, str, str]]]:
        """Evaluate corrector on test corpus.

        The format is detected from the content: text containing ``<ERR`` is
        read as Holbrook tagged format, anything else as dollar format.

        Returns:
            ``(accuracy, errors)``. ``accuracy`` is the fraction of
            misspellings corrected to the expected word (0.0 when the corpus
            has none). ``errors`` lists
            ``(misspelling, expected, predicted)`` for every miss.
        """
        if '<ERR' in text:  # Holbrook format
            pairs = self._iter_holbrook(text)
        else:  # Birkbeck/Aspell/Wikipedia format
            pairs = (
                (expected, misspelling)
                for expected, misspelling in self._iter_birkbeck(text)
                if misspelling is not None
            )

        hits = 0
        total = 0
        errors = []
        for expected, misspelling in pairs:
            prediction = self.correct(misspelling)
            total += 1
            if prediction == expected:
                hits += 1
            else:
                errors.append((misspelling, expected, prediction))

        return (hits / total if total > 0 else 0.0), errors

def load_and_train_corrector(corpus_texts: Dict[str, str]) -> EnhancedSpellingCorrector:
    """Build an EnhancedSpellingCorrector and train it on every given corpus.

    Args:
        corpus_texts: Maps a corpus name to its full text. A corpus is read
            as Holbrook tagged format when it is named ``'holbrook-tagged'``
            or when its text contains ``<ERR`` (the same content check that
            ``EnhancedSpellingCorrector.evaluate`` uses); every other corpus
            is read as ``$correct`` / misspelling lines.

    Returns:
        The trained corrector. Training progress and summary statistics are
        printed along the way.
    """
    corrector = EnhancedSpellingCorrector()

    # Train on each corpus
    for corpus_name, text in corpus_texts.items():
        print(f"\nTraining on {corpus_name}...")
        # Fall back to sniffing the content so a tagged corpus stored under a
        # different key (e.g. a filename) is not silently parsed as the
        # dollar format, which would learn nothing from it.
        if corpus_name == 'holbrook-tagged' or '<ERR' in text:
            corrector.train_from_holbrook(text)
        else:  # birkbeck, aspell, wikipedia formats
            corrector.train_from_birkbeck(text)

    # Print statistics
    print("\nTraining Statistics:")
    print(f"Known words: {len(corrector.word_counts)}")
    print(f"Known misspellings: {len(corrector.misspelling_dict)}")
    print("\nSample of correct words:", list(corrector.word_counts.keys())[:5])
    print("Sample of misspellings:", list(corrector.misspelling_dict.items())[:5])

    return corrector

# Demo data: one tiny in-memory corpus per supported source. The three
# dollar-format corpora list a "$correct" header followed by its misspellings;
# the Holbrook sample is running text with <ERR> tags.
sample_corpus_texts = {
    'birkbeck': '\n'.join([
        '$receive', 'recieve', 'receve', 'receeve',
        '$separate', 'seperate', 'seprate',
        '$accommodate', 'accomodate', 'acomodate',
    ]),

    # The first line deliberately ends with a space before the line break.
    'holbrook-tagged': (
        'Here is some text with <ERR targ=receive>recieve</ERR> and \n'
        '<ERR targ=separate>seperate</ERR> as errors.'
    ),

    'aspell': '\n'.join(['$receive', 'recieve', '$separate', 'seperate']),

    'wikipedia': '\n'.join(['$receive', 'recieve', '$separate', 'seperate']),
}

# Train a corrector on the demo corpora.
print("Training corrector...")
corrector = load_and_train_corrector(sample_corpus_texts)

# Spot-check a few individual words.
print("\nTesting corrections:")
test_words = ["recieve", "seperate", "occured", "accomodate"]
for word in test_words:
    print(f"'{word}' -> '{corrector.correct(word)}'")

# Score the corrector against each corpus and show up to three misses apiece.
print("\nEvaluating on all corpora:")
results = corrector.evaluate_all_corpora(sample_corpus_texts)
for corpus_name, outcome in results.items():
    accuracy, errors = outcome
    print(f"\n{corpus_name} Results:")
    print(f"Accuracy: {accuracy:.2%}")
    if not errors:
        continue
    print("Sample errors (misspelling -> correct -> predicted):")
    for misspelling, expected, predicted in errors[:3]:
        print(f"  {misspelling} -> {expected} -> {predicted}")

Training corrector...

Training on birkbeck...
Added correct word: receive
Added misspelling: recieve -> receive
Added misspelling: receve -> receive
Added misspelling: receeve -> receive
Added correct word: separate
Added misspelling: seperate -> separate
Added misspelling: seprate -> separate
Added correct word: accommodate
Added misspelling: accomodate -> accommodate
Added misspelling: acomodate -> accommodate

Training on holbrook-tagged...
Added Holbrook pair: recieve -> receive
Added Holbrook pair: seperate -> separate

Training on aspell...
Added correct word: receive
Added misspelling: recieve -> receive
Added correct word: separate
Added misspelling: seperate -> separate

Training on wikipedia...
Added correct word: receive
Added misspelling: recieve -> receive
Added correct word: separate
Added misspelling: seperate -> separate

Training Statistics:
Known words: 3
Known misspellings: 7

Sample of correct words: ['receive', 'separate', 'accommodate']
Sample of misspellings: [(

In [10]:
from collections import Counter, defaultdict
import re
from typing import List, Set, Dict, Tuple

class SpellingCorrector:
    """Spelling corrector that also records where each misspelling came from.

    Training text uses the "dollar" format: a line starting with ``$`` names
    a correct word and the lines that follow, up to the next ``$`` line, are
    misspellings of it. Words are lower-cased and ``_`` is read as a space.
    """

    def __init__(self):
        # How many times each correct word appeared as a "$" header.
        self.word_counts = Counter()
        # Known misspelling -> correct word (the last one trained wins).
        self.misspelling_dict = {}
        self.misspelling_sources = defaultdict(set)  # Track which files each misspelling came from

    def train_from_corpus(self, text: str, source_file: str):
        """Train from dollar-format text and tag each misspelling with its source.

        Args:
            text: Corpus contents (``$correct`` headers followed by
                misspelling lines).
            source_file: Label recorded in ``misspelling_sources`` for every
                misspelling found in ``text``.

        A warning is printed when a misspelling was already mapped to a
        different correct word; the new mapping then replaces the old one.
        """
        current_word = None
        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line.startswith('$'):
                # A bare "$" has no word: ignore it (and the lines under it)
                # instead of registering the empty string as a known word.
                current_word = line[1:].lower().replace('_', ' ') or None
                if current_word:
                    self.word_counts[current_word] += 1
            elif current_word:
                misspelling = line.lower().replace('_', ' ')
                # Add or update misspelling
                previous = self.misspelling_dict.get(misspelling)
                if previous is not None and previous != current_word:
                    print(f"Warning: Conflicting correction for '{misspelling}':")
                    print(f"  Previous: {previous}")
                    print(f"  New: {current_word}")
                self.misspelling_dict[misspelling] = current_word
                self.misspelling_sources[misspelling].add(source_file)

    def get_edits(self, word: str) -> Set[str]:
        """Get all strings that are one edit distance away from word.

        An edit is a single deletion, adjacent transposition, replacement or
        insertion, using the lowercase letters a-z.
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]

        return set(deletes + transposes + replaces + inserts)

    def _most_frequent(self, known_words) -> str:
        """Pick the highest-count word; ties go to the alphabetically first."""
        return min(known_words, key=lambda w: (-self.word_counts[w], w))

    def correct(self, word: str) -> str:
        """Return the most probable spelling correction for the word.

        Lookup order: known correct word, known misspelling, most frequent
        known word at edit distance 1, then at edit distance 2. If nothing
        matches, the lower-cased input is returned unchanged.
        """
        word = word.lower()

        # If word is known, return it
        if word in self.word_counts:
            return word

        # If word is a known misspelling, return correction
        if word in self.misspelling_dict:
            return self.misspelling_dict[word]

        # Known words at edit distance 1
        edits1 = self.get_edits(word)
        known1 = [w for w in edits1 if w in self.word_counts]
        if known1:
            return self._most_frequent(known1)

        # Known words at edit distance 2. Filter while generating so only the
        # handful of known words is kept, not every second-order edit.
        known2 = {
            e2
            for e1 in edits1
            for e2 in self.get_edits(e1)
            if e2 in self.word_counts
        }
        if known2:
            return self._most_frequent(known2)

        return word

def _holbrook_to_corpus_format(text: str) -> str:
    """Rewrite Holbrook ``<ERR targ=X> Y </ERR>`` tags as ``$X`` / ``Y`` lines.

    Tags whose target is ``?`` (no known correction) are dropped. Runs of
    whitespace inside a target or misspelling collapse to one space so each
    item stays on a single line.
    """
    lines = []
    for match in re.finditer(r'<ERR targ=([^>]+)>([^<]+)</ERR>', text):
        target = ' '.join(match.group(1).split())
        misspelling = ' '.join(match.group(2).split())
        if target and misspelling and target != '?':
            lines.append(f"${target}")
            lines.append(misspelling)
    return '\n'.join(lines)


def load_and_analyze_corpus_files():
    """Train a SpellingCorrector from the corpus files and print statistics.

    Reads ``birkbeck.dat``, ``holbrook-tagged.dat``, ``aspell.dat`` and
    ``wikipedia.dat`` from the current working directory. A file that cannot
    be opened or decoded is reported and skipped. Text containing ``<ERR``
    tags is converted to the ``$correct`` / misspelling layout before
    training, because ``train_from_corpus`` only understands that layout.

    Returns:
        The trained SpellingCorrector.
    """
    corpus_files = [
        'birkbeck.dat',
        'holbrook-tagged.dat',
        'aspell.dat',
        'wikipedia.dat'
    ]

    corrector = SpellingCorrector()

    # Load and process each file
    for filename in corpus_files:
        # Only the file access is guarded; training runs outside the try so
        # that a programming error there is not reported as a load failure.
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                print(f"\nLoading {filename}...")
                data = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error loading {filename}: {e}")
            continue

        if '<ERR' in data:
            data = _holbrook_to_corpus_format(data)
        corrector.train_from_corpus(data, filename)

    # Analyze duplicates and sources
    print("\nAnalyzing corpus statistics...")

    # Count misspellings by number of sources
    source_counts = Counter(len(sources) for sources in corrector.misspelling_sources.values())

    print("\nMisspelling source statistics:")
    for num_sources, count in sorted(source_counts.items()):
        print(f"Misspellings found in {num_sources} file(s): {count}")

    # Find misspellings that appear in multiple files
    print("\nSample of misspellings found in multiple files:")
    multi_source_misspellings = [(misspelling, sources)
                                for misspelling, sources in corrector.misspelling_sources.items()
                                if len(sources) > 1]

    # The sample is the first ten in training order, shown alphabetically.
    for misspelling, sources in sorted(multi_source_misspellings[:10]):
        correct = corrector.misspelling_dict[misspelling]
        # Sort the source names: a set has no stable iteration order.
        print(f"'{misspelling}' -> '{correct}' (found in: {', '.join(sorted(sources))})")

    return corrector

# Create and train the corrector
print("Starting spelling corrector training...")
corrector = load_and_analyze_corpus_files()

# Print overall statistics
print("\nFinal Statistics:")
print(f"Total unique correct words: {len(corrector.word_counts)}")
print(f"Total unique misspellings: {len(corrector.misspelling_dict)}")

# Test some common misspellings
test_words = [
    "recieve",
    "seperate",
    "occured",
    "accomodate",
    "neccessary",
    "wierd",
    "belive",
    "freind",
    "thier",
    "peice",
]

print("\nTesting corrections:")
for word in test_words:
    correction = corrector.correct(word)
    # .get() rather than indexing: misspelling_sources is a defaultdict, and
    # indexing would insert an empty entry for every word that is looked up.
    sources = corrector.misspelling_sources.get(word, set())
    if sources:
        # Sort the source names so the output is the same on every run.
        print(f"'{word}' -> '{correction}' (found in: {', '.join(sorted(sources))})")
    else:
        print(f"'{word}' -> '{correction}' (not in training data)")

def interactive_testing():
    """Read words from standard input and print the correction for each.

    Uses the module-level ``corrector``. The loop ends when the user types
    ``quit``, or on end-of-input / Ctrl-C. Blank lines are ignored.
    """
    print("\nEnter words to correct (or 'quit' to exit):")
    while True:
        try:
            word = input("> ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop instead of a traceback.
            print()
            break
        if word == 'quit':
            break
        if not word:
            continue
        correction = corrector.correct(word)
        sources = corrector.misspelling_sources.get(word, set())
        if sources:
            # Sort the source names so the output is the same on every run.
            print(f"'{word}' -> '{correction}' (found in: {', '.join(sorted(sources))})")
        else:
            print(f"'{word}' -> '{correction}' (not in training data)")

# Offer an interactive session; any reply beginning with "y" accepts.
print("\nWould you like to try interactive testing? (yes/no)")
response = input().strip().lower()
if response[:1] == 'y':
    interactive_testing()

Starting spelling corrector training...
Error loading birkbeck.dat: [Errno 2] No such file or directory: 'birkbeck.dat'
Error loading holbrook-tagged.dat: [Errno 2] No such file or directory: 'holbrook-tagged.dat'

Loading aspell.dat...
  Previous: consistent
  New: consonant
  Previous: favor
  New: favour
  Previous: kilometer
  New: kilometre

Loading wikipedia.dat...
  Previous: catalina
  New: catiline
  Previous: liaison
  New: mara liasson
  Previous: adders
  New: address
  Previous: advice
  New: advised
  Previous: a lot
  New: allot
  Previous: again
  New: angina
  Previous: anal
  New: annual
  Previous: appalling
  New: appealing
  Previous: archaeologist
  New: archeologist
  Previous: archaeologists
  New: archeologists
  Previous: archaeology
  New: archeology
  Previous: archaeology
  New: archeology
  Previous: achieve
  New: archive
  Previous: achieved
  New: archived
  Previous: amateur
  New: armature
  Previous: accession
  New: ascension
  Previous: attainder
 