In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
import os

# --- 1. CONFIGURATION & SETUP ---

# Maximum word count for sentences. Sticking to 10 for simplicity.
STRICT_MAX_WORDS = 10
OUTPUT_FILENAME = 'clean_bengali_sentences.csv'

urls = [
    "https://bn.wikipedia.org/wiki/বাংলাদেশ",
    "https://bn.wikipedia.org/wiki/ভারত",
    "https://bn.wikipedia.org/wiki/পশ্চিমবঙ্গ",
    "https://bn.wikipedia.org/wiki/ঢাকা",
    "https://bn.wikipedia.org/wiki/কলকাতা",
    "https://bn.wikipedia.org/wiki/রবীন্দ্রনাথ_ঠাকুর",
    "https://bn.wikipedia.org/wiki/বাংলা_ভাষা",
    "https://bn.wikipedia.org/wiki/শেখ_মুজিবুর_রহমান",
    "https://bn.wikipedia.org/wiki/কম্পিউটার",
    "https://bn.wikipedia.org/wiki/বিজ্ঞান"
]

sentences = []

print("--- Step 1: Scraping and Filtering Bengali Wikipedia Pages ---")

for url in urls:
    try:
        print(f"Fetching: {url}")
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() # Raise exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find('div', {'id': 'mw-content-text'})

        if content:
            paragraphs = content.find_all('p')

            for p in paragraphs:
                text = p.get_text()
                text = re.sub(r'\[\d+\]', '', text) # Remove citations
                sents = re.split(r'[।\n]', text) # Split by Bengali period or newline

                for s in sents:
                    s = s.strip()
                    s = re.sub(r'\([^)]*\)', '', s) # Remove parentheses content
                    s = s.strip()

                    words = s.split()

                    # --- STRICT FILTERING ---
                    # 1. 5 <= words <= STRICT_MAX_WORDS (Max 10 words)
                    # 2. Must contain Bengali characters
                    # 3. Exclude complex markup/non-standard chars
                    if 5 <= len(words) <= STRICT_MAX_WORDS and \
                       re.search(r'[\u0980-\u09FF]', s) and \
                       '|' not in s and '{' not in s:

                        # Add the Bengali period back for clean output (Y data)
                        sentences.append(s + '।')

        print(f"  → Current unique sentences collected: {len(set(sentences))}")

    except requests.exceptions.RequestException as e:
        print(f"  ✗ Error fetching {url}: {e}")
        continue
    except Exception as e:
        print(f"  ✗ Unexpected error on {url}: {e}")
        continue


# --- 2. FINALIZATION AND SAVE ---

# Ensure unique sentences and save the clean list
sentences = list(set(sentences))
print(f"\n✓ Final count of STRICTLY FILTERED (Y) sentences: {len(sentences)}.")

# Save to CSV
if len(sentences) > 0:
    df = pd.DataFrame({'correct_sentence': sentences})
    df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8')
    print(f"\n✓ Saved {len(sentences)} clean sentences to '{OUTPUT_FILENAME}'")
else:
    print("\n✗ No sentences collected. Cannot proceed to noise injection.")


--- Step 1: Scraping and Filtering Bengali Wikipedia Pages ---
Fetching: https://bn.wikipedia.org/wiki/বাংলাদেশ
  → Current unique sentences collected: 285
Fetching: https://bn.wikipedia.org/wiki/ভারত
  → Current unique sentences collected: 399
Fetching: https://bn.wikipedia.org/wiki/পশ্চিমবঙ্গ
  → Current unique sentences collected: 624
Fetching: https://bn.wikipedia.org/wiki/ঢাকা
  → Current unique sentences collected: 706
Fetching: https://bn.wikipedia.org/wiki/কলকাতা
  → Current unique sentences collected: 906
Fetching: https://bn.wikipedia.org/wiki/রবীন্দ্রনাথ_ঠাকুর
  → Current unique sentences collected: 1056
Fetching: https://bn.wikipedia.org/wiki/বাংলা_ভাষা
  → Current unique sentences collected: 1108
Fetching: https://bn.wikipedia.org/wiki/শেখ_মুজিবুর_রহমান
  → Current unique sentences collected: 1277
Fetching: https://bn.wikipedia.org/wiki/কম্পিউটার
  → Current unique sentences collected: 1320
Fetching: https://bn.wikipedia.org/wiki/বিজ্ঞান
  → Current unique sentences collec

# error

In [None]:
import pandas as pd
import re
import random
import os

# --- 1. CONFIGURATION ---

INPUT_FILENAME = 'clean_bengali_sentences.csv'
OUTPUT_FILENAME = 'bengali_simple_gec_training_data.csv'
TARGET_DATASET_SIZE = 50000

# Defined confusion sets for Orthographic errors
ORTHO_CONFUSION_MAP = {
    'ি': 'ী', 'ী': 'ি',    # Short/Long i
    'ন': 'ণ', 'ণ': 'ন',    # Dental/Retroflex n
    'স': 'শ', 'শ': 'স',    # Palatal/Dental s
    'অ': 'আ', 'আ': 'অ'    # Vowel slip
}

# Affixes/Particles we will target for corruption
TARGET_AFFIXES = ['টি', 'গুলো', 'কে', 'র', 'তে']
AFFIX_SUBSTITUTION_MAP = {
    'কে': 'র',  # Swap object marker for possessive marker
    'টি': 'গুলো', # Swap singular particle for plural
    'গুলো': 'টি'  # Swap plural particle for singular
}

# --- 2. NOISE INJECTION FUNCTIONS ---

def inject_ortho_error(correct_sentence):
    """Applies a single, focused orthographic substitution error."""
    chars = list(correct_sentence)

    # 90% chance to attempt injecting an orthographic error
    if random.random() < 0.9:
        possible_indices = [i for i, char in enumerate(chars) if char in ORTHO_CONFUSION_MAP]

        if possible_indices:
            idx = random.choice(possible_indices)
            original_char = chars[idx]

            # Substitute the character with its confused counterpart
            chars[idx] = ORTHO_CONFUSION_MAP.get(original_char, original_char)

            error_text = "".join(chars)
            # Only return the error if a change actually occurred
            if error_text != correct_sentence:
                return error_text

    return None

def inject_affix_error(correct_sentence):
    """Introduces an error by substituting or deleting a simple nominal affix/particle."""

    for original_affix in TARGET_AFFIXES:
        if original_affix in correct_sentence:
            # 1. Affix Deletion (Omission) - 50% chance
            if random.random() < 0.5:
                # Remove the first occurrence of the affix
                error_text = correct_sentence.replace(original_affix, '', 1)
                if error_text != correct_sentence:
                    return error_text

            # 2. Affix Substitution - 50% chance
            if original_affix in AFFIX_SUBSTITUTION_MAP and random.random() < 0.5:
                # Substitute the affix with a grammatically incorrect one
                incorrect_affix = AFFIX_SUBSTITUTION_MAP[original_affix]
                error_text = correct_sentence.replace(original_affix, incorrect_affix, 1)
                if error_text != correct_sentence:
                    return error_text

    return None

# --- 3. DATASET GENERATION EXECUTION ---

def run_noise_injection_pipeline():
    """Loads clean data, generates noisy pairs, and saves the final training set."""
    if not os.path.exists(INPUT_FILENAME):
        print(f"✗ Error: Input file '{INPUT_FILENAME}' not found. Run the collection script first.")
        return

    print(f"--- 1. Loading Clean Data from '{INPUT_FILENAME}' ---")
    df_clean = pd.read_csv(INPUT_FILENAME, encoding='utf-8')
    correct_sentences = df_clean['correct_sentence'].tolist()

    if not correct_sentences:
        print("✗ Error: Clean sentence list is empty. Cannot generate errors.")
        return

    print(f"✓ Loaded {len(correct_sentences)} unique clean sentences.")

    # 2. Generate (X, Y) pairs
    print(f"\n--- 2. Generating {TARGET_DATASET_SIZE} Synthetic Error Pairs ---")
    final_dataset = []

    # Loop continuously until the target size is met
    while len(final_dataset) < TARGET_DATASET_SIZE:
        # Loop through the clean sentences to generate variants
        for correct_sentence in correct_sentences:
            if len(final_dataset) >= TARGET_DATASET_SIZE:
                break

            # 2.1. Orthography Error (X1)
            error_ortho = inject_ortho_error(correct_sentence)
            if error_ortho and error_ortho != correct_sentence:
                final_dataset.append({'error_text': error_ortho, 'correct_text': correct_sentence})

            # 2.2. Nominal Affix Error (X2)
            error_affix = inject_affix_error(correct_sentence)
            if error_affix and error_affix != correct_sentence:
                final_dataset.append({'error_text': error_affix, 'correct_text': correct_sentence})

            # 2.3. Add the correct sentence as a "no correction needed" example (20% of time)
            if random.random() < 0.2:
                final_dataset.append({'error_text': correct_sentence, 'correct_text': correct_sentence})


    # 3. Final Save and Sample
    final_df = pd.DataFrame(final_dataset)

    # Truncate to the target size and shuffle
    final_df = final_df.head(TARGET_DATASET_SIZE).sample(frac=1).reset_index(drop=True)
    final_df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8')

    print(f"\n✓ Dataset ready! Saved {len(final_df)} paired samples to '{OUTPUT_FILENAME}'")

    print("\n--- Sample Synthetic Training Pairs (X, Y) ---")
    for index, row in final_df.head(5).iterrows():
        print(f"ERROR (X): {row['error_text']}")
        print(f"CORRECT (Y): {row['correct_text']}\n")

if __name__ == "__main__":
    run_noise_injection_pipeline()


--- 1. Loading Clean Data from 'clean_bengali_sentences.csv' ---
✓ Loaded 1378 unique clean sentences.

--- 2. Generating 50000 Synthetic Error Pairs ---

✓ Dataset ready! Saved 50000 paired samples to 'bengali_simple_gec_training_data.csv'

--- Sample Synthetic Training Pairs (X, Y) ---
ERROR (X): কলকাতার পরিবহণ ব্যবস্থার অপর এক বিশিষ্ট মাধ্যম হল ট্যাক্সি।
CORRECT (Y): কলকাতার পরিবহন ব্যবস্থার অপর এক বিশিষ্ট মাধ্যম হল ট্যাক্সি।

ERROR (X): কলকাতা বর্তমানে তথ্যপ্রযুক্তি শিল্পের এক অন্যতম প্রধান কেন্দ্র।
CORRECT (Y): কলকাতা বর্তমানে তথ্যপ্রযুক্তি শিল্পের একটি অন্যতম প্রধান কেন্দ্র।

ERROR (X): অনেক সরকার বৈজ্ঞাণিক গবেষণার সমর্থনে সংস্থাগুলিকে উৎসর্গ করেছে।
CORRECT (Y): অনেক সরকার বৈজ্ঞানিক গবেষণার সমর্থনে সংস্থাগুলিকে উৎসর্গ করেছে।

ERROR (X): ছিয়াত্তরের মণ্বন্তরে লক্ষাধিক সাধারণ মানুষের মৃত্যু ঘটে।
CORRECT (Y): ছিয়াত্তরের মন্বন্তরে লক্ষাধিক সাধারণ মানুষের মৃত্যু ঘটে।

ERROR (X): উৎসাহ ভাতা প্যাকেজে মাধ্যমে শিল্পোন্নয়নে সহায়তা করা।
CORRECT (Y): উৎসাহ ভাতা প্যাকেজের মাধ্যমে শিল্পোন্ন

# duplicate removal

In [None]:
import pandas as pd
from google.colab import files
import io

# --- 1. Upload the File ---
# Run this cell and select your 'simple_50k.csv' file from your local machine.
print("Please upload the 'simple_50k.csv' file:")
uploaded = files.upload()

# Get the filename (it should be 'simple_50k.csv')
file_name = next(iter(uploaded))

# --- 2. Load the Dataset ---
try:
    # Read the uploaded file into a DataFrame
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))
    print(f"\nSuccessfully loaded {len(df)} rows from '{file_name}'.")
except Exception as e:
    print(f"\nError loading the file: {e}")
    # Stop execution if loading fails
    exit()

# --- 3. Identify and Remove Duplicates ---
# Calculate the number of duplicate rows before removal
initial_rows = len(df)
duplicate_rows = df.duplicated().sum()

# Create a new DataFrame with only unique rows
df_unique = df.drop_duplicates()
unique_rows = len(df_unique)

print(f"\n--- Duplication Summary ---")
print(f"Total Rows (Original): {initial_rows}")
print(f"Duplicate Rows Found: {duplicate_rows}")
print(f"Unique Rows (After Removal): {unique_rows}")
print(f"Percentage of duplicates removed: {duplicate_rows / initial_rows * 100:.2f}%")

# --- 4. Save the Unique Dataset ---
output_file_name = "simple_50k_unique.csv"

# Save the unique DataFrame to a new CSV file
df_unique.to_csv(output_file_name, index=False)

# --- 5. Download the New File ---
# This will trigger a download prompt in your browser.
files.download(output_file_name)

print(f"\nSuccessfully saved the unique data to '{output_file_name}'.")
print("The file should now be downloading to your computer.")

Please upload the 'simple_50k.csv' file:


KeyboardInterrupt: 

# removed same same

In [None]:
import pandas as pd

# ============================================================
# 📂 Step 1: Load your dataset
# ============================================================
# Replace with your actual CSV path
CSV_PATH = "/content/simple_50k_unique.csv"

df = pd.read_csv(CSV_PATH, encoding='utf-8')

print(f"📊 Original dataset size: {len(df)} rows")
print(df.head())

# ============================================================
# 🧹 Step 2: Remove rows where error_text == correct_text
# ============================================================
before_count = len(df)

# Drop rows where both columns are exactly same (after stripping spaces)
df = df[df['error_text'].str.strip() != df['correct_text'].str.strip()]

after_count = len(df)
removed = before_count - after_count

print(f"\n✅ Removed {removed} identical rows.")
print(f"📉 Cleaned dataset size: {after_count} rows")

# ============================================================
# 💾 Step 3: Save the cleaned dataset
# ============================================================
CLEAN_PATH = "/content/simple_50k_unique_1.csv"
df.to_csv(CLEAN_PATH, index=False, encoding='utf-8')

print(f"\n💾 Cleaned dataset saved to: {CLEAN_PATH}")

# Show sample cleaned data
print("\n🔍 Sample cleaned rows:")
print(df.head())


📊 Original dataset size: 14525 rows
                                          error_text  \
0  কলকাতার পরিবহণ ব্যবস্থার অপর এক বিশিষ্ট মাধ্যম...   
1  কলকাতা বর্তমানে তথ্যপ্রযুক্তি শিল্পের এক অন্যত...   
2  অনেক সরকার বৈজ্ঞাণিক গবেষণার সমর্থনে সংস্থাগুল...   
3  ছিয়াত্তরের মণ্বন্তরে লক্ষাধিক সাধারণ মানুষের ...   
4  উৎসাহ ভাতা প্যাকেজে মাধ্যমে শিল্পোন্নয়নে সহায...   

                                        correct_text  
0  কলকাতার পরিবহন ব্যবস্থার অপর এক বিশিষ্ট মাধ্যম...  
1  কলকাতা বর্তমানে তথ্যপ্রযুক্তি শিল্পের একটি অন্...  
2  অনেক সরকার বৈজ্ঞানিক গবেষণার সমর্থনে সংস্থাগুল...  
3  ছিয়াত্তরের মন্বন্তরে লক্ষাধিক সাধারণ মানুষের ...  
4  উৎসাহ ভাতা প্যাকেজের মাধ্যমে শিল্পোন্নয়নে সহা...  

✅ Removed 1363 identical rows.
📉 Cleaned dataset size: 13162 rows

💾 Cleaned dataset saved to: /content/simple_50k_unique_1.csv

🔍 Sample cleaned rows:
                                          error_text  \
0  কলকাতার পরিবহণ ব্যবস্থার অপর এক বিশিষ্ট মাধ্যম...   
1  কলকাতা বর্তমানে তথ্যপ্রযুক্তি

# final final

In [3]:
"""
Improved Bengali Grammar Correction System (Optimized for Colab)
Faster version with Levenshtein distance and progress updates
"""

import pandas as pd
import pickle
import re
from collections import defaultdict
import os
from difflib import SequenceMatcher

# ✅ install and use fast edit distance
try:
    from Levenshtein import distance as fast_distance
except ImportError:
    print("Installing python-Levenshtein ...")
    import subprocess, sys
    subprocess.run([sys.executable, "-m", "pip", "install", "python-Levenshtein"])
    from Levenshtein import distance as fast_distance


# ============================================================
# Improved Corrector with Better Logic
# ============================================================

class ImprovedBengaliCorrector:
    """
    Improved corrector with:
    1. Minimal edit distance (fast Levenshtein)
    2. High-confidence corrections only
    3. Word-level + bigram contextual rules
    """

    def __init__(self):
        self.correction_dict = {}
        self.word_corrections = defaultdict(lambda: defaultdict(int))
        self.edit_patterns = []
        self.best_word_corrections = {}
        self.bigram_corrections = defaultdict(lambda: defaultdict(int))

    # ============================================================
    # TRAINING
    # ============================================================
    def train(self, csv_path, max_samples=14000):
        print("=" * 60)
        print("📚 Training Improved Bengali Grammar Corrector")
        print("=" * 60)

        df = pd.read_csv(csv_path, encoding="utf-8")
        df.columns = ["incorrect", "correct"]
        if len(df) > max_samples:
            df = df.sample(n=max_samples, random_state=42).reset_index(drop=True)
        print(f"✓ Loaded {len(df)} samples")

        # Sentence-level corrections
        for _, row in df.iterrows():
            self.correction_dict[row["incorrect"].strip()] = row["correct"].strip()
        print(f"✓ Built {len(self.correction_dict)} sentence mappings")

        # Word & bigram-level rules
        print("\nExtracting word-level and bigram rules ...")
        for _, row in df.iterrows():
            inc_words = str(row["incorrect"]).split()
            cor_words = str(row["correct"]).split()
            if len(inc_words) == len(cor_words):
                for i, (inc, cor) in enumerate(zip(inc_words, cor_words)):
                    if inc != cor:
                        self.word_corrections[inc][cor] += 1
                        if i > 0:
                            prev = inc_words[i - 1]
                            self.bigram_corrections[(prev, inc)][cor] += 1

        # Best word corrections (≥2 occurrences)
        for inc_word, variants in self.word_corrections.items():
            cor, count = max(variants.items(), key=lambda x: x[1])
            if count >= 2:
                self.best_word_corrections[inc_word] = cor
        print(f"✓ {len(self.best_word_corrections)} reliable word corrections learned")

        # Character-level edit patterns
        print("\nAnalyzing edit patterns ...")
        from collections import Counter
        edits = Counter()
        for inc, cor in self.correction_dict.items():
            for e in self._get_edits(inc, cor):
                edits[e] += 1
        self.edit_patterns = [e for e, c in edits.items() if c >= 5]
        print(f"✓ Found {len(self.edit_patterns)} frequent edit patterns")

        print("\n✅ Training completed!")
        return self

    # ============================================================
    # UTILITIES
    # ============================================================
    def _get_edits(self, s1, s2):
        edits = []
        matcher = SequenceMatcher(None, s1, s2)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == "replace":
                o, n = s1[i1:i2], s2[j1:j2]
                if len(o) == len(n) == 1:
                    edits.append(("sub", o, n))
            elif tag == "delete":
                edits.append(("del", s1[i1:i2], ""))
            elif tag == "insert":
                edits.append(("ins", "", s2[j1:j2]))
        return edits

    def _apply_edits(self, text):
        corrected = text
        for typ, old, new in self.edit_patterns:
            if typ == "sub" and old in corrected:
                corrected = corrected.replace(old, new)
        return corrected

    # ============================================================
    # INFERENCE
    # ============================================================
    def correct(self, text, aggressive=False):
        text = str(text).strip()
        if not text:
            return text

        # 1️⃣ Exact match
        if text in self.correction_dict:
            return self.correction_dict[text]

        # 2️⃣ Similar sentence (fast distance, limited search)
        best_match, best_distance = None, float("inf")
        text_len = len(text)
        for inc, cor in list(self.correction_dict.items())[:3000]:
            if abs(len(inc) - text_len) > 4:
                continue
            d = fast_distance(text, inc)
            if d <= 3 and d < best_distance:
                best_distance, best_match = d, cor
        if best_match and best_distance <= 2:
            return best_match

        # 3️⃣ Word-level correction
        words, new_words = text.split(), []
        changes = 0
        for i, w in enumerate(words):
            corrected = w
            if i > 0 and (words[i - 1], w) in self.bigram_corrections:
                opts = self.bigram_corrections[(words[i - 1], w)]
                corrected = max(opts.items(), key=lambda x: x[1])[0]
            elif w in self.best_word_corrections:
                corrected = self.best_word_corrections[w]
            if corrected != w:
                changes += 1
            new_words.append(corrected)
        if changes > 0:
            candidate = " ".join(new_words)
            if fast_distance(text, candidate) <= len(text) * 0.3:
                return candidate

        # 4️⃣ Aggressive mode (character-level)
        if aggressive:
            corrected = self._apply_edits(text)
            if corrected != text:
                return corrected

        return text

    # ============================================================
    # SAVE / LOAD
    # ============================================================
    def save(self, model_dir="model"):
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, "improved_corrector.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(
                {
                    "correction_dict": self.correction_dict,
                    "word_corrections": dict(self.word_corrections),
                    "best_word_corrections": self.best_word_corrections,
                    "edit_patterns": self.edit_patterns,
                },
                f,
            )
        print(f"✓ Model saved to: {model_path}")
        return model_path

    def load(self, model_path):
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        self.correction_dict = data["correction_dict"]
        self.word_corrections = defaultdict(lambda: defaultdict(int), data["word_corrections"])
        self.best_word_corrections = data["best_word_corrections"]
        self.edit_patterns = data["edit_patterns"]
        print(f"✓ Loaded model from: {model_path}")
        return self

    # ============================================================
    # EVALUATION
    # ============================================================
    def evaluate(self, test_df, aggressive=False):
        correct = 0
        total = len(test_df)
        results = []

        for idx, row in test_df.iterrows():
            inc, exp = str(row["incorrect"]).strip(), str(row["correct"]).strip()
            pred = self.correct(inc, aggressive=aggressive)
            is_correct = pred == exp
            if is_correct:
                correct += 1

            results.append(
                {"incorrect": inc, "expected": exp, "predicted": pred, "correct": is_correct}
            )

            if (idx + 1) % 100 == 0:
                print(f"Processed {idx+1}/{total} sentences...", flush=True)

        acc = (correct / total) * 100 if total else 0
        return acc, results


# ============================================================
# TRAINING PIPELINE
# ============================================================

def train_improved_corrector():
    CSV_PATH = "/content/simple_50k_unique_1.csv"
    MODEL_DIR = "/content/model1"
    MAX_SAMPLES = 14000

    print("🇧🇩 Improved Bengali Grammar Correction - Training")
    print("=" * 60)

    df = pd.read_csv(CSV_PATH, encoding="utf-8")
    df.columns = ["incorrect", "correct"]
    if len(df) > MAX_SAMPLES:
        df = df.sample(n=MAX_SAMPLES, random_state=42).reset_index(drop=True)

    # 95/5 split
    test_size = int(len(df) * 0.05)
    test_df, train_df = df.iloc[:test_size], df.iloc[test_size:]
    print(f"📊 Train: {len(train_df)}, Test: {len(test_df)}")

    train_csv = "/content/train_improved.csv"
    train_df.to_csv(train_csv, index=False, encoding="utf-8")

    corrector = ImprovedBengaliCorrector()
    corrector.train(train_csv)
    model_path = corrector.save(MODEL_DIR)

    print("\n" + "=" * 60)
    print("📊 Evaluating on Test Set")
    print("=" * 60)

    # To avoid slowdown, test only on subset first
    test_df = test_df.sample(min(100, len(test_df)), random_state=42)

    acc_cons, res_cons = corrector.evaluate(test_df, aggressive=False)
    print(f"✅ Conservative Accuracy: {acc_cons:.2f}%")

    acc_aggr, res_aggr = corrector.evaluate(test_df, aggressive=True)
    print(f"✅ Aggressive Accuracy: {acc_aggr:.2f}%")

    final_acc = max(acc_cons, acc_aggr)
    print(f"\n🏁 Final Accuracy: {final_acc:.2f}%")

    return corrector, final_acc


# ============================================================
# TESTING
# ============================================================

def test_improved():
    MODEL_PATH = "/content/model1/improved_corrector.pkl"
    corrector = ImprovedBengaliCorrector().load(MODEL_PATH)

    tests = [
        "খ্রিষ্টধর্ম হল দেশের চতুর্থ বৃহত্তম ধর্ম, সংখ্যায় ০.৪ সতাংশ।",
        "ভৌগোলিক বিচারে বাংলাদেসের অবস্থান দক্ষিণ এশিয়ায়।",
        "বর্তমাণ বাংলাদেশের সীমানা।",
        "তিনি একজন বীজ্ঞানী।",
    ]

    for t in tests:
        print(f"\nInput:  {t}")
        print(f"Output: {corrector.correct(t)}")


if __name__ == "__main__":
    corrector, acc = train_improved_corrector()
    # test_improved()


🇧🇩 Improved Bengali Grammar Correction - Training
📊 Train: 12504, Test: 658
📚 Training Improved Bengali Grammar Corrector
✓ Loaded 12504 samples
✓ Built 12504 sentence mappings

Extracting word-level and bigram rules ...
✓ 1655 reliable word corrections learned

Analyzing edit patterns ...
✓ Found 15 frequent edit patterns

✅ Training completed!
✓ Model saved to: /content/model1/improved_corrector.pkl

📊 Evaluating on Test Set
Processed 300/100 sentences...
Processed 200/100 sentences...
✅ Conservative Accuracy: 79.00%
Processed 300/100 sentences...
Processed 200/100 sentences...
✅ Aggressive Accuracy: 79.00%

🏁 Final Accuracy: 79.00%


In [1]:
"""
🇧🇩 Improved Bengali Grammar Correction System (Optimized for Colab)
Includes: Training + Evaluation Metrics + Interactive Testing
"""

import pandas as pd
import pickle
import re
from collections import defaultdict, Counter
import os
from difflib import SequenceMatcher

# ✅ Install and use fast Levenshtein distance
try:
    from Levenshtein import distance as fast_distance
except ImportError:
    print("Installing python-Levenshtein ...")
    import subprocess, sys
    subprocess.run([sys.executable, "-m", "pip", "install", "python-Levenshtein"])
    from Levenshtein import distance as fast_distance


# ============================================================
# IMPROVED CORRECTOR CLASS
# ============================================================

class ImprovedBengaliCorrector:
    """
    Rule + Edit distance based Bengali grammar correction system.
    """

    def __init__(self):
        self.correction_dict = {}
        self.word_corrections = defaultdict(lambda: defaultdict(int))
        self.edit_patterns = []
        self.best_word_corrections = {}
        self.bigram_corrections = defaultdict(lambda: defaultdict(int))

    # ============================================================
    # TRAINING
    # ============================================================
    def train(self, csv_path, max_samples=14000):
        print("=" * 60)
        print("📚 Training Improved Bengali Grammar Corrector")
        print("=" * 60)

        df = pd.read_csv(csv_path, encoding="utf-8")
        df.columns = ["incorrect", "correct"]
        if len(df) > max_samples:
            df = df.sample(n=max_samples, random_state=42).reset_index(drop=True)
        print(f"✓ Loaded {len(df)} samples")

        # Sentence-level corrections
        for _, row in df.iterrows():
            self.correction_dict[row["incorrect"].strip()] = row["correct"].strip()
        print(f"✓ Built {len(self.correction_dict)} sentence mappings")

        # Word & bigram-level rules
        print("\nExtracting word-level and bigram rules ...")
        for _, row in df.iterrows():
            inc_words = str(row["incorrect"]).split()
            cor_words = str(row["correct"]).split()
            if len(inc_words) == len(cor_words):
                for i, (inc, cor) in enumerate(zip(inc_words, cor_words)):
                    if inc != cor:
                        self.word_corrections[inc][cor] += 1
                        if i > 0:
                            prev = inc_words[i - 1]
                            self.bigram_corrections[(prev, inc)][cor] += 1

        # Best word corrections (≥2 occurrences)
        for inc_word, variants in self.word_corrections.items():
            cor, count = max(variants.items(), key=lambda x: x[1])
            if count >= 2:
                self.best_word_corrections[inc_word] = cor
        print(f"✓ {len(self.best_word_corrections)} reliable word corrections learned")

        # Character-level edit patterns
        print("\nAnalyzing edit patterns ...")
        edits = Counter()
        for inc, cor in self.correction_dict.items():
            for e in self._get_edits(inc, cor):
                edits[e] += 1
        self.edit_patterns = [e for e, c in edits.items() if c >= 5]
        print(f"✓ Found {len(self.edit_patterns)} frequent edit patterns")

        print("\n✅ Training completed!")
        return self

    # ============================================================
    # UTILITIES
    # ============================================================
    def _get_edits(self, s1, s2):
        edits = []
        matcher = SequenceMatcher(None, s1, s2)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == "replace":
                o, n = s1[i1:i2], s2[j1:j2]
                if len(o) == len(n) == 1:
                    edits.append(("sub", o, n))
            elif tag == "delete":
                edits.append(("del", s1[i1:i2], ""))
            elif tag == "insert":
                edits.append(("ins", "", s2[j1:j2]))
        return edits

    def _apply_edits(self, text):
        corrected = text
        for typ, old, new in self.edit_patterns:
            if typ == "sub" and old in corrected:
                corrected = corrected.replace(old, new)
        return corrected

    # ============================================================
    # INFERENCE
    # ============================================================
    def correct(self, text, aggressive=False):
        text = str(text).strip()
        if not text:
            return text

        # 1️⃣ Exact match
        if text in self.correction_dict:
            return self.correction_dict[text]

        # 2️⃣ Similar sentence (fast distance)
        best_match, best_distance = None, float("inf")
        text_len = len(text)
        for inc, cor in list(self.correction_dict.items())[:3000]:
            if abs(len(inc) - text_len) > 4:
                continue
            d = fast_distance(text, inc)
            if d <= 3 and d < best_distance:
                best_distance, best_match = d, cor
        if best_match and best_distance <= 2:
            return best_match

        # 3️⃣ Word-level correction
        words, new_words = text.split(), []
        changes = 0
        for i, w in enumerate(words):
            corrected = w
            if i > 0 and (words[i - 1], w) in self.bigram_corrections:
                opts = self.bigram_corrections[(words[i - 1], w)]
                corrected = max(opts.items(), key=lambda x: x[1])[0]
            elif w in self.best_word_corrections:
                corrected = self.best_word_corrections[w]
            if corrected != w:
                changes += 1
            new_words.append(corrected)
        if changes > 0:
            candidate = " ".join(new_words)
            if fast_distance(text, candidate) <= len(text) * 0.3:
                return candidate

        # 4️⃣ Aggressive mode
        if aggressive:
            corrected = self._apply_edits(text)
            if corrected != text:
                return corrected

        return text

    # ============================================================
    # SAVE / LOAD
    # ============================================================
    def save(self, model_dir="model"):
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, "improved_corrector.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(
                {
                    "correction_dict": self.correction_dict,
                    "word_corrections": dict(self.word_corrections),
                    "best_word_corrections": self.best_word_corrections,
                    "edit_patterns": self.edit_patterns,
                },
                f,
            )
        print(f"✓ Model saved to: {model_path}")
        return model_path

    def load(self, model_path):
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        self.correction_dict = data["correction_dict"]
        self.word_corrections = defaultdict(lambda: defaultdict(int), data["word_corrections"])
        self.best_word_corrections = data["best_word_corrections"]
        self.edit_patterns = data["edit_patterns"]
        print(f"✓ Loaded model from: {model_path}")
        return self

    # ============================================================
    # EVALUATION
    # ============================================================
    def evaluate(self, test_df, aggressive=False):
        correct = 0
        total = len(test_df)
        results = []

        for idx, row in test_df.iterrows():
            inc, exp = str(row["incorrect"]).strip(), str(row["correct"]).strip()
            pred = self.correct(inc, aggressive=aggressive)
            is_correct = pred == exp
            if is_correct:
                correct += 1

            results.append(
                {"incorrect": inc, "expected": exp, "predicted": pred, "correct": is_correct}
            )

            if (idx + 1) % 100 == 0:
                print(f"Processed {idx+1}/{total} sentences...", flush=True)

        acc = (correct / total) * 100 if total else 0
        return acc, results


# ============================================================
# TRAINING PIPELINE
# ============================================================

def train_improved_corrector():
    CSV_PATH = "/content/simple_50k_unique_1.csv"
    MODEL_DIR = "/content/model1"
    MAX_SAMPLES = 14000

    print("🇧🇩 Improved Bengali Grammar Correction - Training")
    print("=" * 60)

    df = pd.read_csv(CSV_PATH, encoding="utf-8")
    df.columns = ["incorrect", "correct"]
    if len(df) > MAX_SAMPLES:
        df = df.sample(n=MAX_SAMPLES, random_state=42).reset_index(drop=True)

    # 95/5 split
    test_size = int(len(df) * 0.05)
    test_df, train_df = df.iloc[:test_size], df.iloc[test_size:]
    print(f"📊 Train: {len(train_df)}, Test: {len(test_df)}")

    train_csv = "/content/train_improved.csv"
    train_df.to_csv(train_csv, index=False, encoding="utf-8")

    corrector = ImprovedBengaliCorrector()
    corrector.train(train_csv)
    model_path = corrector.save(MODEL_DIR)

    print("\n" + "=" * 60)
    print("📊 Evaluating on Test Set")
    print("=" * 60)

    test_df = test_df.sample(min(100, len(test_df)), random_state=42)
    acc_cons, res_cons = corrector.evaluate(test_df, aggressive=False)
    print(f"✅ Conservative Accuracy: {acc_cons:.2f}%")

    acc_aggr, res_aggr = corrector.evaluate(test_df, aggressive=True)
    print(f"✅ Aggressive Accuracy: {acc_aggr:.2f}%")

    final_acc = max(acc_cons, acc_aggr)
    print(f"\n🏁 Final Accuracy: {final_acc:.2f}%")

    return corrector, final_acc, test_df


# ============================================================
# EVALUATION METRICS
# ============================================================

from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_metrics(corrector, test_df):
    """
    Evaluate model using accuracy, precision, recall, F1, and change stats.
    """
    print("\n📊 Evaluating Model Performance...")
    acc, results = corrector.evaluate(test_df, aggressive=False)

    y_true = [r["correct"] for _, r in test_df.iterrows()]
    y_pred = [corrector.correct(r["incorrect"], aggressive=False) for _, r in test_df.iterrows()]
    y_true_binary = [1 if y_true[i] == y_pred[i] else 0 for i in range(len(y_true))]

    precision = precision_score(y_true_binary, [1]*len(y_true_binary), zero_division=0)
    recall = recall_score(y_true_binary, [1]*len(y_true_binary), zero_division=0)
    f1 = f1_score(y_true_binary, [1]*len(y_true_binary), zero_division=0)
    total = len(y_true_binary)
    correct = sum(y_true_binary)

    print("\n📈 Model Evaluation Metrics:")
    print("────────────────────────────")
    print(f"✅ Accuracy : {acc:.2f}% ({correct}/{total})")
    print(f"🎯 Precision: {precision:.2f}")
    print(f"📥 Recall   : {recall:.2f}")
    print(f"💡 F1 Score : {f1:.2f}")

    unchanged = sum(1 for i in range(total) if y_pred[i] == test_df.iloc[i]['incorrect'])
    changed = total - unchanged
    print(f"🧮 Sentences Changed  : {changed}/{total}")
    print(f"🕊️  Sentences Unchanged: {unchanged}/{total}")

    return acc


# ============================================================
# INTERACTIVE TESTING (USER INPUT)
# ============================================================

def interactive_test():
    """
    Allows you to test the trained ImprovedBengaliCorrector interactively in Colab.
    """
    MODEL_PATH = "/content/model1/improved_corrector.pkl"

    if not os.path.exists(MODEL_PATH):
        print("❌ Model not found. Please run training first.")
        return

    corrector = ImprovedBengaliCorrector().load(MODEL_PATH)

    print("\n🇧🇩 Bengali Grammar Correction - Interactive Mode")
    print("=" * 60)
    print("💡 Type a Bengali sentence to correct it.")
    print("💡 Type 'exit' to stop.")
    print("=" * 60)

    while True:
        text = input("\n📝 Enter sentence: ").strip()
        if text.lower() == "exit":
            print("\n👋 Exiting interactive test.")
            break
        if not text:
            continue

        # Predict both modes
        conservative = corrector.correct(text, aggressive=False)
        aggressive = corrector.correct(text, aggressive=True)

        print("\n🔹 Conservative Mode →", conservative)
        print("🔸 Aggressive Mode   →", aggressive)


# ============================================================
# MAIN EXECUTION
# ============================================================

if __name__ == "__main__":
    corrector, acc, test_df = train_improved_corrector()

    print("\n✅ Model Training Finished!")
    print(f"📊 Final Training Accuracy: {acc:.2f}%")

    evaluate_metrics(corrector, test_df)

    print("\nNow starting interactive testing...\n")
    interactive_test()


Installing python-Levenshtein ...
🇧🇩 Improved Bengali Grammar Correction - Training
📊 Train: 12504, Test: 658
📚 Training Improved Bengali Grammar Corrector
✓ Loaded 12504 samples
✓ Built 12504 sentence mappings

Extracting word-level and bigram rules ...
✓ 1655 reliable word corrections learned

Analyzing edit patterns ...
✓ Found 15 frequent edit patterns

✅ Training completed!
✓ Model saved to: /content/model1/improved_corrector.pkl

📊 Evaluating on Test Set
Processed 300/100 sentences...
Processed 200/100 sentences...
✅ Conservative Accuracy: 79.00%
Processed 300/100 sentences...
Processed 200/100 sentences...
✅ Aggressive Accuracy: 79.00%

🏁 Final Accuracy: 79.00%

✅ Model Training Finished!
📊 Final Training Accuracy: 79.00%

📊 Evaluating Model Performance...
Processed 300/100 sentences...
Processed 200/100 sentences...

📈 Model Evaluation Metrics:
────────────────────────────
✅ Accuracy : 79.00% (79/100)
🎯 Precision: 0.79
📥 Recall   : 1.00
💡 F1 Score : 0.88
🧮 Sentences Changed  : 

In [3]:
"""
🇧🇩 Improved Bengali Grammar Correction System (Optimized for Colab)
Includes: Training + Evaluation Metrics + Interactive Testing
MODIFICATIONS:
1. Laplace Smoothing for bigram/word correction.
2. Relaxed thresholds for distance search and word reliability to increase correction coverage.
"""

import pandas as pd
import pickle
from collections import defaultdict, Counter
import os
from difflib import SequenceMatcher

# ✅ Install and use fast Levenshtein distance
try:
    from Levenshtein import distance as fast_distance
except ImportError:
    print("Installing python-Levenshtein ...")
    import subprocess, sys
    subprocess.run([sys.executable, "-m", "pip", "install", "python-Levenshtein"])
    from Levenshtein import distance as fast_distance


# ============================================================
# IMPROVED CORRECTOR CLASS
# ============================================================

class ImprovedBengaliCorrector:
    """
    Rule + Edit distance based Bengali grammar correction system.
    """

    def __init__(self):
        self.correction_dict = {}
        self.word_corrections = defaultdict(lambda: defaultdict(int))
        self.edit_patterns = []
        self.best_word_corrections = {}
        self.smoothed_bigram_corrections = {}
        self.bigram_corrections = defaultdict(lambda: defaultdict(int))

    # ============================================================
    # TRAINING (with Laplace Smoothing and Relaxed Reliability)
    # ============================================================
    def train(self, csv_path, max_samples=14000, alpha=1):
        print("=" * 60)
        print("📚 Training Improved Bengali Grammar Corrector (Max Coverage)")
        print("=" * 60)

        df = pd.read_csv(csv_path, encoding="utf-8")
        df.columns = ["incorrect", "correct"]
        if len(df) > max_samples:
            df = df.sample(n=max_samples, random_state=42).reset_index(drop=True)
        print(f"✓ Loaded {len(df)} samples")

        # Sentence-level corrections
        for _, row in df.iterrows():
            self.correction_dict[row["incorrect"].strip()] = row["correct"].strip()
        print(f"✓ Built {len(self.correction_dict)} sentence mappings")

        # Word & bigram-level rules
        print("\nExtracting word-level and bigram rules ...")
        for _, row in df.iterrows():
            inc_words = str(row["incorrect"]).split()
            cor_words = str(row["correct"]).split()
            if len(inc_words) == len(cor_words):
                for i, (inc, cor) in enumerate(zip(inc_words, cor_words)):
                    if inc != cor:
                        self.word_corrections[inc][cor] += 1
                        if i > 0:
                            prev = inc_words[i - 1]
                            self.bigram_corrections[(prev, inc)][cor] += 1

        # Best word corrections (Filtered by original count >= 1 for increased coverage)
        for inc_word, variants in self.word_corrections.items():
            # Apply Laplace smoothing to counts for better max selection
            smoothed_variants = {cor: count + alpha for cor, count in variants.items()}
            best_cor, _ = max(smoothed_variants.items(), key=lambda x: x[1])

            # --- FIX 1: Relaxed Reliability Threshold (was >= 2) ---
            if variants[best_cor] >= 1:
                self.best_word_corrections[inc_word] = best_cor
        print(f"✓ {len(self.best_word_corrections)} reliable word corrections learned (Count >= 1)")

        # Smoothed Bigram corrections
        for bigram, variants in self.bigram_corrections.items():
            smoothed_variants = {cor: count + alpha for cor, count in variants.items()}
            best_cor, _ = max(smoothed_variants.items(), key=lambda x: x[1])
            self.smoothed_bigram_corrections[bigram] = best_cor
        print(f"✓ {len(self.smoothed_bigram_corrections)} smoothed bigram corrections learned")


        # Character-level edit patterns
        print("\nAnalyzing edit patterns ...")
        edits = Counter()
        for inc, cor in self.correction_dict.items():
            for e in self._get_edits(inc, cor):
                edits[e] += 1
        self.edit_patterns = [e for e, c in edits.items() if c >= 5]
        print(f"✓ Found {len(self.edit_patterns)} frequent edit patterns")

        print("\n✅ Training completed!")
        return self

    # ============================================================
    # UTILITIES
    # ============================================================
    def _get_edits(self, s1, s2):
        edits = []
        matcher = SequenceMatcher(None, s1, s2)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == "replace":
                o, n = s1[i1:i2], s2[j1:j2]
                if len(o) == len(n) == 1:
                    edits.append(("sub", o, n))
            elif tag == "delete":
                edits.append(("del", s1[i1:i2], ""))
            elif tag == "insert":
                edits.append(("ins", "", s2[j1:j2]))
        return edits

    def _apply_edits(self, text):
        corrected = text
        for typ, old, new in self.edit_patterns:
            # Simple string replacement for character edits
            if typ == "sub" and old in corrected:
                corrected = corrected.replace(old, new)
        return corrected

    # ============================================================
    # INFERENCE (Uses Relaxed Distance)
    # ============================================================
    def correct(self, text, aggressive=False):
        text = str(text).strip()
        if not text:
            return text

        # 1️⃣ Exact match
        if text in self.correction_dict:
            return self.correction_dict[text]

        # 2️⃣ Similar sentence (fast distance) - RELAXED
        best_match, best_distance = None, float("inf")
        text_len = len(text)
        # Limiting search scope for performance
        for inc, cor in list(self.correction_dict.items())[:3000]:
            if abs(len(inc) - text_len) > 4:
                continue
            d = fast_distance(text, inc)

            # --- FIX 2a: Relaxed Distance Search (was d <= 3) ---
            if d <= 5 and d < best_distance:
                best_distance, best_match = d, cor

        # --- FIX 2b: Relaxed Distance Return (was best_distance <= 2) ---
        if best_match and best_distance <= 3:
            return best_match

        # 3️⃣ Word-level correction (Uses smoothed bigrams)
        words, new_words = text.split(), []
        changes = 0
        for i, w in enumerate(words):
            corrected = w

            # Check smoothed bigram corrections first (higher priority, contextual)
            if i > 0 and (words[i - 1], w) in self.smoothed_bigram_corrections:
                corrected = self.smoothed_bigram_corrections[(words[i - 1], w)]

            # Fall back to reliable single-word corrections (now count >= 1)
            elif w in self.best_word_corrections:
                corrected = self.best_word_corrections[w]

            if corrected != w:
                changes += 1
            new_words.append(corrected)

        if changes > 0:
            candidate = " ".join(new_words)
            # Apply a heuristic check to ensure the change isn't too drastic
            if fast_distance(text, candidate) <= len(text) * 0.3:
                return candidate

        # 4️⃣ Aggressive mode
        if aggressive:
            corrected = self._apply_edits(text)
            if corrected != text:
                return corrected

        return text

    # ============================================================
    # SAVE / LOAD
    # ============================================================
    def save(self, model_dir="model"):
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, "improved_corrector.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(
                {
                    "correction_dict": self.correction_dict,
                    "word_corrections": dict(self.word_corrections),
                    "best_word_corrections": self.best_word_corrections,
                    "edit_patterns": self.edit_patterns,
                    "smoothed_bigram_corrections": self.smoothed_bigram_corrections,
                },
                f,
            )
        print(f"✓ Model saved to: {model_path}")
        return model_path

    def load(self, model_path):
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        self.correction_dict = data["correction_dict"]
        self.word_corrections = defaultdict(lambda: defaultdict(int), data["word_corrections"])
        self.best_word_corrections = data["best_word_corrections"]
        self.edit_patterns = data["edit_patterns"]
        self.smoothed_bigram_corrections = data.get("smoothed_bigram_corrections", {})
        print(f"✓ Loaded model from: {model_path}")
        return self

    # ============================================================
    # EVALUATION
    # ============================================================
    def evaluate(self, test_df, aggressive=False):
        correct = 0
        total = len(test_df)
        results = []

        for idx, row in test_df.iterrows():
            inc, exp = str(row["incorrect"]).strip(), str(row["correct"]).strip()
            pred = self.correct(inc, aggressive=aggressive)
            is_correct = pred == exp
            if is_correct:
                correct += 1

            results.append(
                {"incorrect": inc, "expected": exp, "predicted": pred, "correct": is_correct}
            )

            if (idx + 1) % 100 == 0:
                print(f"Processed {idx+1}/{total} sentences...", flush=True)

        acc = (correct / total) * 100 if total else 0
        return acc, results


# ============================================================
# TRAINING PIPELINE
# ============================================================

def train_improved_corrector():
    CSV_PATH = "/content/simple_50k_unique_1.csv"
    MODEL_DIR = "/content/model1"
    MAX_SAMPLES = 14000

    print("🇧🇩 Improved Bengali Grammar Correction - Training")
    print("=" * 60)

    try:
        df = pd.read_csv(CSV_PATH, encoding="utf-8")
    except FileNotFoundError:
        print(f"FATAL ERROR: CSV file not found at {CSV_PATH}. Cannot proceed with training.")
        # Create a dummy dataframe to prevent immediate crash
        df = pd.DataFrame({"incorrect": ["আমি জাভা করি", "তুমেঁ কোথায় যাচ্ছো"], "correct": ["আমি কাজ করি", "তুমি কোথায় যাচ্ছ"]})
        if len(df) < 10: # Ensure a minimum size for test/train split
            df = pd.concat([df]*50, ignore_index=True)


    df.columns = ["incorrect", "correct"]
    if len(df) > MAX_SAMPLES:
        df = df.sample(n=MAX_SAMPLES, random_state=42).reset_index(drop=True)

    # 95/5 split
    test_size = int(len(df) * 0.05)
    test_size = max(1, test_size) # Ensure at least 1 test sample

    test_df, train_df = df.iloc[:test_size], df.iloc[test_size:]
    print(f"📊 Train: {len(train_df)}, Test: {len(test_df)}")

    train_csv = "/content/train_improved.csv"
    train_df.to_csv(train_csv, index=False, encoding="utf-8")

    corrector = ImprovedBengaliCorrector()
    corrector.train(train_csv)
    model_path = corrector.save(MODEL_DIR)

    print("\n" + "=" * 60)
    print("📊 Evaluating on Test Set")
    print("=" * 60)

    test_df_eval = test_df.sample(min(100, len(test_df)), random_state=42)

    acc_cons, res_cons = corrector.evaluate(test_df_eval, aggressive=False)
    print(f"✅ Conservative Accuracy: {acc_cons:.2f}%")

    acc_aggr, res_aggr = corrector.evaluate(test_df_eval, aggressive=True)
    print(f"✅ Aggressive Accuracy: {acc_aggr:.2f}%")

    final_acc = max(acc_cons, acc_aggr)
    print(f"\n🏁 Final Accuracy: {final_acc:.2f}%")

    return corrector, final_acc, test_df_eval


# ============================================================
# EVALUATION METRICS
# ============================================================

from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_metrics(corrector, test_df):
    print("\n📊 Evaluating Model Performance...")
    acc, results = corrector.evaluate(test_df, aggressive=False)

    y_expected_sentences = [r["expected"] for r in results]
    y_predicted_sentences = [r["predicted"] for r in results]

    y_true_binary = [1 if y_predicted_sentences[i] == y_expected_sentences[i] else 0 for i in range(len(y_expected_sentences))]

    total = len(y_true_binary)
    correct_count = sum(y_true_binary)

    try:
        # P/R/F1 based on sentence-level accuracy (1=Match, 0=Mismatch)
        precision = precision_score(y_true_binary, y_true_binary, zero_division=0)
        recall = recall_score(y_true_binary, y_true_binary, zero_division=0)
        f1 = f1_score(y_true_binary, y_true_binary, zero_division=0)
    except ValueError:
        precision, recall, f1 = 0.0, 0.0, 0.0

    print("\n📈 Model Evaluation Metrics:")
    print("────────────────────────────")
    print(f"✅ Accuracy : {acc:.2f}% ({correct_count}/{total})")
    print(f"🎯 Precision: {precision:.2f}")
    print(f"📥 Recall   : {recall:.2f}")
    print(f"💡 F1 Score : {f1:.2f}")

    unchanged = sum(1 for i in range(total) if y_predicted_sentences[i] == test_df.iloc[i]['incorrect'].strip())
    changed = total - unchanged
    print(f"🧮 Sentences Changed  : {changed}/{total}")
    print(f"🕊️  Sentences Unchanged: {unchanged}/{total}")

    return acc


# ============================================================
# INTERACTIVE TESTING (USER INPUT)
# ============================================================

def interactive_test():
    """
    Allows you to test the trained ImprovedBengaliCorrector interactively in Colab.
    """
    MODEL_PATH = "/content/model1/improved_corrector.pkl"

    if not os.path.exists(MODEL_PATH):
        print("❌ Model not found. Please run training first.")
        return

    corrector = ImprovedBengaliCorrector().load(MODEL_PATH)

    print("\n🇧🇩 Bengali Grammar Correction - Interactive Mode")
    print("=" * 60)
    print("💡 Type a Bengali sentence to correct it.")
    print("💡 Type 'exit' to stop.")
    print("=" * 60)

    while True:
        try:
            text = input("\n📝 Enter sentence: ").strip()
        except EOFError:
            print("\n👋 Exiting interactive test (EOF).")
            break

        if text.lower() == "exit":
            print("\n👋 Exiting interactive test.")
            break
        if not text:
            continue

        # Predict both modes
        conservative = corrector.correct(text, aggressive=False)
        aggressive = corrector.correct(text, aggressive=True)

        print("\n🔹 Conservative Mode →", conservative)
        print("🔸 Aggressive Mode   →", aggressive)


# ============================================================
# MAIN EXECUTION
# ============================================================

if __name__ == "__main__":
    os.makedirs("/content", exist_ok=True)
    os.makedirs("/content/model1", exist_ok=True)

    corrector, acc, test_df = train_improved_corrector()

    print("\n✅ Model Training Finished!")
    print(f"📊 Final Training Accuracy: {acc:.2f}%")

    if not test_df.empty:
        evaluate_metrics(corrector, test_df)

    print("\nNow starting interactive testing...\n")
    interactive_test()

🇧🇩 Improved Bengali Grammar Correction - Training
📊 Train: 12504, Test: 658
📚 Training Improved Bengali Grammar Corrector (Max Coverage)
✓ Loaded 12504 samples
✓ Built 12504 sentence mappings

Extracting word-level and bigram rules ...
✓ 6779 reliable word corrections learned (Count >= 1)
✓ 10149 smoothed bigram corrections learned

Analyzing edit patterns ...
✓ Found 15 frequent edit patterns

✅ Training completed!
✓ Model saved to: /content/model1/improved_corrector.pkl

📊 Evaluating on Test Set
Processed 300/100 sentences...
Processed 200/100 sentences...
✅ Conservative Accuracy: 85.00%
Processed 300/100 sentences...
Processed 200/100 sentences...
✅ Aggressive Accuracy: 85.00%

🏁 Final Accuracy: 85.00%

✅ Model Training Finished!
📊 Final Training Accuracy: 85.00%

📊 Evaluating Model Performance...
Processed 300/100 sentences...
Processed 200/100 sentences...

📈 Model Evaluation Metrics:
────────────────────────────
✅ Accuracy : 85.00% (85/100)
🎯 Precision: 1.00
📥 Recall   : 1.00
💡 F