<a href="https://colab.research.google.com/github/raj-027/Sanskrit-NLP/blob/main/Phonological_Edit_Distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Computing edit distance

In [8]:
import torch

In [9]:
sanskrit_vowels = {
    "अ": {"ipa": "ɐ", "height": "near-open", "backness": "central", "rounded": False, "length": "short", "diphthong": False},
    "आ": {"ipa": "ɐː", "height": "near-open", "backness": "central", "rounded": False, "length": "long", "diphthong": False},
    "इ": {"ipa": "i", "height": "close", "backness": "front", "rounded": False, "length": "short", "diphthong": False},
    "ई": {"ipa": "iː", "height": "close", "backness": "front", "rounded": False, "length": "long", "diphthong": False},
    "उ": {"ipa": "u", "height": "close", "backness": "back", "rounded": True, "length": "short", "diphthong": False},
    "ऊ": {"ipa": "uː", "height": "close", "backness": "back", "rounded": True, "length": "long", "diphthong": False},
    "ऋ": {"ipa": "r̻ɘ", "height": "close-mid", "backness": "central", "rounded": False, "length": "short", "diphthong": False},
    "ॠ": {"ipa": "r̻ɘː", "height": "close-mid", "backness": "central", "rounded": False, "length": "long", "diphthong": False},
    "ऌ": {"ipa": "l̻ɘ", "height": "close-mid", "backness": "central", "rounded": False, "length": "short", "diphthong": False},
    "ॡ": {"ipa": "l̻ɘː", "height": "close-mid", "backness": "central", "rounded": False, "length": "long", "diphthong": False},
    "ए": {"ipa": "e̞ː", "height": "mid", "backness": "front", "rounded": False, "length": "long", "diphthong": False},
    "ऐ": {"ipa": "ɐ͡iː", "height": "diphthong", "backness": "central-front", "rounded": False, "length": "long", "diphthong": True},
    "ओ": {"ipa": "o̞ː", "height": "mid", "backness": "back", "rounded": True, "length": "long", "diphthong": False},
    "औ": {"ipa": "ɐ͡uː", "height": "diphthong", "backness": "central-back", "rounded": True, "length": "long", "diphthong": True},
    "ः": {"ipa": "h", "height": None, "backness": None, "rounded": None, "length": None, "diphthong": False, "special": "visarga"},
    "ं": {"ipa": "̃", "height": None, "backness": None, "rounded": None, "length": None, "diphthong": False, "special": "anusvara"}
}

# ---- 2. Full Sanskrit consonant inventory (classical) ----
sanskrit_consonants = {
    # Velars
    "क": {"ipa": "k", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "velar"},
    "ख": {"ipa": "kʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "velar"},
    "ग": {"ipa": "g", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "velar"},
    "घ": {"ipa": "gɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "velar"},
    "ङ": {"ipa": "ŋ", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "velar"},
    # Palatals / alveolopalatal (cavarga)
    "च": {"ipa": "t͡ɕ", "manner": "affricate", "voiced": False, "aspirated": False, "passive": "alveolopalatal"},
    "छ": {"ipa": "t͡ɕʰ", "manner": "affricate", "voiced": False, "aspirated": True, "passive": "alveolopalatal"},
    "ज": {"ipa": "d͡ʑ", "manner": "affricate", "voiced": True, "aspirated": False, "passive": "alveolopalatal"},
    "झ": {"ipa": "d͡ʑɦ", "manner": "affricate", "voiced": True, "aspirated": True, "passive": "alveolopalatal"},
    "ञ": {"ipa": "ɲ", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "palatal"},
    # Retroflex
    "ट": {"ipa": "ʈ", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "retroflex"},
    "ठ": {"ipa": "ʈʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "retroflex"},
    "ड": {"ipa": "ɖ", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "retroflex"},
    "ढ": {"ipa": "ɖɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "retroflex"},
    "ण": {"ipa": "ɳ", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "retroflex"},
    "ळ": {"ipa": "ɭ̆", "manner": "lateral_flap", "voiced": True, "aspirated": False, "passive": "retroflex"},
    # Dentals (laminal dental series)
    "त": {"ipa": "t̪", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "dental"},
    "थ": {"ipa": "t̪ʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "dental"},
    "द": {"ipa": "d̪", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "dental"},
    "ध": {"ipa": "d̪ɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "dental"},
    "न": {"ipa": "n̪", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "dental"},
    # Labials
    "प": {"ipa": "p", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "bilabial"},
    "फ": {"ipa": "pʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "bilabial"},
    "ब": {"ipa": "b", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "bilabial"},
    "भ": {"ipa": "bɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "bilabial"},
    "म": {"ipa": "m", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "bilabial"},
    # Approximants / semivowels / liquids
    "य": {"ipa": "j", "manner": "approximant", "voiced": True, "aspirated": False, "passive": "palatal"},
    "र": {"ipa": "ɾ̻", "manner": "flap", "voiced": True, "aspirated": False, "passive": "alveolar"},
    "ल": {"ipa": "l̪", "manner": "lateral_approximant", "voiced": True, "aspirated": False, "passive": "dental"},
    "व": {"ipa": "ʋ", "manner": "approximant", "voiced": True, "aspirated": False, "passive": "labiodental"},
    # Sibilants and fricatives
    "श": {"ipa": "ɕ", "manner": "fricative", "voiced": False, "aspirated": False, "passive": "palatal"},
    "ष": {"ipa": "ʂ", "manner": "fricative", "voiced": False, "aspirated": False, "passive": "retroflex"},
    "स": {"ipa": "s̪", "manner": "fricative", "voiced": False, "aspirated": False, "passive": "dental"},
    "ह": {"ipa": "ɦ", "manner": "fricative", "voiced": True, "aspirated": False, "passive": "glottal"},
    # Marginal affricate symbols (mapped to Devanagari approximations if needed)
    "च̣": {"ipa": "t͡ʃ", "manner": "affricate", "voiced": False, "aspirated": False, "passive": "alveolopalatal"},  # variant
    "ज̣": {"ipa": "d͡ʒ", "manner": "affricate", "voiced": True, "aspirated": False, "passive": "alveolopalatal"}   # variant
}

In [10]:
# Unified PanPhon-style feature inventory
unified_features = [
    # Core phonation/aspiration/nasality (binary opposites)
    "voiced", "aspirated", "nasal",

    # Manner (consonants) - one-hot
    "plosive", "fricative", "affricate", "approximant", "trill", "flap",
    "lateral_approximant", "lateral_flap",

    # Place of articulation (consonants) - one-hot
    "bilabial", "labiodental", "dental", "alveolar", "retroflex",
    "alveolopalatal", "palatal", "velar", "post_velar", "glottal",

    # Vowel height - binary opposites
    "high", "mid", "low",

    # Vowel backness - binary opposites
    "front", "back", "central",

    # Vowel rounding
    "rounded",

    # Vowel length
    "long",

    # Diphthong
    "diphthong"
]


In [11]:
def phoneme_to_unified(ch):
    vec = []

    if ch in sanskrit_consonants:
        feats = sanskrit_consonants[ch]

        # Binary opposites
        vec.append(+1 if feats.get("voiced") else -1)
        vec.append(+1 if feats.get("aspirated") else -1)
        vec.append(+1 if feats.get("manner") == "nasal" else -1)

        # Manner one-hot
        manner_list = ["plosive", "fricative", "affricate", "approximant", "trill", "flap",
                       "lateral_approximant", "lateral_flap"]
        for m in manner_list:
            vec.append(+1 if feats.get("manner") == m else 0)

        # Place one-hot
        place_list = ["bilabial", "labiodental", "dental", "alveolar", "retroflex",
                      "alveolopalatal", "palatal", "velar", "post_velar", "glottal"]
        for p in place_list:
            vec.append(+1 if feats.get("passive") == p else 0)

        # Vowel features irrelevant here → 0s
        vec.extend([0] * (len(unified_features) - len(vec)))

    elif ch in sanskrit_vowels:
        feats = sanskrit_vowels[ch]

        # Consonant binary opposites irrelevant
        vec.extend([0, 0, 0])

        # Consonant manners irrelevant
        vec.extend([0] * 8)

        # Consonant places irrelevant
        vec.extend([0] * 10)

        # Vowel height
        height = feats.get("height")
        vec.append(+1 if height == "close" else -1 if height in ["mid", "near-open"] else 0)  # high
        vec.append(+1 if height in ["mid", "close-mid"] else -1 if height == "close" else 0)  # mid
        vec.append(+1 if height == "near-open" else -1 if height in ["close", "mid"] else 0)  # low

        # Backness
        backness = feats.get("backness") or ""
        vec.append(+1 if "front" in backness else -1 if "back" in backness else 0)   # front
        vec.append(+1 if "back" in backness else -1 if "front" in backness else 0)   # back
        vec.append(+1 if "central" in backness else -1 if any(x in backness for x in ["front", "back"]) else 0)  # central

        # Rounded
        if feats.get("rounded") is True:
            vec.append(+1)
        elif feats.get("rounded") is False:
            vec.append(-1)
        else:
            vec.append(0)

        # Long
        if feats.get("length") == "long":
            vec.append(+1)
        elif feats.get("length") == "short":
            vec.append(-1)
        else:
            vec.append(0)

        # Diphthong
        if feats.get("diphthong"):
            vec.append(+1)
        else:
            vec.append(-1)

    else:
        # Unknown phoneme → all zeros
        vec = [0] * len(unified_features)

    return vec


In [12]:
# converting word to unified sequence
def word_to_unified_sequence(word):
    return torch.tensor([phoneme_to_unified(ch) for ch in word], dtype=torch.float32)


In [16]:
import numpy as np

def phonological_edit_distance(word1, word2):
    seq1 = word_to_unified_sequence(word1).numpy()
    seq2 = word_to_unified_sequence(word2).numpy()

    n, m = len(seq1), len(seq2)
    dp = np.zeros((n+1, m+1))

    # Gap penalties — insertion or deletion cost
    g = 1.0

    for i in range(1, n+1):
        dp[i][0] = i * g
    for j in range(1, m+1):
        dp[0][j] = j * g

    for i in range(1, n+1):
        for j in range(1, m+1):
            sub_cost = np.mean(np.abs(seq1[i-1] - seq2[j-1])) / 2  # normalised
            dp[i][j] = min(
                dp[i-1][j] + g,       # deletion
                dp[i][j-1] + g,       # insertion
                dp[i-1][j-1] + sub_cost         # substitution
            )

    return dp[n][m]


In [17]:
def print_word_vectors(word):
    seq = word_to_unified_sequence(word).numpy()
    print(f"\nWord: {word}")
    for i, vec in enumerate(seq):
        print(f"{word[i]} -> {vec.tolist()}")

def phonological_edit_distance_debug(word1, word2):
    print_word_vectors(word1)
    print_word_vectors(word2)

    dist = phonological_edit_distance(word1, word2)
    print(f"\nPhonological edit distance ({word1} vs {word2}): {dist}")




In [18]:
phonological_edit_distance_debug(word1="नासदीय", word2="ऋग्वेदीय")
# नासदीय सूक्तम् (ऋग्वेदीय सार्थम्)


Word: नासदीय
न -> [1.0, -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
ा -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
स -> [-1.0, -1.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
द -> [1.0, -1.0, -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
ी -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
य -> [1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

Word: ऋग्वेदीय
ऋ -> [0.0, 0.0, 0.0, 0.0, 