<a href="https://colab.research.google.com/github/raj-027/Sanskrit-NLP/blob/main/Phonological_Edit_Distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Computing edit distance

In [13]:
import torch

In [14]:
sanskrit_vowels = {
    "अ": {"ipa": "ɐ", "height": "near-open", "backness": "central", "rounded": False, "length": "short", "diphthong": False},
    "आ": {"ipa": "ɐː", "height": "near-open", "backness": "central", "rounded": False, "length": "long", "diphthong": False},
    "इ": {"ipa": "i", "height": "close", "backness": "front", "rounded": False, "length": "short", "diphthong": False},
    "ई": {"ipa": "iː", "height": "close", "backness": "front", "rounded": False, "length": "long", "diphthong": False},
    "उ": {"ipa": "u", "height": "close", "backness": "back", "rounded": True, "length": "short", "diphthong": False},
    "ऊ": {"ipa": "uː", "height": "close", "backness": "back", "rounded": True, "length": "long", "diphthong": False},
    "ऋ": {"ipa": "r̻ɘ", "height": "close-mid", "backness": "central", "rounded": False, "length": "short", "diphthong": False},
    "ॠ": {"ipa": "r̻ɘː", "height": "close-mid", "backness": "central", "rounded": False, "length": "long", "diphthong": False},
    "ऌ": {"ipa": "l̻ɘ", "height": "close-mid", "backness": "central", "rounded": False, "length": "short", "diphthong": False},
    "ॡ": {"ipa": "l̻ɘː", "height": "close-mid", "backness": "central", "rounded": False, "length": "long", "diphthong": False},
    "ए": {"ipa": "e̞ː", "height": "mid", "backness": "front", "rounded": False, "length": "long", "diphthong": False},
    "ऐ": {"ipa": "ɐ͡iː", "height": "diphthong", "backness": "central-front", "rounded": False, "length": "long", "diphthong": True},
    "ओ": {"ipa": "o̞ː", "height": "mid", "backness": "back", "rounded": True, "length": "long", "diphthong": False},
    "औ": {"ipa": "ɐ͡uː", "height": "diphthong", "backness": "central-back", "rounded": True, "length": "long", "diphthong": True},
    "ः": {"ipa": "h", "height": None, "backness": None, "rounded": None, "length": None, "diphthong": False, "special": "visarga"},
    "ं": {"ipa": "̃", "height": None, "backness": None, "rounded": None, "length": None, "diphthong": False, "special": "anusvara"}
}

# ---- 2. Full Sanskrit consonant inventory (classical) ----
sanskrit_consonants = {
    # Velars
    "क": {"ipa": "k", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "velar"},
    "ख": {"ipa": "kʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "velar"},
    "ग": {"ipa": "g", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "velar"},
    "घ": {"ipa": "gɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "velar"},
    "ङ": {"ipa": "ŋ", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "velar"},
    # Palatals / alveolopalatal (cavarga)
    "च": {"ipa": "t͡ɕ", "manner": "affricate", "voiced": False, "aspirated": False, "passive": "alveolopalatal"},
    "छ": {"ipa": "t͡ɕʰ", "manner": "affricate", "voiced": False, "aspirated": True, "passive": "alveolopalatal"},
    "ज": {"ipa": "d͡ʑ", "manner": "affricate", "voiced": True, "aspirated": False, "passive": "alveolopalatal"},
    "झ": {"ipa": "d͡ʑɦ", "manner": "affricate", "voiced": True, "aspirated": True, "passive": "alveolopalatal"},
    "ञ": {"ipa": "ɲ", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "palatal"},
    # Retroflex
    "ट": {"ipa": "ʈ", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "retroflex"},
    "ठ": {"ipa": "ʈʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "retroflex"},
    "ड": {"ipa": "ɖ", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "retroflex"},
    "ढ": {"ipa": "ɖɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "retroflex"},
    "ण": {"ipa": "ɳ", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "retroflex"},
    "ळ": {"ipa": "ɭ̆", "manner": "lateral_flap", "voiced": True, "aspirated": False, "passive": "retroflex"},
    # Dentals (laminal dental series)
    "त": {"ipa": "t̪", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "dental"},
    "थ": {"ipa": "t̪ʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "dental"},
    "द": {"ipa": "d̪", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "dental"},
    "ध": {"ipa": "d̪ɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "dental"},
    "न": {"ipa": "n̪", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "dental"},
    # Labials
    "प": {"ipa": "p", "manner": "plosive", "voiced": False, "aspirated": False, "passive": "bilabial"},
    "फ": {"ipa": "pʰ", "manner": "plosive", "voiced": False, "aspirated": True, "passive": "bilabial"},
    "ब": {"ipa": "b", "manner": "plosive", "voiced": True, "aspirated": False, "passive": "bilabial"},
    "भ": {"ipa": "bɦ", "manner": "plosive", "voiced": True, "aspirated": True, "passive": "bilabial"},
    "म": {"ipa": "m", "manner": "nasal", "voiced": True, "aspirated": False, "passive": "bilabial"},
    # Approximants / semivowels / liquids
    "य": {"ipa": "j", "manner": "approximant", "voiced": True, "aspirated": False, "passive": "palatal"},
    "र": {"ipa": "ɾ̻", "manner": "flap", "voiced": True, "aspirated": False, "passive": "alveolar"},
    "ल": {"ipa": "l̪", "manner": "lateral_approximant", "voiced": True, "aspirated": False, "passive": "dental"},
    "व": {"ipa": "ʋ", "manner": "approximant", "voiced": True, "aspirated": False, "passive": "labiodental"},
    # Sibilants and fricatives
    "श": {"ipa": "ɕ", "manner": "fricative", "voiced": False, "aspirated": False, "passive": "palatal"},
    "ष": {"ipa": "ʂ", "manner": "fricative", "voiced": False, "aspirated": False, "passive": "retroflex"},
    "स": {"ipa": "s̪", "manner": "fricative", "voiced": False, "aspirated": False, "passive": "dental"},
    "ह": {"ipa": "ɦ", "manner": "fricative", "voiced": True, "aspirated": False, "passive": "glottal"},
    # Marginal affricate symbols (mapped to Devanagari approximations if needed)
    "च̣": {"ipa": "t͡ʃ", "manner": "affricate", "voiced": False, "aspirated": False, "passive": "alveolopalatal"},  # variant
    "ज̣": {"ipa": "d͡ʒ", "manner": "affricate", "voiced": True, "aspirated": False, "passive": "alveolopalatal"}   # variant
}

In [15]:
unified_features = [
    # Core phonation/aspiration/nasality (binary opposites)
    "voiced", "aspirated", "nasal",

    # Manner (consonants) - one-hot
    "plosive", "fricative", "affricate", "approximant", "trill", "flap",
    "lateral_approximant", "lateral_flap",

    # Place of articulation (consonants) - one-hot
    "bilabial", "labiodental", "dental", "alveolar", "retroflex",
    "alveolopalatal", "palatal", "velar", "post_velar", "glottal",



    # Vowel height - binary opposites
    "high", "mid", "low", "close_mid", "near_open",

    # Vowel backness - binary opposites
    "front", "back", "central", "central_front", "central_back",

    # Vowel rounding
    "rounded",

    # Vowel length
    "long",

    # Diphthong
    "diphthong"
]


In [16]:
def phoneme_to_unified(ch):
    vec = []

    if ch in sanskrit_consonants:
        feats = sanskrit_consonants[ch]

        # Binary opposites
        vec.append(+1 if feats.get("voiced") else -1)
        vec.append(+1 if feats.get("aspirated") else -1)
        vec.append(+1 if feats.get("manner") == "nasal" else -1)

        # Manner one-hot
        manner_list = ["plosive", "fricative", "affricate", "approximant", "trill", "flap",
                       "lateral_approximant", "lateral_flap"]
        for m in manner_list:
            vec.append(+1 if feats.get("manner") == m else 0)

        # Place one-hot
        place_list = ["bilabial", "labiodental", "dental", "alveolar", "retroflex",
                      "alveolopalatal", "palatal", "velar", "post_velar", "glottal"]
        for p in place_list:
            vec.append(+1 if feats.get("passive") == p else 0)

        # Vowel features irrelevant here → 0s
        vec.extend([0] * (len(unified_features) - len(vec)))
         # --- Vowel features irrelevant for consonants ---> 14
         # high, mid, low, close_mid, near_open, front, back, central, central_front, central_back, rounded, long, diphthong


    elif ch in sanskrit_vowels:
        feats = sanskrit_vowels[ch]

        # Consonant binary opposites irrelevant
        vec.extend([0, 0, 0])

        # Consonant manners irrelevant
        vec.extend([0] * 8)

        # Consonant places irrelevant
        vec.extend([0] * 10)



        # Vowel height
         # --- Vowel height (coarse + fine) ---
        h = (feats.get("height") or "").lower()
        # Initialize all as irrelevant
        high = mid = low = close_mid = near_open = 0

        if h == "close":
            high, mid, low = +1, -1, -1
            close_mid, near_open = -1, -1
        elif h == "close-mid":
            # fine-grained: close_mid present; coarse: treat as mid-like
            high, mid, low = -1, +1, -1
            close_mid, near_open = +1, -1
        elif h == "mid":
            high, mid, low = -1, +1, -1
            close_mid, near_open = 0, 0
        elif h == "near-open":
            high, mid, low = -1, -1, +1
            close_mid, near_open = -1, +1
        elif h == "diphthong":
            # Sanskrit diphthongs start near [ɐ] → approximate as near-open
            high, mid, low = -1, -1, +1
            close_mid, near_open = -1, +1
        else:
            # Unknown/none → keep 0 (irrelevant)
            pass

        vec.extend([high, mid, low, close_mid, near_open])



        # Backness
       # --- Vowel backness (coarse + blended) ---
        b = (feats.get("backness") or "").lower()
        front = back = central = central_front = central_back = 0

        if "central" in b and "front" in b:
            # blended: central-front
            central, front, back = +1, +1, -1
            central_front, central_back = +1, -1
        elif "central" in b and "back" in b:
            # blended: central-back
            central, back, front = +1, +1, -1
            central_front, central_back = -1, +1
        elif "front" in b:
            front, back, central = +1, -1, -1
            central_front, central_back = 0, 0
        elif "back" in b:
            back, front, central = +1, -1, -1
            central_front, central_back = 0, 0
        elif "central" in b:
            central, front, back = +1, -1, -1
            central_front, central_back = 0, 0
        else:
            # unknown/none → irrelevant
            pass

        vec.extend([front, back, central, central_front, central_back])




        # Rounded
        if feats.get("rounded") is True:
            vec.append(+1)
        elif feats.get("rounded") is False:
            vec.append(-1)
        else:
            vec.append(0)

        # Long
        if feats.get("length") == "long":
            vec.append(+1)
        elif feats.get("length") == "short":
            vec.append(-1)
        else:
            vec.append(0)

        # Diphthong
        if feats.get("diphthong"):
            vec.append(+1)
        else:
            vec.append(-1)

    else:
        # Unknown phoneme → all zeros
        vec = [0] * len(unified_features)

    # Safety: pad if anything short (shouldn’t happen, but belt-and-suspenders)
    if len(vec) < len(unified_features):
        vec.extend([0] * (len(unified_features) - len(vec)))


    return vec


In [17]:
def word_to_unified_sequence(word):
    return torch.tensor([phoneme_to_unified(ch) for ch in word], dtype=torch.float32)


In [33]:
import numpy as np

def substitution_cost(vec1, vec2, normalized=True):
    """
    Compute phoneme substitution cost between two feature vectors.
    If normalized=True, returns value in [0,1].
    If normalized=False, returns raw L1 distance.
    """
    diff = np.abs(vec1 - vec2)
    if normalized:
        return np.sum(diff) / (2 * len(vec1))   # normalized L1
    else:
        return np.sum(diff)                     # raw L1


def phonological_edit_distance(word1, word2, normalized=True, gap_penalty=1.0):
    seq1 = word_to_unified_sequence(word1).numpy()
    seq2 = word_to_unified_sequence(word2).numpy()

    n, m = len(seq1), len(seq2)
    dp = np.zeros((n+1, m+1))

    for i in range(1, n+1):
        dp[i][0] = i * gap_penalty
    for j in range(1, m+1):
        dp[0][j] = j * gap_penalty

    for i in range(1, n+1):
        for j in range(1, m+1):
            #sub_cost = np.mean(np.abs(seq1[i-1] - seq2[j-1])) / 2  # normalised
            #sub_cost = np.sum(np.abs(seq1[i-1] - seq2[j-1]))  # unnormalized L1
            sub_cost = substitution_cost(seq1[i-1], seq2[j-1], normalized)
            dp[i][j] = min(
                dp[i-1][j] + gap_penalty,       # deletion
                dp[i][j-1] + gap_penalty,       # insertion
                dp[i-1][j-1] + sub_cost         # substitution
            )

    return dp[n][m]


In [35]:
# Normalized cost (∈ [0,1]) — recommended for cross-word comparability
print(phonological_edit_distance("सार्थम्", "ऋग्वेदीय", normalized=True))

# Raw L1 cost (unnormalized, depends on feature dimensionality)
print(phonological_edit_distance("सार्थम्", "ऋग्वेदीय", normalized=False))


1.3382353000342846
9.0


In [31]:
def print_word_vectors(word):
    seq = word_to_unified_sequence(word).numpy()
    print(f"\nWord: {word}")
    for i, vec in enumerate(seq):
        print(f"{word[i]} -> {vec.tolist()}")

def phonological_edit_distance_debug(word1, word2):
    print_word_vectors(word1)
    print_word_vectors(word2)

    dist = phonological_edit_distance(word1, word2)
    print(f"\nPhonological edit distance ({word1} vs {word2}): {dist}")




In [32]:
phonological_edit_distance_debug(word1="सार्थम्", word2="ऋग्वेदीय")
# नासदीय सूक्तम् ऋग्वेदीय सार्थम्


Word: सार्थम्
स -> [-1.0, -1.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
ा -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
र -> [1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
् -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
थ -> [-1.0, 1.0, -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
म -> [1.0, -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0