In [1]:
import json
import re
import pandas as pd
import sqlite3 as SQL
import requests

from indic_transliteration.sanscript import transliterate, IAST, DEVANAGARI, HK

In [2]:
df = pd.read_csv('../data/eng-hin/eng-hin-tabfile.txt', sep='\t')

In [3]:
df.head(10)

Unnamed: 0,##name,eng-hin.index
0,##sourceLang,English
1,##targetLang,Hindi
2,00databasealphabet,12abcdefghijklmnopqrstuvwxyz<br />
3,00databasedictfmt1121,00-database-dictfmt-1.12.1<br />
4,00databaseinfo,English-Hindi FreeDict Dictionary<br /><br />M...
5,00databaseshort,English-Hindi FreeDict Dictionary ver. 1.6<br />
6,00databaseurl,unknown<br />
7,00databaseutf8,<br />
8,a,"a /ˈeɪ/ <Det><br />1. एक<br /> ""I bought ..."
9,a feather in ones cap,A feather in one's cap /ɐ fˈɛðəɹ ɪn wˈɒnz kˈap...


In [4]:
dict_df = df[8:].copy()
dict_df.rename({'##name': 'word/phrase', 'eng-hin.index': 'definition'}, axis=1, inplace=True)

In [5]:
def romanized_to_devanagari(word):
    """
    Converts a Romanized Hindi word into Devanagari.
    :param word: Romanized Hindi word (string)
    :return: Devanagari script equivalent (string)
    """
    return transliterate(word, IAST, DEVANAGARI)


def devanagari_to_romanized(word):
    """
    Converts a Devanagari Hindi word into Romanized Hindi.
    :param word: Devanagari script equivalent (string)
    :return: Romanized Hindi word (string)
    """
    return transliterate(word, DEVANAGARI, IAST)

def devanagari_to_romanized_hk(word):
    """
    Converts a Devanagari Hindi word into Romanized Hindi.
    :param word: Devanagari script equivalent (string)
    :return: Romanized Hindi word (string)
    """
    return transliterate(word, DEVANAGARI, HK)


def parse_definition(definition_text):
    # Split input by parts of speech based on <N> and <VT> markers
    # parts = re.split(r'<([NVIT]+)>', definition_text)
    parts = re.split(r'<([NVVIVTAdjAdvPronPrepConjDetNumInterjIDMPhrvAbbrSlExclAuxPlArt]+)>', definition_text)

    # Extract the word and pronunciation
    word_section = parts[0].strip()
    # pattern_word_section = r"^([\w\s]+)\s/([^/]+)/"
    # pattern_word_section = r"^([\w\s]+)\s/([ˈˌaæbddʒefghijklmnŋoɔprstt̬tʃuvwxɐɑɒɑ̃ɚəɜɝɛɪʃʊʌʒθð\s]+)/"
    # pattern_word_section = r"([\w\d\s'.]+)\/([aäɑɒæbḇβcčɔɕçdḏḍðeəɚɛɝfgḡhʰḥḫẖiɪỉɨjʲǰkḳḵlḷɬɫmnŋṇɲɴoŏɸθpp̅þqrɹɾʀʁṛsšśṣʃtṭṯʨtʂuʊŭüvʌɣwʍxχyʸʎzẓžʒ’‘ʔʕ\s]+)\/"
    pattern_word_section = r"([\w\d\s'.\-()~]+)\/(.+)\/"
    word, pronunciation = re.match(
        pattern_word_section, word_section).groups()

    word = word.strip()

    # Initialize result structure
    result = {
        "word": word,
        "pronunciation": pronunciation,
        "entries": []
    }

    # Process each POS section
    for i in range(1, len(parts) - 1, 2):
        part_of_speech = parts[i].strip()
        raw_definitions = parts[i + 1].strip()

        # Map POS markers to human-readable forms
        pos_map = {
            "N": "Noun",
            "V": "Verb",
            "VI": "Verb (Intransitive)",
            "VT": "Verb (Transitive)",
            "Adj": "Adjective",
            "Adv": "Adverb",
            "Pron": "Pronoun",
            "Prep": "Preposition",
            "Conj": "Conjunction",
            "Det": "Determiner",
            "Num": "Numeral",
            "Interj": "Interjection",
            "IDM": "Idiom",
            "Phrv": "Phrasal Verb",
            "Abbr": "Abbreviation",
            "Sl": "Slang",
            "Excl": "Exclamation",
            "Aux": "Auxiliary Verb",
            "Pl": "Plural",
            "Art": "Article"
        }
        pos_name = pos_map.get(part_of_speech, part_of_speech)

        # Parse definitions
        definitions = []
        for entry in re.split(r'\d+\.\s', raw_definitions)[1:]:
            match = re.match(r'([^<]+)<br\s*/>\s*"([^"]+)"', entry, re.DOTALL)
            if match:
                hindi, example = match.groups()
                hindi_word = hindi.strip().replace('~', ' ')
                definitions.append({
                    "hindi": hindi_word,
                    "romanized_iast": devanagari_to_romanized(hindi_word),
                    "romanized_hk": devanagari_to_romanized_hk(hindi_word),
                    "example": example.strip()
                })

        # Add parsed data to the entries
        result["entries"].append({
            "part_of_speech": pos_name,
            "definitions": definitions
        })

    return result

In [6]:
all_words = {}

word = ''
unparsed_words = []
for row in dict_df.itertuples():
    index = row[0]
    word = row[1]
    data = row[2]
    try:
        parsed_data = parse_definition(data)
        all_words[word] = parsed_data
    except Exception as e:
        # print(e)
        parts = re.split(r'<([NVVIVTAdjAdvPronPrepConjDetNumInterjIDMPhrvAbbrSlExclAuxPlArt]+)>', data)
        word_section = parts[0].strip()
        print(word_section)
        unparsed_words.append(word)

with open('../data/eng-hin/eng-hin-parsed.json', 'w') as f:
    f.write(json.dumps(all_words))
    print(len(list(all_words.keys())))

with open('../data/eng-hin/eng-hin-unparsed.json', 'w') as f:
    f.write(json.dumps(unparsed_words))
    print(len(unparsed_words))

flag[stone] /flˈaɡ stˈəʊn/
keep in with <a href="bword://infl">infl</a> /kˈiːp ɪn wɪð ˈɪnfəl/
not care\\give a fig /nˌɒt kˈeə bˈakslaʃ ɡˈɪv ɐ fˈɪɡ/
pell-?????? /pˈɛl/
raise (US=rise) /ɹˈeɪz jˌuːˈɛs ˈiːkwəlz ɹˈaɪz/
tu-whit,tu-whoo /tˈuːwˈɪt tˈuːwˈuː/
wellington[boot] /wˈɛlɪŋtən bˈuːt/
22864
7


In [7]:
print("dak\u1e63i\u1e47\u012b aphr\u012bk\u0101 k\u012b janaj\u0101t\u012b")

dakṣiṇī aphrīkā kī janajātī


In [8]:
def convert_to_hindi_to_english_dict(english_to_hindi_dict):
    hindi_to_english_dict = {}
    hindi_to_english_dict_hk = {}

    for entry in english_to_hindi_dict.values():
        word = entry["word"]
        for pos_entry in entry["entries"]:
            for definition in pos_entry["definitions"]:
                # Add Hindi words as keys
                for hindi_word in definition["hindi"].split(", "):  # Split multiple Hindi meanings
                    hindi_word = hindi_word.replace('~', '')
                    if hindi_word not in hindi_to_english_dict:
                        hindi_to_english_dict[hindi_word] = []
                    hindi_to_english_dict[hindi_word].append(word)

                # Add Romanized Harvard-Kyoto words as keys
                for romanized_word in definition["romanized_hk"].split(", "):  # Split multiple romanized words
                    romanized_word = romanized_word.replace('~', '')
                    if romanized_word not in hindi_to_english_dict_hk:
                        hindi_to_english_dict_hk[romanized_word] = []
                    hindi_to_english_dict_hk[romanized_word].append(word)

    return hindi_to_english_dict, hindi_to_english_dict_hk


# Convert to Hindi-to-English dictionary
hindi_to_english_dict, hindi_to_english_dict_hk = convert_to_hindi_to_english_dict(all_words)

# store the result
with open('../data/eng-hin/hi-en-mapping.json', 'w') as f:
    f.write(json.dumps(hindi_to_english_dict))
    
with open('../data/eng-hin/hi-en-hk-mapping.json', 'w') as f:
    f.write(json.dumps(hindi_to_english_dict_hk))

In [9]:
documents = []

hindi_to_english_dict = None
with open('../data/eng-hin/hi-en-mapping.json', 'r') as f:
    hindi_to_english_dict = json.load(f)

hindi_to_english_dict_hk = None
with open('../data/eng-hin/hi-en-hk-mapping.json', 'r') as f:
    hindi_to_english_dict_hk = json.load(f)

print(list(hindi_to_english_dict_hk.keys())[:5])

['eka', 'kaThina kArya ko karanA', 'kucha', 'nirNAyaka lar3AI', 'kArya kI saphalatA kI thor3I AzA']


In [None]:
conn = SQL.connect("../data/english-dictionary-cloudbytes.db")

def get_definition(word):
    try:
        db = conn.cursor()
        query = "SELECT * from entries where word='{}'".format(word)
        db.execute(query)
        output = db.fetchall()
        db.close()
        return [word, [o[1] + " " + o[2] for o in output if o[1]]]
    except Exception as e:
        print(query)
        print(e)


def get_definition_api(word):
    url = 'https://api.dictionaryapi.dev/api/v2/entries/en/' + word
    response = requests.get(url)
    if response.status_code == 200:
        resp_list = response.json()
        resp_dict = resp_list[0]
        return [m['partOfSpeech'] + " " + d['definition'] for m in resp_dict['meanings'] for d in m['definitions']]


def prep_documents(english_to_hindi_dict):
    documents = []
    for hindi_word, english_words in english_to_hindi_dict.items():
        for w in english_words:
            if len(w.split()) == 1:
                definition = get_definition(w)
                if definition:
                    text = hindi_word + " | " + w + " | " + "\n" + "\n".join(definition[1])
                    documents.append(text)
                else:
                    definition = get_definition_api(w)
                    if definition:
                        text = hindi_word + " | " + w + " | " + "\n" + "\n".join(definition)
                        documents.append(text)
    return documents

documents = prep_documents(hindi_to_english_dict_hk)

with open('../data/eng-hin/hi-en-documents.json', 'w') as f:
    f.write(json.dumps({'documents': documents}))

SELECT * from entries where word='hair's-breadth'
near "s": syntax error
SELECT * from entries where word='hors-d'oeuvre'
near "oeuvre": syntax error
SELECT * from entries where word='knacker's-yard'
near "s": syntax error
SELECT * from entries where word='knacker's-yard'
near "s": syntax error
SELECT * from entries where word='ma'am'
near "am": syntax error
SELECT * from entries where word='ma'am'
near "am": syntax error
SELECT * from entries where word='o'clock'
near "clock": syntax error
SELECT * from entries where word='outre''
unrecognized token: "'outre''"
SELECT * from entries where word='piracy'
can only concatenate str (not "NoneType") to str
SELECT * from entries where word='piracy'
can only concatenate str (not "NoneType") to str
SELECT * from entries where word='rapid'transit''
near "transit": syntax error
