# Dictionary Generation (e.g., Sanskrit - German)

In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!mkdir dic_xml
!mkdir dic_html

# Make sure that you have the uploaded the dictionary xml files. Adjust the below paths accordingly
!cp /content/drive/MyDrive/iith-assignments/word-dichtml-mapping.xml .
!cp /content/drive/MyDrive/iith-assignments/dic_xml/*.xml /content/dic_xml # Note: CCS is sourced from https://www.sanskrit-lexicon.uni-koeln.de/scans/CCSScan/2020/web/webtc/download.html
!pip install indic-transliteration
!pip3 install devtrans


In [None]:
import xml.etree.ElementTree as ET
import devtrans
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate

from abc import ABC, abstractmethod

# Html Dictionary Generator interface
class DicHtmlGenerator(ABC):
    """Interface for generators that turn dictionary XML files into HTML.

    Every method is abstract, so a subclass must override all five before it
    can be instantiated. The signatures here only fix the method names; the
    concrete generator in this file gives ``gen_html_from_xmldic`` an extra
    argument (the path of the XML file to convert).
    """

    @abstractmethod
    def gen_html_from_xmldic(self):
        """Generate HTML from a dictionary XML file."""

    @abstractmethod
    def load_word_dichtml_mapping_file(self):
        """Load the word -> HTML-file mapping."""

    @abstractmethod
    def wrapper_loop_xmldic_for_html_gen(self):
        """Loop over the dictionary XML files and generate HTML for each."""

    @abstractmethod
    def zip_and_download_htmldic(self):
        """Zip the generated HTML dictionary and download it."""

    @abstractmethod
    def clean_htmldic_folder(self):
        """Clean up the HTML dictionary folder."""

# Concrete class
class CcsDicHtmlGenerator(DicHtmlGenerator):
    """Generate HTML dictionary pages from dictionary XML files.

    For every ``<H1>`` entry of an XML file one ``<p>...</p>`` fragment is
    appended to an HTML file in the HTML folder, and one ``<item>`` record is
    appended to the word -> HTML-file mapping file. Headwords referenced in an
    entry's ``<s>`` elements become hyperlinks when they are present in the
    mapping loaded by ``load_word_dichtml_mapping_file``.
    """

    def __init__(self, dic_xml_folder_path, dic_html_folder_path, word_dichtml_mapping_file):
        """Store the folder/file locations; nothing is read or written here.

        Args:
            dic_xml_folder_path: folder holding the source XML files,
                e.g. "/content/dic_xml/".
            dic_html_folder_path: folder the HTML files are written to,
                e.g. "/content/dic_html/".
            word_dichtml_mapping_file: path of the XML mapping file.
        """
        # keyvel (output of devtrans.wx2vel for a headword) -> HTML file name
        self.result = {}
        self.dic_xml = dic_xml_folder_path
        self.dic_html = dic_html_folder_path
        self.word_dichtml_mapping_file = word_dichtml_mapping_file

    # 1. overriding abstract method
    def load_word_dichtml_mapping_file(self):
        """Fill ``self.result`` from the ``<item>`` children of the mapping file.

        Each item contributes ``keyvel -> filename``. Items where either value
        is missing or empty are skipped.

        Raises:
            xml.etree.ElementTree.ParseError: if the file is not well-formed
                XML with a single root element.
        """
        root = ET.parse(self.word_dichtml_mapping_file).getroot()
        for item in root.findall('./item'):
            keyvel = item.findtext('keyvel')
            html_name = item.findtext('filename')
            if not keyvel or not html_name:
                continue
            self.result[keyvel] = html_name

    # 2. cleanup the dic folders
    def clean_htmldic_folder(self):
        """Delete the ``*.html`` files in the HTML folder and the mapping file.

        Both are opened in append mode by ``gen_html_from_xmldic``, so output
        of an earlier run has to be removed first to avoid duplicated entries.
        A missing mapping file is not an error.
        """
        import contextlib
        import glob
        import os

        for html_path in glob.glob(os.path.join(self.dic_html, "*.html")):
            os.remove(html_path)
        with contextlib.suppress(FileNotFoundError):
            os.remove(self.word_dichtml_mapping_file)

    def __read_between_delimiters(self, text, delimiter1, delimiter2):
        """Return the part of ``text`` between ``delimiter1`` and ``delimiter2``.

        The search for ``delimiter2`` starts after the first ``delimiter1``.
        When ``delimiter2`` is absent, ``"</body>"`` is used as the end marker
        instead. Returns ``None`` when ``delimiter1`` is not found or when
        neither end marker follows it.
        """
        start = text.find(delimiter1)
        if start == -1:
            return None
        start += len(delimiter1)
        end = text.find(delimiter2, start)
        if end == -1:
            end = text.find("</body>", start)
            if end == -1:
                return None
        return text[start:end]

    def _build_interlinks(self, item, body_xml):
        """Build the cross-reference markup for the ``<s>`` elements of an entry.

        Each ``body/s`` text is converted with ``devtrans.wx2vel``. If the
        converted key is in ``self.result`` it becomes a hyperlink to
        ``<file>#<key>``; otherwise it is emitted as plain italic text. In both
        cases it is followed by the serialized body text that comes after it
        (up to the next ``<s>``), with ``</s>`` removed.
        """
        parts = []
        for s_elem in item.findall('body/s'):
            raw_key = s_elem.text
            if not raw_key:
                # <s/> or an <s> that only holds child elements: nothing to link
                continue
            vel_key = devtrans.wx2vel(raw_key)
            # None means the text could not be located in the serialized body
            # (e.g. it was escaped during serialization); treat it as empty.
            following = self.__read_between_delimiters(body_xml, "<s>" + raw_key, "<s>") or ""
            cleaned = following.replace("</s>", "")
            if vel_key in self.result:
                parts.append(
                    '[<i><a class="Blue" href="' + self.result[vel_key] + '#' + vel_key + '">'
                    + vel_key + '</a></i>]' + cleaned
                )
            elif following.strip():
                parts.append('[<i>' + vel_key + '</i>]' + " " + cleaned)
        return "".join(parts)

    @staticmethod
    def _link_gender_markers(html, key1):
        """Turn ``<i>m.</i>``/``<i>f.</i>``/``<i>n.</i>`` (with or without the dot) into declension links."""
        # e.g. https://sanskrit.uohyd.ac.in/cgi-bin/SKT/sktdeclin?q=raama;g=Mas;font=roma
        # NOTE(review): key1 is lower-cased for the query string - confirm the
        # CGI expects that form of the key.
        for marker, gender in (("m", "Mas"), ("f", "Fem"), ("n", "Neu")):
            link = (
                '<i><a class="Red" href="/cgi-bin/SKT/sktdeclin?q=' + key1.lower()
                + '&amp;g=' + gender + ';font=roma">' + marker + '.</a></i>'
            )
            html = html.replace("<i>" + marker + ".</i>", link)
            html = html.replace("<i>" + marker + "</i>", link)
        return html

    # 3. parse xml and generate html
    def gen_html_from_xmldic(self, filename):
        """Convert one dictionary XML file to HTML and extend the mapping file.

        The HTML file is written to ``self.dic_html`` under the XML file's base
        name with an ``.html`` extension. Both the HTML file and the mapping
        file are opened in append mode and written once, after all entries of
        the XML file have been converted.

        Args:
            filename: path of the XML file to convert.

        Raises:
            xml.etree.ElementTree.ParseError: if ``filename`` is not
                well-formed XML.
            AttributeError: if an ``<H1>`` entry lacks ``h/key1`` or ``body``.
        """
        import os

        root = ET.parse(filename).getroot()

        # Loop-invariant: every entry of this XML file goes to the same HTML file.
        stem = os.path.splitext(os.path.basename(filename))[0]
        html_name = stem + ".html"
        html_path = os.path.join(self.dic_html, html_name)

        html_entries = []
        mapping_items = []
        for item in root.findall('./H1'):
            key1 = item.find('h').find('key1').text
            body = item.find('body')
            first_i = body.find('i')
            i_text = (first_i.text or '') if first_i is not None else ''
            body_xml = ET.tostring(body, encoding='unicode')

            # key --> BaNgi
            # vel format --> bha.ngi
            # dev format -->  भङ्गि
            key1_iast = devtrans.wx2iast(key1)
            key1_vel = devtrans.wx2vel(key1)
            # NOTE(review): the Devanagari form uses slp2dev while the other
            # conversions use wx2* functions - confirm which scheme key1 is in.
            key1_dev = devtrans.slp2dev(key1, True)

            anchor = "<a name=" + "'" + key1_vel + "'></a>"
            anchor_pr = "<a name=" + "'" + key1_vel + "_pr'></a>"
            deva_span = '<span class="Deva">' + key1 + '</span>'
            iast_link = '[<i><a class="Blue">' + key1_iast + '</a></i>]'
            interlinks = self._build_interlinks(item, body_xml)

            entry_html = " ".join(
                ['<p>', key1_dev, anchor, anchor_pr, deva_span, i_text, iast_link, interlinks]
            ) + '</p>'
            html_entries.append(self._link_gender_markers(entry_html, key1))

            mapping_items.append(
                "<item>" + "<key>" + key1 + "</key>" + "<keyvel>" + key1_vel + "</keyvel>"
                + "<keyskt>" + key1_dev + "</keyskt>" + "<filename>" + html_name + "</filename>"
                + "</item>"
            )

        if not html_entries:
            return

        # utf-8 is given explicitly because the entries contain Devanagari text.
        with open(html_path, 'a', encoding='utf-8') as f:
            f.write("".join(html_entries))

        # NOTE: only bare <item> elements are appended; the file needs a single
        # enclosing root element before ET.parse can load it again.
        with open(self.word_dichtml_mapping_file, 'a', encoding='utf-8') as f:
            f.write("".join(mapping_items))

    # 4. Wrapper - loop through the dic (xml) files and call html gen method.
    def wrapper_loop_xmldic_for_html_gen(self):
        """Call ``gen_html_from_xmldic`` for every regular file in the XML folder."""
        import os

        for name in sorted(os.listdir(self.dic_xml)):
            xml_path = os.path.join(self.dic_xml, name)
            # Skip sub-directories and other non-regular entries
            if os.path.isfile(xml_path):
                self.gen_html_from_xmldic(xml_path)

    # 5. zip and download the final html dictionary
    def zip_and_download_htmldic(self):
        """Zip the ``*.html`` files of the HTML folder into ``dic_html.zip`` and download it.

        The archive is created in the current working directory, replacing any
        existing ``dic_html.zip``. The download uses ``google.colab.files`` and
        therefore only works inside Colab.
        """
        import glob
        import os
        import zipfile

        archive_name = "dic_html.zip"
        html_paths = sorted(glob.glob(os.path.join(self.dic_html, "*.html")))
        with zipfile.ZipFile(archive_name, "w", compression=zipfile.ZIP_DEFLATED) as archive:
            for html_path in html_paths:
                # Default arcname: the path itself without leading separators
                archive.write(html_path)

        from google.colab import files
        files.download(archive_name)



# Run the whole pipeline. The order of the calls matters.
# Arguments: XML source folder, HTML output folder, word -> HTML-file mapping file.
CcsDic=CcsDicHtmlGenerator("/content/dic_xml/", "/content/dic_html/", "/content/word-dichtml-mapping.xml")
# Read the existing mapping into memory first: the cleanup step below deletes the file on disk.
CcsDic.load_word_dichtml_mapping_file()
# Cleanup step; among other things it removes /content/word-dichtml-mapping.xml.
CcsDic.clean_htmldic_folder()
# Convert every file in the XML folder; this appends to the HTML files and to the mapping file.
CcsDic.wrapper_loop_xmldic_for_html_gen()
# Bundle the generated HTML files into dic_html.zip and trigger the Colab download.
CcsDic.zip_and_download_htmldic()



In [None]:
def create_dict(mapping_file='/content/word-dichtml-mapping.xml'):
    """Build, print and return the keyvel -> filename dictionary of a mapping file.

    Args:
        mapping_file: path of the XML mapping file; its root element is
            expected to have ``<item>`` children that each contain a
            ``<keyvel>`` and a ``<filename>`` element.

    Returns:
        dict mapping each item's ``keyvel`` text to its ``filename`` text.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        AttributeError: if an item lacks ``keyvel`` or ``filename``.
    """
    # Load the XML file
    root = ET.parse(mapping_file).getroot()

    # A later item with the same keyvel overrides an earlier one
    result = {
        item.find('keyvel').text: item.find('filename').text
        for item in root.findall('./item')
    }

    # Print the dictionary
    print(result)
    return result

# create_dict()