In [1]:
# notebook for settings generation

In [119]:
import urdubiometer as ubm
from graphtransliterator import GraphTransliterator
# Load the three transliterators (Urdu script, scholarly romanization,
# Devanagari) from their YAML definitions, in that order.
urduGT, scholarlyGT, hindiGT = (
    GraphTransliterator.from_yaml_file(yaml_path)
    for yaml_path in (
        "../../transliterate/urdu.yml",
        "../../transliterate/transliteration.yml",
        "../../transliterate/devanagari.yaml",
    )
)

In [120]:
# Verse lines in the ASCII romanization that the transliterators and the
# scanner in the following cells take as input; one list entry per line.
lines = '''bol kih lab aazaad hai;n tere
bol zabaa;n ab tak terii hai
teraa sutvaa;n jism hai teraa
bol kih jaa;n ab tak terii hai
dekh kih aahangar kii dukaa;n me;n
tund hai;n shole sur;x hai aahan
khulne lage quflo;n ke dahaane
phailaa har ik zanjiir kaa daaman
bol yah tho;raa vaqt bahut hai
jism-o-zabaa;n kii maut se pahle
bol kih sach zindah hai ab tak
bol jo kuchh kahnaa hai kah le'''.splitlines()

# English rendering, one entry per verse line.
# splitlines() is used instead of split("\n") because the literal ends with a
# newline before the closing quotes; split("\n") would append an empty 13th
# entry and leave this list out of step with `lines`.
lines_en = '''Speak, for your lips are free
Speak, for your tongue is still yours
Your long-suffering body is yours
Speak, for your life is still your own
Speak, for in the blacksmith's shop
The flames are fierce, the iron red
The mouths of locks have begun to open
The skirt of every chain is outspread
Speak, this little time is enough
Before the death of the body and tongue
Speak, for truth is still alive
Speak, say what must be said
'''.splitlines()

# The two lists are consumed in parallel, so they must stay aligned.
assert len(lines_en) == len(lines), "translation and source line counts differ"

# Run every verse line through each transliterator, keeping list order.
lines_ur = list(map(urduGT.transliterate, lines))
lines_trans = list(map(scholarlyGT.transliterate, lines))
lines_hi = list(map(hindiGT.transliterate, lines))

In [121]:
def vocalize(line):
    """Rebuild `line` from its urduGT tokens, caret-prefixing bare vowel tokens.

    The line is tokenized with ``urduGT.tokenize``. A token that is exactly
    one of ``a``, ``i``, ``u``, ``ai``, ``au`` or ``uu`` is replaced by the
    same text with a leading ``^``; every other token is kept unchanged.
    The tokens are then concatenated back into a single string.

    NOTE(review): presumably the caret forms select explicitly vocalized
    output in the Urdu transliterator -- confirm against urdu.yml.
    """
    # Exact-match replacements; anything not listed passes through untouched.
    caret_form = {
        'a': '^a',
        'i': '^i',
        'u': '^u',
        'ai': '^ai',
        'au': '^au',
        'uu': '^uu',
    }
    return ''.join(caret_form.get(token, token) for token in urduGT.tokenize(line))


In [122]:
# Scanner limited to a single custom meter named "Faiz".
# NOTE(review): in the pattern "=" and "-" presumably stand for long and
# short syllables, with (=|--) allowing either form -- confirm against the
# urdubiometer documentation.
scanner = ubm.DefaultScanner(
    meters_list=[
        {
            'id': 0,
            'name': "Faiz",
            "regex_pattern": "=(=|--)===(=|--)==",
        },
    ]
)


  data = yaml.load(yaml_str)
  """)
  return yaml.load(f.read())
  schema = yaml.load(schema_str)
  """


In [123]:
# One entry per verse line: the result of scanner.scan is indexed with [0],
# so only its first element is kept for each line.
scans = [
    scanner.scan(verse, graph_details=True)[0]
    for verse in lines
]

{'lines': ['bol kih lab aazaad hai;n tere',
  'bol zabaa;n ab tak terii hai',
  'teraa sutvaa;n jism hai teraa',
  'bol kih jaa;n ab tak terii hai',
  'dekh kih aahangar kii dukaa;n me;n',
  'tund hai;n shole sur;x hai aahan',
  'khulne lage quflo;n ke dahaane',
  'phailaa har ik zanjiir kaa daaman',
  'bol yah tho;raa vaqt bahut hai',
  'jism-o-zabaa;n kii maut se pahle',
  'bol kih sach zindah hai ab tak',
  'bol jo kuchh kahnaa hai kah le'],
 'lines_ur': ['بول کہ لب آزاد ہیں تیرے',
  'بول زباں اب تک تیری ہے',
  'تیرا ستواں جسم ہے تیرا',
  'بول کہ جاں اب تک تیری ہے',
  'دیکھ کہ آہنگر کی دکاں میں',
  'تند ہیں شولے سرخ ہے آہن',
  'کھلنے لگے قفلوں کے دہانے',
  'پھیلا ہر اک زنجیر کا دامن',
  'بول یہ تھوڑا وقت بہت ہے',
  'جسم و زباں کی موت سے پہلے',
  'بول کہ سچ زندہ ہے اب تک',
  'بول جو کچھ کہنا ہے کہہ لے'],
 'lines_en': "Speak, for your lips are free\nSpeak, for your tongue is still yours\nYour long-suffering body is yours\nSpeak, for your life is still your own\nSpeak, for in the black

In [150]:
# Maps the 1-based verse number (an int) to that verse's per-match labels.
labels_of = {}

for scan_id, scan in enumerate(scans):
    # Source text covered by each match of this scan, in match order.
    chunks = [''.join(match.orig_tokens) for match in scan.matches]

    # Each chunk is transliterated three ways; spaces become a middle dot.
    # 'en' holds the scholarly romanization (scholarlyGT), 'ur' the Urdu
    # script and 'hi' the Devanagari rendering.
    labels_of[scan_id + 1] = {
        'orig': chunks,
        'en': [scholarlyGT.transliterate(chunk).replace(" ", "·") for chunk in chunks],
        'ur': [urduGT.transliterate(chunk).replace(" ", "·") for chunk in chunks],
        'hi': [hindiGT.transliterate(chunk).replace(" ", "·") for chunk in chunks],
    }

In [152]:
import json

# Bundle everything computed above into a single mapping for export.
# Scans are keyed by their 1-based verse number rendered as a string.
settings = dict(
    lines=lines,
    lines_ur=lines_ur,
    lines_en=lines_en,
    lines_hi=lines_hi,
    scans={str(number): scan for number, scan in enumerate(scans, start=1)},
    labels_of=labels_of,
)

# Written as a JavaScript assignment ("settings=<json>") rather than bare
# JSON. json.dumps converts the integer keys of labels_of to strings.
with open("../settings.js", "w") as f:
    f.write("settings=" + json.dumps(settings))

# Shown as the cell output for a quick visual check.
settings['labels_of']

{1: {'orig': [' bo',
   'l',
   ' kih',
   ' lab',
   ' aa',
   'zaa',
   'd',
   ' hai;n',
   ' te',
   're'],
  'en': ['·bo', 'l', '·kih', '·lab', '·ā', 'zā', 'd', '·haiñ', '·te', 're'],
  'ur': ['·بو', 'ل', '·کہ', '·لب', '·آ', 'زا', 'د', '·ہیں', '·تے', 'رے'],
  'hi': ['·बो', 'ल', '·कि', '·लब', '·आ', 'ज़ा', 'द', '·हैं', '·ते', 'रे']},
 2: {'orig': [' bo', 'l', ' za', 'baa;n', ' ab', ' tak', ' te', 'rii', ' hai'],
  'en': ['·bo', 'l', '·za', 'bāñ', '·ab', '·tak', '·te', 'rī', '·hai'],
  'ur': ['·بو', 'ل', '·ز', 'باں', '·اب', '·تک', '·تے', 'ری', '·ہے'],
  'hi': ['·बो', 'ल', '·ज़', 'बाँ', '·अब', '·तक', '·ते', 'री', '·है']},
 3: {'orig': [' te',
   'raa',
   ' sut',
   'vaa;n',
   ' jis',
   'm',
   ' hai',
   ' te',
   'raa'],
  'en': ['·te', 'rā', '·sut', 'vāñ', '·jis', 'm', '·hai', '·te', 'rā'],
  'ur': ['·تے', 'را', '·ست', 'واں', '·جس', 'م', '·ہے', '·تے', 'را'],
  'hi': ['·ते', 'रा', '·सुत', 'वाँ', '·जिस', 'म', '·है', '·ते', 'रा']},
 4: {'orig': [' bo',
   'l',
   ' kih',
   ' jaa;n'