In [None]:
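# Notebook that generates ../settings.js: verse text, transliterations, metrical scans, and waveform peaks for the faiz_bol audio clip.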

In [None]:
import urdubiometer as ubm
from graphtransliterator import GraphTransliterator
# Load the three transliterators from their YAML definitions, in this order:
# Urdu script, scholarly romanization, Devanagari.
urduGT, scholarlyGT, hindiGT = (
    GraphTransliterator.from_yaml_file(yaml_path)
    for yaml_path in (
        "../../transliterate/urdu.yml",
        "../../transliterate/transliteration.yml",
        "../../transliterate/devanagari.yaml",
    )
)

In [4]:
import os, json
import subprocess

# Source recording and the two waveform artifacts derived from it.
audio_filename = '../audio/faiz_bol.wav'
dat_target = '../waveform/faiz_bol.dat'
json_target = '../waveform/faiz_bol.json'

# Step 1: audio -> binary waveform data (20 pixels per second, 8-bit).
# Arguments are passed as a list (no shell), so paths containing spaces or
# shell metacharacters are safe, and check=True stops the notebook right here
# if audiowaveform fails instead of letting later cells read stale files.
subprocess.run(
    ['audiowaveform', '-i', audio_filename, '-o', dat_target,
     '--pixels-per-second', '20', '--bits', '8'],
    check=True,
)

# Step 2: binary waveform data -> JSON.
subprocess.run(
    ['audiowaveform', '-i', dat_target, '-o', json_target],
    check=True,
)
def normalize_waveform(filename):
    """Rescale the ``"data"`` values of a waveform JSON file, in place on disk.

    The file is parsed, every entry of its ``"data"`` list is divided by the
    largest entry, and the document is written back to the same path.

    When ``"data"`` is empty or its largest entry is 0 (e.g. a silent clip)
    there is nothing to scale by, so the values are written back unchanged
    instead of raising ``ValueError`` / ``ZeroDivisionError``.

    Args:
        filename: Path of a JSON file whose top-level object has a ``"data"``
            list of numbers.

    Returns:
        The parsed JSON object, with ``"data"`` replaced by the rescaled list.
    """
    with open(filename, "r") as f:
        json_content = json.loads(f.read())

    data = json_content["data"]

    # max() of an empty list raises, and a zero peak cannot be divided by.
    max_val = float(max(data)) if data else 0.0
    if max_val != 0.0:
        json_content["data"] = [x / max_val for x in data]

    # Serialize before opening for writing so a serialization error cannot
    # leave a truncated file behind.
    file_content = json.dumps(json_content)
    with open(filename, "w") as f:
        f.write(file_content)
    return json_content
# Normalize the generated JSON on disk and keep the parsed result; it is
# exported later under the 'peaks' key of the settings.
waveform = normalize_waveform(json_target)
# Reference command line for producing the .dat file from another clip:
#audiowaveform -i long_clip.mp3 -o long_clip.dat --pixels-per-second 20 --bits 8

In [None]:
# Preview the waveform without flooding the notebook output: every key is
# shown as-is except "data", which is cut down to its first 10 values.
{key: (value[:10] if key == "data" else value) for key, value in waveform.items()}

In [5]:
# Verse text in the romanized input scheme that the transliterators and the
# scanner are fed below; one list entry per verse line.
lines = '''bol kih lab aazaad hai;n tere
bol zabaa;n ab tak terii hai
teraa sutvaa;n jism hai teraa
bol kih jaa;n ab tak terii hai
dekh kih aahangar kii dukaa;n me;n
tund hai;n shole sur;x hai aahan
khulne lage quflo;n ke dahaane
phailaa har ik zanjiir kaa daaman
bol yih tho;raa vaqt bahut hai
jism-o-zabaa;n kii maut se pahle
bol kih sach zindah hai ab tak
bol jo kuchh kahnaa hai kah le'''.splitlines()

# English rendering, one entry per verse line (12 entries).
# The closing quotes sit directly after the last line: a trailing newline
# before them would make split("\n") append a 13th, empty entry.
lines_en = '''Speak, for your lips are free
Speak, for your tongue is still yours
Your long-suffering body is yours
Speak, for your life is still your own
Speak, for in the blacksmith's shop
The flames are fierce, the iron red
The mouths of locks have begun to open
The skirt of every chain is outspread
Speak, this little time is enough
Before the death of the body and tongue
Speak, for truth is still alive
Speak, say what must be said'''.split("\n")

# Transliterate every verse with each transliterator; each result list has
# one entry per element of `lines`, in the same order.
lines_ur = list(map(urduGT.transliterate, lines))
lines_trans = list(map(scholarlyGT.transliterate, lines))
lines_hi = list(map(hindiGT.transliterate, lines))

In [13]:
def show_arab(token):
    """Return ``token`` with a ``^`` prefix if it is one of the marked vowels.

    The marked vowels are ``a``, ``i``, ``u``, ``ai``, ``au`` and ``uu``;
    any other token is returned unchanged.

    NOTE(review): presumably the ``^`` prefix tells the Urdu transliterator
    to render the vowel mark explicitly -- confirm against urdu.yml.
    """
    marked_vowels = ('a', 'i', 'u', 'ai', 'au', 'uu')
    if token in marked_vowels:
        return '^' + token
    return token
def vocalize(line):
    """Tokenize ``line`` with the Urdu transliterator and re-join the tokens.

    Each token is passed through ``show_arab`` before joining, so the marked
    vowels come back with their ``^`` prefix.
    """
    marked_tokens = map(show_arab, urduGT.tokenize(line))
    return ''.join(marked_tokens)


In [14]:
# Scanner configured with a single custom meter.
# NOTE(review): in the pattern, "=" and "-" presumably stand for long and
# short syllables -- confirm against the urdubiometer documentation.
scanner = ubm.DefaultScanner(
    meters_list=[
        {
            'id': 0,
            'name': "Faiz",
            "regex_pattern": "=(=|--)===(=|--)==",
        },
    ]
)


In [15]:
def _first_scan(line):
    """Return the first result the scanner produces for ``line``.

    Indexing with ``[0]`` would raise if the scanner returned an empty
    result for a line.
    """
    return scanner.scan(line, graph_details=True)[0]


# One scan result per verse, in the same order as `lines`.
scans = [_first_scan(line) for line in lines]

In [17]:
def _labels_for_scan(scan):
    """Collect the display labels for every match of one scan result.

    Returns a dict with four parallel lists, one entry per match:
    ``'orig'`` (the joined original tokens) and ``'en'`` / ``'ur'`` / ``'hi'``
    (the output of the scholarly, Urdu and Devanagari transliterators, with
    spaces replaced by a middle dot).
    """
    labels = {'orig': [], 'en': [], 'ur': [], 'hi': []}
    for match in scan.matches:
        plain = ''.join(match.orig_tokens)
        # The Urdu label is built from tokens passed through show_arab.
        marked = ''.join(map(show_arab, match.orig_tokens))
        per_label = (
            ('orig', plain),
            ('en', scholarlyGT.transliterate(plain).replace(" ", "·")),
            # \u200f is RIGHT-TO-LEFT MARK (HTML &#8207;), placed before the dot.
            ('ur', urduGT.transliterate(marked).replace(" ", "\u200f·")),
            ('hi', hindiGT.transliterate(plain).replace(" ", "·")),
        )
        for key, text in per_label:
            labels[key].append(text)
    return labels


# Verse number (1-based int) -> labels for that verse. json.dumps turns the
# integer keys into strings ("1", "2", ...) when the settings are serialized.
labels_of = {
    verse_no: _labels_for_scan(scan)
    for verse_no, scan in enumerate(scans, start=1)
}

In [18]:
# Every distinct original token that occurs in any match of any scan.
# Built with a comprehension so no top-level loop rebinds `_`, which IPython
# uses for the previous cell output.
tokens = {
    token
    for scan in scans
    for match in scan.matches
    for token in match.orig_tokens
}

# Vowel tokens that get a '^' prefix before Urdu transliteration.
# NOTE(review): this set contains 'ii' but not 'uu', whereas show_arab marks
# 'uu' but not 'ii' -- confirm which set is intended and unify the two.
vowels_to_mark = set(['a', 'i', 'u', 'ii', 'au', 'ai'])

# token -> {'en': ..., 'hi': ..., 'ur': ...}
# Iterating in sorted order keeps the key order of the generated settings
# stable across kernel restarts; plain set iteration order is not.
token_trans = {}
for token in sorted(tokens):
    token_ur = '^' + token if token in vowels_to_mark else token
    token_trans[token] = {
        'en': scholarlyGT.transliterate(token),
        'hi': hindiGT.transliterate(token),
        'ur': urduGT.transliterate(token_ur),
    }
    

In [19]:
import json

# Everything collected above, gathered into one object for export.
settings = {
    # Verse text: raw romanized input plus Urdu, English and Hindi versions.
    'lines': {'_raw': lines, 'ur': lines_ur, 'en': lines_en, 'hi': lines_hi},
    # Scan results keyed by 1-based verse number as a string.
    'scans': {
        str(verse_no): scan
        for verse_no, scan in enumerate(scans, start=1)
    },
    'labels_of': labels_of,
    'token_trans': token_trans,
    # Reads a private attribute of the scanner.
    'base_graph': scanner._translation_graph.to_dict(),
    # NOTE(review): presumably [start, end, label] in seconds of the audio,
    # where numeric labels match the keys of 'scans' and 'cut' marks audio
    # outside any verse -- confirm against the consumer of settings.js.
    'intervals': [
        [0.0, 4.125714285714288, 'cut'],
        [4.125714285714288, 7.023696449108615, '1'],
        [8.320000000000002, 11.080045966382015, '2'],
        [12.651428571428573, 14.756200484663541, '3'],
        [15.73714285714286, 18.601745013831568, '4'],
        [19.611428571428576, 22.33145999836452, '5'],
        [22.33145999836452, 25.150750820636258, '6'],
        [27.55428571428572, 29.721899445393763, '7'],
        [30.37714285714286, 32.40006797221399, '8'],
        [32.994285714285716, 35.62116334050255, '9'],
        [36.377142857142864, 38.444232142704514, '10'],
        [39.92, 41.99027150736831, '11'],
        [43.17714285714286, 46.39093568842475, '12'],
        [46.39093568842475, 51.2, 'cut'],
    ],
    'peaks': waveform,
    'audio_file': 'audio/faiz_bol.mp4',
}

# Written as a JavaScript assignment (settings=<json>), not as bare JSON.
with open("../settings.js", "w") as settings_file:
    settings_file.write("settings=" + json.dumps(settings))
