# Imports

In [2]:
import re
import json
from collections import Counter, OrderedDict
from importlib import reload
from IPython.display import clear_output
from copy import deepcopy

from tqdm.auto import tqdm
from urduhack.normalization import (
    normalize,
    normalize_characters,
    normalize_combine_characters,
)
from urduhack.urdu_characters import URDU_ALL_CHARACTERS, URDU_ALPHABETS, URDU_DIACRITICS, URDU_DIGITS

from sign_language_translator.text import preprocess, tokens, vocab

# Re-import the project modules so that edits made to them on disk are picked
# up without restarting the session.
reload(vocab)
reload(tokens)
reload(preprocess)

# Tokenizer built on the uncontexted Urdu vocabulary, with drop_spaces disabled.
stok = tokens.SignTokenizer(
    drop_spaces=False,
    vocab=vocab.UNCONTEXTED_VOCAB["urdu"],
)
# Symbols that are accepted in addition to vocabulary words during extraction.
extra_allowed_symbols = set(vocab.SYMBOLS)

# Scratch exploration of characters found in the corpora (kept for reference):
# char_counts = Counter(all_text)
# [item for item in char_counts.most_common() if (item[0] not in (URDU_ALPHABETS | URDU_DIACRITICS | URDU_DIGITS |set(vocab.PUNCTUATION))) and (not item[0].lower().islower())]
# Sanity checks on look-alike characters; the resulting tuple is discarded.
(
    "—" == '-',
    "ـ" == "_",
    '”' in URDU_ALL_CHARACTERS,
)
# Wipe anything the statements above printed to the cell output.
clear_output()

## Load Data

In [None]:
# Root directory of the sign-language datasets; the raw corpora are read from
# `text_corpora/raw_corpora/` underneath it.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
DRP = "/Users/mudassar.iqbal/Library/CloudStorage/GoogleDrive-mdsriqb@gmail.com/My Drive/sign-language-translator/sign-language-datasets"

In [6]:
# Build `dataset`: corpus name -> set of normalized, preprocessed texts.
# Sets are used, so texts that become identical after preprocessing are kept once.
dataset = {}

# Read every JSON file as UTF-8 explicitly: without `encoding`, open() falls
# back to the platform's locale encoding, which is not guaranteed to be UTF-8.
with open(f'{DRP}/text_corpora/raw_corpora/wikipedia.json', 'r', encoding='utf-8') as f:
    dataset["wikipedia"] = {
        preprocess.urdu_wikipedia_preprocessor(
            preprocess.urdu_text_normalization(line)
        )
        for line in json.load(f)
    }

with open(f'{DRP}/text_corpora/raw_corpora/glosbe.json', 'r', encoding='utf-8') as f:
    dataset["glosbe"] = {
        preprocess.urdu_text_normalization(line) for line in json.load(f)
    }

# rekhta_categories maps a group name to the rekhta keys belonging to that group.
with open(f'{DRP}/text_corpora/raw_corpora/rekhta_categories.json', 'r', encoding='utf-8') as f:
    rekhta_categories = json.load(f)

with open(f'{DRP}/text_corpora/raw_corpora/rekhta.json', 'r', encoding='utf-8') as f:
    rekhta = json.load(f)

# Preprocessor applied to each rekhta group; a group name that is missing from
# this mapping raises KeyError in the loop below.
rekhta_preprocessor = {
    "poetry": preprocess.urdu_poetry_preprocessor,
    "passage": preprocess.urdu_passage_preprocessor,
}
for group, keys in rekhta_categories.items():
    dataset[group] = {
        rekhta_preprocessor[group](preprocess.urdu_text_normalization(text))
        for key in keys
        for text in rekhta[key]
    }
# The raw rekhta mapping is no longer needed once it has been regrouped.
del rekhta

# Per-corpus summary. "n_words" is the total number of space characters in the
# corpus, i.e. an approximation of the word count rather than an exact one.
print(json.dumps(
    {
        name: {
            "n_texts": len(texts),
            "n_words": sum(text.count(' ') for text in texts),
        }
        for name, texts in dataset.items()
    },
    indent=4,
))

{
    "wikipedia": {
        "n_texts": 2648223,
        "n_words": 77735304
    },
    "glosbe": {
        "n_texts": 100938,
        "n_words": 2204129
    },
    "poetry": {
        "n_texts": 321710,
        "n_words": 4782979
    },
    "passage": {
        "n_texts": 104394,
        "n_words": 15123833
    }
}


## Extract supported substrings

In [7]:
# Maps a dataset name to the supported-substring counts extracted from it.
extracts = {}

In [None]:
# Count every supported substring of one corpus that spans more than one token.
data_name = "wikipedia"

# Loop-invariant, so build it once instead of once per text.
# NOTE(review): assumes extract_supported_substrings does not mutate the set it
# is handed — confirm against its implementation.
allowed_symbols = extra_allowed_symbols - {"بن", "آ"}  # -{" "}

counts = Counter(
    substr
    for txt in dataset[data_name]
    # Strip each extracted substring once and reuse the stripped value for
    # both the token-count filter and the counter key.
    for substr in (
        raw.strip()
        for raw in stok.extract_supported_substrings(
            txt,
            extra_allowed_symbols=allowed_symbols,
        )
    )
    if len(stok.tokenize(substr)) > 1
)

# Order by number of spaces, then frequency, then the text itself, all
# descending. The key is unique per item (the substring is part of it), so the
# result does not depend on the input order and most_common() is unnecessary.
sorted_counts = sorted(
    counts.items(),
    key=lambda item: (item[0].count(' '), item[1], item[0]),
    reverse=True,
)
# Cell output: total occurrences, number of unique substrings, the sorted list.
sum(counts.values()), len(counts), sorted_counts

In [16]:
# Number of spaces in every unique extracted substring.
sentence_lengths = [substring.count(' ') for substring, _ in sorted_counts]
# How many unique substrings exist for each space count.
sentence_length_counts = Counter(sentence_lengths)
# Cell output: (space count, number of substrings) pairs in ascending order.
sorted(sentence_length_counts.items())

[(0, 7478),
 (1, 99321),
 (2, 222926),
 (3, 206347),
 (4, 125337),
 (5, 65907),
 (6, 33831),
 (7, 17364),
 (8, 8774),
 (9, 4734),
 (10, 2643),
 (11, 1385),
 (12, 818),
 (13, 445),
 (14, 258),
 (15, 139),
 (16, 79),
 (17, 37),
 (18, 37),
 (19, 20),
 (20, 11),
 (21, 8),
 (22, 4),
 (23, 2),
 (24, 1),
 (25, 1),
 (26, 1),
 (29, 1),
 (30, 1),
 (35, 1)]

In [18]:
# Keep an independent copy of this corpus's sorted (substring, count) list
# under the corpus name.
extracts[data_name] = deepcopy(sorted_counts)

In [20]:
# Order in which the corpora appear in the saved file.
data_names = ['poetry', "passage", "glosbe", "wikipedia"] # dataset.keys()
# Rebuild `extracts` in that order, turning each (substring, count) list into
# an ordered mapping. Every name in data_names must already be in `extracts`.
extracts = OrderedDict(
    (name, OrderedDict(extracts[name]))
    for name in data_names
)

In [21]:
# ensure_ascii=False makes json.dump emit non-ASCII characters verbatim, so the
# file is opened as UTF-8 explicitly; otherwise the platform's locale encoding
# is used and the write can fail or produce a non-UTF-8 file.
with open('../../sign_language_translator/datasets/text_corpora/supported_substrings_frequency.json', "w", encoding="utf-8") as f:
    json.dump(extracts, f, indent=2, ensure_ascii=False)

In [None]:
# Drop the large intermediate results now that they are no longer needed.
del extracts, counts, sorted_counts

## Improve Vocab

In [None]:
# Cell output: vocabulary words whose normalized form is not itself in the
# vocabulary, skipping the words listed in SPELLED_WORDS.
[
    (word, normalized)
    for word, normalized in (
        (w, normalize(w)) for w in vocab.UNCONTEXTED_VOCAB['urdu']
    )
    if normalized not in vocab.UNCONTEXTED_VOCAB['urdu']
    and word not in vocab.SPELLED_WORDS['urdu']
]

[('اُسکے', 'اسکے'),
 ('اُسکو', 'اسکو'),
 ('اِنہی', 'انہی'),
 ('اِنہوں', 'انہوں'),
 ('اِنہیں', 'انہیں')]

In [81]:
# Token frequencies over the passage and poetry corpora.
# Alternative that covers every corpus:
# x = {v for val in dataset.values() for v in val}
x = dataset['passage'] | dataset['poetry']
token_counts = Counter(stok.tokenize(' '.join(x)))
sorted_token_counts = token_counts.most_common()
del x

# Look the vocabulary up once and test membership against a set, so the check
# is O(1) per token whatever container the vocabulary is stored in.
urdu_vocab = vocab.UNCONTEXTED_VOCAB['urdu']
urdu_vocab_lookup = set(urdu_vocab)

# Split the (token, count) pairs by whether the token is in the vocabulary.
remaining_words = [item for item in sorted_token_counts if item[0] not in urdu_vocab_lookup]
supported_words = [item for item in sorted_token_counts if item[0] in urdu_vocab_lookup]
# Vocabulary words that never occur in the text are appended with a zero count.
supported_words += sorted(
    ((w, 0) for w in urdu_vocab if w not in token_counts),
    key=lambda item: item[0],
)

min_count = 10
# Keep out-of-vocabulary tokens seen at least `min_count` times, dropping any
# token longer than one character that contains cased letters; most frequent
# first, ties broken by the token text (descending).
frequent_remaining_words = sorted(
    (
        item
        for item in remaining_words
        if item[1] >= min_count
        and not (item[0].lower().islower() and len(item[0]) > 1)
    ),
    key=lambda i: (i[1], i[0]),
    reverse=True,
)
# ensure_ascii=False writes non-ASCII characters verbatim, so open the file as
# UTF-8 explicitly instead of relying on the platform's locale encoding.
with open('temp/rekhta_remaining_words_frequency.json', 'w', encoding='utf-8') as f:
    json.dump(OrderedDict(frequent_remaining_words), f, indent=4, ensure_ascii=False)

# with open('temp/supported_words_frequency.json', 'w') as f:
#     json.dump(OrderedDict(supported_words), f, indent=4, ensure_ascii=False)