# Imports

In [3]:
# FOR GOOGLE COLAB ONLY
# Mount point for Google Drive; passed to `drive.mount(...)` and used to build
# the `sys.path` entry in the Colab setup cell.
mount_dir = "/content/drive"

In [4]:
# FOR GOOGLE COLAB ONLY
# Installs the translator dependency, mounts Google Drive, makes the synced
# repository importable and switches the working directory to this notebook's folder.
from IPython.display import clear_output

# !pip install urduhack
!pip install deep_translator
clear_output()  # hide the pip log

from google.colab import drive

drive.mount(mount_dir)

# install the package
# (no pip install happens here: the synced checkout is simply added to sys.path)
import sys

sys.path.append(f'{mount_dir}/Othercomputers/mac19/sign-language-translator')

# change directory
# NOTE(review): this path hard-codes "/content/drive" instead of using `mount_dir`,
# points at ".../mac19/personal_repos/slt/..." while sys.path above uses
# ".../mac19/sign-language-translator", and the recorded output shows yet another
# directory ("My MacBook Pro/sign_language_translator") — confirm which one is current.
%cd "/content/drive/Othercomputers/mac19/personal_repos/slt/notebooks/data_collection"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Othercomputers/My MacBook Pro/sign_language_translator/notebooks/data_collection


In [5]:
import ast
import json
import os
import re
from glob import glob

import pandas as pd
from tqdm.auto import tqdm
from collections import Counter

tqdm.pandas()
from deep_translator import GoogleTranslator
from IPython.display import clear_output

from sign_language_translator.data_collection.synonyms import make_translations

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
# Translator instance used here only to query the list of supported languages.
gt = GoogleTranslator(source="auto", target="de")
# Values of the supported-languages mapping; later interpolated into the
# `synonyms/{lang}_from_words.txt` file names (presumably language codes — confirm).
gt_supported_langs = list(gt.get_supported_languages(as_dict=True).values())

import random

# Fixed seed so the shuffled language order is the same on every run.
random.seed(0)
random.shuffle(gt_supported_langs)
clear_output()

In [6]:
# !pip install urduhack
import json

from urduhack.normalization import (normalize, normalize_characters,
                                    normalize_combine_characters)
from urduhack.normalization.character import CORRECT_URDU_CHARACTERS

clear_output()

# Synonyms by translation

In [None]:
# Quick demo of the packaged synonym finder on a single English word.
import sign_language_translator as slt

syn_finder = slt.text.SynonymFinder("en")

# NOTE(review): `timeout=0.1` is passed straight to the library; its unit and
# exact meaning are not visible from this notebook — confirm against the API.
print(syn_finder.synonyms_by_translation("hello", timeout=0.1))

In [None]:
# # MAKE TABLE
# df = pd.DataFrame({"labels": recording_labels["pk-hfad-1"]})
# df.to_csv('translations.csv', index=False)

In [None]:
# SELECT ROWS TO TRANSLATE
df = pd.read_csv('translations.csv')
labels = df['labels']

# Labels that are excluded from translation:
has_handed_marker = labels.str.contains('-handed-')                  # contain "-handed-"
is_single_non_digit = (labels.str.len() == 1) & ~labels.str.isdigit()  # one character that is not a digit
is_excluded_literal = labels == 'آ(حرف)'                             # this exact label

valid_rows = ~(has_handed_marker | is_single_non_digit | is_excluded_literal)
sum(valid_rows)

748

In [None]:
# MAKE TEXT LIST TO TRANSLATE
# Writes the selected labels, one per line, to synonyms/words.txt.
os.makedirs("synonyms", exist_ok=True)
# The labels contain Urdu text, so the encoding is fixed to UTF-8 instead of
# relying on the platform's default locale encoding.
with open("synonyms/words.txt", 'w', encoding="utf-8") as f:
    f.write('\n'.join(df.loc[valid_rows, 'labels'].to_list()))

In [None]:
# TRANSLATE INTO INTERMEDIATE LANGUAGES
# Languages whose output file already exists are skipped, so re-running this
# cell only requests the translations that are still missing.
intermediate_languages = [
    lang
    for lang in gt_supported_langs
    if not os.path.exists(f'synonyms/{lang}_from_words.txt')
]
# One (source file, target language, 0.25) tuple per pending language.
# NOTE(review): the meaning of 0.25 is defined by `make_translations` — confirm there.
args_list = [
    ('synonyms/words.txt', lang, 0.25)
    for lang in intermediate_languages
]

make_translations(args_list, n_processes=10)
clear_output()

In [None]:
# TRANSLATE TO TARGET LANGUAGE
# Every intermediate-language file that exists is translated into each target
# language, unless that target file has already been produced.
target_languages = ['en', 'ur']
args_list = []
for ilang in gt_supported_langs:
    source_path = f'synonyms/{ilang}_from_words.txt'
    for tlang in target_languages:
        target_path = f'synonyms/{tlang}_from_{ilang}_from_words.txt'
        if not os.path.exists(target_path) and os.path.exists(source_path):
            args_list.append((source_path, tlang, 0.25))

make_translations(args_list, n_processes=10)
clear_output()

In [None]:
# COMPILE TRANSLATIONS
# Builds a table with one row per source word and, for each target language,
# the list of distinct back-translations ordered from most to least frequent.
target_languages = ['en', 'ur']

# Source words, one per line. Line i of every translation file is treated as
# the translation of line i of this file.
# UTF-8 is requested explicitly because the files contain non-ASCII text.
with open('synonyms/words.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()

# count synonyms: one Counter per (target language, source word)
synonym_counts = {lang: [Counter() for _ in words] for lang in target_languages}
for lang in target_languages:
    # sorted() makes the processing order (and therefore the order of equally
    # frequent synonyms) independent of the filesystem's listing order
    for fpath in sorted(glob(f'synonyms/{lang}_from_*_from_words.txt')):
        with open(fpath, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
        if len(lines) != len(words):
            # A different line count means this file cannot be fully aligned
            # with words.txt; report it instead of failing with an IndexError.
            print(f'WARNING: {fpath} has {len(lines)} lines, expected {len(words)}')
        # zip() stops at the shorter sequence, so extra lines are ignored
        for counter, line in zip(synonym_counts[lang], lines):
            counter[line.lower()] += 1

# sort synonyms by frequency (ties keep first-seen order) and drop the counts
synonyms = {'words': words}
for lang in target_languages:
    synonyms[lang] = [
        [w for w, _ in counter.most_common()]
        for counter in synonym_counts[lang]
    ]

# make table
synonyms_df = pd.DataFrame(synonyms).rename(
    columns={lang:f'{lang}_synonyms' for lang in target_languages}
)
# synonyms_df.to_csv('synonyms.csv', index=False)
synonyms_df.sample(10)

In [13]:
# Load the saved synonyms table.
# NOTE(review): the `to_csv('synonyms.csv')` call in the compile cell is commented
# out, so this relies on a file written during an earlier run. It also rebinds `df`.
df = pd.read_csv('synonyms.csv')

# Synonyms by embeddings

In [86]:
# Load the Urdu word-embedding model and collect its vocabulary.
embedding_model_path = '../../slt_ai/slt_ai/static/txt/W2V300dim5winBulk.txt'

from gensim.models import KeyedVectors
ur_embd_model = KeyedVectors.load_word2vec_format(embedding_model_path)

# Re-read the same text file to collect the first token of every line after the
# first. UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
ur_vocab = set()
with open(embedding_model_path, 'r', encoding='utf-8') as f:
    next(f)  # skip the first line (presumably the word2vec header — confirm)
    for line in f:
        tokens = line.split()
        if tokens:  # tolerate blank lines instead of raising IndexError
            ur_vocab.add(tokens[0])

# (vocabulary size, fraction of vocabulary words left unchanged by `normalize`)
len(ur_vocab), sum(w == normalize(w) for w in ur_vocab) / len(ur_vocab)

(461449, 0.9484775132246467)

In [None]:
def _similar_urdu_words(label, topn=15):
    """Return the set of `topn` nearest embedding-model neighbours of a label.

    The label is first cleaned by removing any parenthesised part and all
    hyphens. If the cleaned word is not in `ur_embd_model`, an empty set is
    returned.
    """
    # raw string: '\(' in a normal string literal is an invalid escape sequence
    query = re.sub(r'\(.*\)', '', label).replace('-', '')
    if query not in ur_embd_model:
        return set()
    return {w for w, _ in ur_embd_model.most_similar(query, topn=topn)}


# Only rows that actually have a word are looked up.
has_word = ~df.words.isna()
df.loc[has_word, 'ur_similar'] = df.loc[has_word, 'words'].progress_apply(_similar_urdu_words)

100%|██████████| 748/748 [00:20<00:00, 35.90it/s] 


# view

In [106]:
# Manual inspection of a single word: length, normalisation, vocabulary
# membership and characters outside the accepted Urdu character set.
word = "سلیٹی"
print(f'{len(word) = }')
print(normalize(word))                            # normalised form
print(normalize(word) == word)                    # True if normalisation changes nothing
print(set(word.split(' ')) <= ur_vocab)           # True if every space-separated token is in the embedding vocabulary
print(set(word)-set(CORRECT_URDU_CHARACTERS))     # characters not listed in CORRECT_URDU_CHARACTERS
print('\n'.join(word))                            # one character per line

len(word) = 5
سلیٹی
True
True
set()
س
ل
ی
ٹ
ی


In [None]:
# Peek at five rows starting at position 820 of the label-sorted table.
# NOTE(review): needs a `df` with 'labels' and 'en_synonyms' columns — confirm which frame is meant.
df.sort_values('labels')[820:].head()[['labels', 'en_synonyms']]

In [None]:
# NOTE(review): the recorded output is True, but the only visible assignment sets
# `word` to "سلیٹی" — this cell was run against a different value (hidden state).
word == 'کنجوسی'

True

In [None]:
# Nearest neighbours of one word in the embedding model (library default number of results).
ur_embd_model.most_similar('آجکل')

# Review

In [None]:
import os

# Root directory of the sign-language datasets.
# The default is the original author's local Google Drive path; set the
# SLT_DATASETS_DIR environment variable to run this notebook on another machine
# without editing the cell.
DRP = os.environ.get(
    "SLT_DATASETS_DIR",
    "/Users/mudassar.iqbal/Library/CloudStorage/GoogleDrive-mdsriqb@gmail.com/My Drive/sign-language-translator/sign-language-datasets",
)

In [293]:
# STATS
# Loads the label -> {language -> words} mapping of one collection, prints
# coverage statistics and asserts a few consistency invariants.
filepath = DRP+"/sign_recordings/collection_to_label_to_language_to_words.json"
collection = "pk-hfad-1"

# The mapping contains Urdu text; read it as UTF-8 regardless of the platform default.
with open(filepath, 'r', encoding='utf-8') as f:
    mapper = json.load(f)

label_groups = mapper[collection]

videos  = list(label_groups.keys())
english = [word for group in label_groups.values() for word in group["english"]]
urdu    = [word for group in label_groups.values() for word in group["urdu"]]
# a label counts as "done" once it has at least one word other than the "_______" placeholder
done_en = [k for k, v in label_groups.items() if any(w != "_______" for w in v['english'])]
done_ur = [k for k, v in label_groups.items() if any(w != "_______" for w in v['urdu'])]
# unique words after stripping the parenthesised context, e.g. "word(context)" -> "word"
eng_unique = {re.sub(r"\(.*\)", "", word) for word in english}
urdu_unique = {re.sub(r"\(.*\)", "", word) for word in urdu}

print(
    f'{len(videos) = }\n'
    f'{len(done_en) = }\t{len(done_en)/len(videos):.1%}\n'
    f'{len(done_ur) = }\t{len(done_ur)/len(videos):.1%}\n'
    f'{len(english) = }\t{len(english)/len(videos):.2f}x\t{len(eng_unique) = }\t{len(eng_unique)/len(videos):.2f}x\n'
    f'{len(urdu) = }\t{len(urdu)/len(videos):.2f}x\t{len(urdu_unique) = }\t{len(urdu_unique)/len(videos):.2f}x'
)

# no word may be listed twice within a language (the assertion message shows the duplicates)
assert len(set(english)) == len(english), {k:v for k,v in Counter(english).items() if v > 1}
assert len(set(urdu))    == len(urdu),    {k:v for k,v in Counter(urdu).items()    if v > 1}

# every Urdu word must already be in character-normalised form
unnormalized = [word for word in urdu if normalize_characters(word) != word]
assert len(unnormalized) == 0, unnormalized
unnormalized = [word for word in urdu if normalize_combine_characters(word) != word]
assert len(unnormalized) == 0, unnormalized
unnormalized = [word for word in urdu if normalize(word) != word]
# assert len(unnormalized) == 0, unnormalized # allow diacritics
print(f"{len(unnormalized) = }")#, unnormalized

# labels that declare "components" must only reference labels of this collection
constructable = [group['components'] for group in label_groups.values() if "components" in group]
print(f"{len(constructable) = }")
components = [comp for comps in constructable for comp in comps if comp.replace(f"{collection}/", "") not in label_groups]
assert len(components) == 0, components

len(videos) = 788
len(done_en) = 788	100.0%
len(done_ur) = 788	100.0%
len(english) = 1580	2.01x	len(eng_unique) = 1466	1.86x
len(urdu) = 1950	2.47x	len(urdu_unique) = 1883	2.39x
len(unnormalized) = 41
len(constructable) = 35


In [None]:
# Group context-tagged words ("word(context)") by their bare form, separately
# for the video keys, the English words and the Urdu words.
ambiguous_to_contexted = {"keys": {}, "english": {}, "urdu": {}}
for lang, words in (("keys", videos), ("english", english), ("urdu", urdu)):
    groups = ambiguous_to_contexted[lang]
    for word in words:
        if "(" not in word:
            continue
        bare = re.sub(r'\(.*\)', '', word)
        groups.setdefault(bare, []).append(word)

# drop words without any alternates (only 1 contexted option)
for lang in list(ambiguous_to_contexted):
    ambiguous_to_contexted[lang] = {
        bare: contexted
        for bare, contexted in ambiguous_to_contexted[lang].items()
        if len(contexted) != 1
    }

print(json.dumps(ambiguous_to_contexted, indent=4, ensure_ascii=False, sort_keys=True))

# Names

In [423]:
# https://www.cle.org.pk/information/people/hudasarfraz/CR0318E.pdf
# https://www.urdupoint.com/names/boys-islamic-names-urdu.html
# https://www.urdupoint.com/names/girls-islamic-names-urdu.html

In [458]:
# Load the names table and show ten random rows.
# NOTE: this rebinds `df`, which earlier cells use for the synonyms table.
df = pd.read_csv("names.csv")
df.sample(10)  # no random_state is given, so a different sample is shown on each run

Unnamed: 0,english_name,urdu_name,gender,frequency
306,rizwan,رضوان,M,
231,faheem,فہیم,M,
36,farida,فریدہ,F,272.0
339,taimoor,تیمور,M,
113,rabbia,ربیعہ,F,117.0
276,mudassar,مدثر,M,119.0
63,iram,ارم,F,
270,majeed,مجید,M,2521.0
106,neha,نیہا,F,
157,syeda,سیدہ,F,331.0


In [445]:
# Export the English and Urdu name columns as parallel lists.
# ensure_ascii=False writes the Urdu names as raw characters, so the file is
# opened as UTF-8 explicitly instead of relying on the platform default encoding.
with open('names.json', 'w', encoding='utf-8') as f:
    json.dump({
        "english": df.english_name.to_list(),
        "urdu": df.urdu_name.to_list(),
    }, f, indent=4, ensure_ascii=False)

# urdu to roman-urdu

In [None]:
# python>=3.9,<=3.11
!pip install sign-language-translator

In [2]:
import json
import re

import sign_language_translator as slt

In [39]:
# State for the translation loop below.
lang = slt.languages.text.Urdu()
word_list = lang.vocab.supported_words  # words that will be sent for translation
word_to_translations = {}  # word -> collection of distinct translations, filled by the loop
raw_responses = {}  # currently only referenced from commented-out code in the loop

In [46]:
# Query translations for every supported word, repeating the whole pass 5 times
# and accumulating the distinct lower-cased ASCII answers per word.
# NOTE: the actual request is still a TODO; `response` is a hard-coded stub.
# NOTE(review): the prompt interpolates `word` without quotes and asks for
# "hindi" texts while only ASCII answers are kept — confirm the intended wording.
interrupted = False  # set when Ctrl-C is caught so that BOTH loops stop
for _ in range(5):
    for word in word_list:
        prompt = f"""translate the following text and return only a json object (you must not say anything else!):

        {{
            "source-{lang.name()}-text": {word},
            "all-distinct-possible-translated-hindi-texts": [
                "...",
                ...
            ]
        }}
        """
        # print(prompt)

        # TODO: response = requests.get("https://openai.com/...", data={"prompt": prompt}).json()["completion"]
        # if word not in raw_responses:
        #     raw_responses[word] = []
        # raw_responses[word].append(response)
        response = '["jungle"]'

        if word not in word_to_translations:
            word_to_translations[word] = set()

        try:
            # take the first "[...]" span in the response and parse it as a JSON list
            parsed_list = json.loads(re.findall(r"\[.*\]", response, flags=re.DOTALL)[0])
            # set(...) also accepts the sorted lists produced by a later cell,
            # so this cell can be re-run after that one
            word_to_translations[word] = set(word_to_translations[word]) | {str(x).lower() for x in parsed_list if str(x).isascii()}
        except KeyboardInterrupt:
            # a bare `break` would only leave the inner loop and the next pass
            # would start immediately; remember the interrupt for the outer loop
            interrupted = True
            break
        except Exception as e:
            # best effort: report the problem and continue with the next word
            print(e)

        # break

    if interrupted:
        break

In [47]:
# Replace every word's collection of translations with a sorted list
# (in place, so the dictionary object itself stays the same).
word_to_translations.update(
    {word: sorted(translations) for word, translations in word_to_translations.items()}
)

# with open("word_to_translations.json", "w") as f:
#     json.dump(word_to_translations, f, indent=4, ensure_ascii=False)

word_to_translations

{'جنگل': ['jungle'],
 'جنوری': ['jungle'],
 'دانشمند': ['jungle'],
 'چائنہ': ['jungle'],
 'تیری': ['jungle'],
 'بے سکون': ['jungle'],
 'چاہتی': ['jungle'],
 'پہ': ['jungle'],
 'وکالت': ['jungle'],
 'فرزند': ['jungle'],
 'گوری': ['jungle'],
 'شرمندہ': ['jungle'],
 'پاگل': ['jungle'],
 'جلدی': ['jungle'],
 'احمق': ['jungle'],
 'او': ['jungle'],
 'دعوت': ['jungle'],
 'گلیاں': ['jungle'],
 'مرغی کا گوشت': ['jungle'],
 'ملک': ['jungle'],
 'عمل': ['jungle'],
 'اسٹیڈیم': ['jungle'],
 'سمندری': ['jungle'],
 'پودوں': ['jungle'],
 'بھتیجے': ['jungle'],
 'بس اسٹاپ': ['jungle'],
 'کچھ': ['jungle'],
 'پوری': ['jungle'],
 'خوف ناک': ['jungle'],
 'مزے': ['jungle'],
 'اسٹابری': ['jungle'],
 'فتح': ['jungle'],
 'فرمانبرداری': ['jungle'],
 'آوں': ['jungle'],
 'دروازے': ['jungle'],
 'محمد مصطفی صلی اللہ علیہ وآلہ وسلم': ['jungle'],
 'آسانی': ['jungle'],
 'رحمت': ['jungle'],
 'دو': ['jungle'],
 'گلشن': ['jungle'],
 'بننا': ['jungle'],
 'آخری': ['jungle'],
 'بوڑھا آدمی': ['jungle'],
 'قانونی': ['jungle'],
