In [14]:

#!/usr/bin/env python3
"""
A file for creating a one-hot encoding of all characters, including madd and harakat, in Tarteel's Qur'an dataset.

The output pickle file will contain an object with the one-hot encoded Qur'an, an encoding function, and a decoding
function.

Author: Hamzah Khan
Date: Jan. 12, 2019
"""
from sklearn.model_selection import train_test_split
from collections import defaultdict
import copy
import dill as pickle
import json
import numpy as np

from argparse import ArgumentParser

parser = ArgumentParser(description='Tarteel Arabic One Hot Encoding Generator')
parser.add_argument('-i', '--input_json_path', type=str)
parser.add_argument('-o', '--output_pickle_path', type=str)
parser.add_argument('-v', '--verbose', type=bool, default=False)
args = parser.parse_args(['-i', 'data-uthmani.json', '-o', 'one-hot.pkl']) # You will need to replace input.json and output.pickle with the actual paths to your input and output files

# Define constants.
QURAN_KEY  = "quran"
SURAHS_KEY = "surahs"
AYAHS_KEY  = "ayahs"
TEXT_KEY   = "text"

NUM_KEY       = "num"
NAME_KEY      = "name"
BISMILLAH_KEY = "bismillah"

ENCODING_MAP_KEY = "encoding_map"
DECODING_MAP_KEY = "decoding_map"
CHAR_TO_INT_MAP_KEY = "char_to_int"
INT_TO_CHAR_MAP_KEY = "int_to_char"


In [15]:
def create_list_of_quranic_chars(quran_obj, surahs_key=SURAHS_KEY, ayahs_key=AYAHS_KEY, text_key=TEXT_KEY):
    """
    Create a sorted list containing every character in the Qur'an text provided and return it.
    :param quran_obj: An object containing surah objects.
    :type quran_obj: object
    :param surahs_key: The key in quran_obj to the list of surah objects.
    :type surahs_key: string
    :param ayahs_key: The key in each surah object to the list of ayah objects in that surah.
    :type ayahs_key: string
    :param text_key: The key to the actual Qur'anic text in each ayah object.
    :type text_key: string
    :returns: A sorted list containing every Arabic character in the Qur'an exactly once.
    :rtype: list string
    """
    quranic_char_set = set()

    for surah_obj in quran_obj[surahs_key]:
        for ayah_obj in surah_obj[ayahs_key]:
            ayah_text = ayah_obj[text_key]

            for char in ayah_text:
                quranic_char_set.add(char)

    return sorted(list(quranic_char_set))

In [31]:
def encode_char_as_one_hot(string, char_to_int):
    """
    Converts a string of characters from our alphabet into a one_hot encoded string.
    """
    str_len = len(string)
    int_list = np.array([char_to_int[char] for char in string])

    one_hot_string = np.zeros((str_len, len(char_to_int)))
    one_hot_string[np.arange(str_len), int_list] = 1

    return one_hot_string

def create_one_hot_encoding(quranic_char_list):
    """
    Creates a one-hot encoding that associates each character in the argument list to a number and vice versa.
    :param quranic_char_list: A list of characters.
    :type quranic_char_list: list string
    :returns: A tuple containing the encoding and decoding functions for the alphabet.
    :rtype: tuple (function string => int, function int => string)
    """

    # Define an encoding of characters to integers.
    char_to_int = dict((c, i) for i, c in enumerate(quranic_char_list))
    int_to_char = dict((i, c) for i, c in enumerate(quranic_char_list))

    print('quranic_char_list: ', quranic_char_list)

    print("char_to_int: ", char_to_int)
    print("int_to_char: ", int_to_char)





    def decode_one_hot_as_string(one_hot_string, int_to_char):
        """
        Converts a one_hot encoded numpy array back into a string of characters from our alphabet.
        """
        int_list = list(np.argmax(one_hot_string, axis=1))
        char_list = [int_to_char[integer] for integer in int_list]

        return str(char_list)

    return char_to_int, int_to_char, encode_char_as_one_hot, decode_one_hot_as_string


def generate_a_one_hot_encoded_script(quran_obj,
                                      encoding_fn,
                                      surahs_key=SURAHS_KEY,
                                      ayahs_key=AYAHS_KEY,
                                      text_key=TEXT_KEY,
                                      num_key=NUM_KEY,
                                      name_key=NAME_KEY,
                                      bismillah_key=BISMILLAH_KEY):
    """
    Translates each ayah in the given quran_obj into a vector of one-hot encoded characters using the given encoding.
    Create a sorted list containing every character in the Qur'an text provided and return it.
    :param quran_obj: An object containing surah objects.
    :type quran_obj: object
    :param quran_obj: A function that converts Arabic Qur'anic characters to a one-hot encoding.
    :type quran_obj: function (Arabic string => numpy 2darray)
    :param surahs_key: The key in quran_obj to the list of surah objects.
    :type surahs_key: string
    :param ayahs_key: The key in each surah object to the list of ayah objects in that surah.
    :type ayahs_key: string
    :param text_key: The key to the actual Qur'anic text in each ayah object.
    :type text_key: string
    :param num_key: The key in surah and ayah objects to the ordering of the surah or ayah.
    :type num_key: string
    :param name_key: The key in each surah object to the name of that surah.
    :type name_key: string
    :param bismillah_key: The key to the bismillah text in the first ayah object of each surah object.
    :type bismillah_key: string
    :returns: An object identical to the quran_obj but with one-hot encodings of all Arabic text (not names).
    :rtype: object
    """
    one_hot_quran_encoding = {}
    one_hot_quran_encoding[SURAHS_KEY] = []

    for surah_obj in quran_obj[surahs_key]:
        # Copy new surah object for one-hot Json container.
        one_hot_surah_obj            = {}
        one_hot_surah_obj[num_key]   = surah_obj[num_key]
        one_hot_surah_obj[name_key]  = surah_obj[name_key]
        one_hot_surah_obj[ayahs_key] = []

        for ayah_obj in surah_obj[ayahs_key]:
            ayah_text = ayah_obj[text_key]

            # Make new ayah object for one-hot Json container.
            one_hot_ayah_obj           = {}
            one_hot_ayah_obj[num_key]  = ayah_obj[num_key]
            one_hot_ayah_obj[text_key] = encoding_fn(ayah_text)

            if bismillah_key in ayah_obj:
                one_hot_ayah_obj[bismillah_key] = encoding_fn(ayah_obj[bismillah_key])

            one_hot_surah_obj[ayahs_key].append(one_hot_ayah_obj)
        one_hot_quran_encoding[surahs_key].append(one_hot_surah_obj)

    return one_hot_quran_encoding

In [38]:

# try:
with open(args.input_json_path, 'rb') as quran_json_file:

    # Import json file.
    quran_obj = json.load(quran_json_file)[QURAN_KEY]
#
# except:
#     print("Json file failed to open. Exiting script...")
#     return

# Get the list of every character in the Qur'an.
quranic_char_list = create_list_of_quranic_chars(quran_obj)

if args.verbose:
    print(quranic_char_list, ' has ', len(quranic_char_list), ' characters.')

encode_char_as_one_hot("بسم الله الرحمن الرحيم", dict((c, i) for i, c in enumerate(quranic_char_list)))

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
def run_script(args):
    """
    Runs the script to find all characters, generate the encoding, and translate and store it in the output file.
    """
    # try:
    with open(args.input_json_path, 'rb') as quran_json_file:

        # Import json file.
        quran_obj = json.load(quran_json_file)[QURAN_KEY]
    #
    # except:
    #     print("Json file failed to open. Exiting script...")
    #     return

    # Get the list of every character in the Qur'an.
    quranic_char_list = create_list_of_quranic_chars(quran_obj)

    if args.verbose:
        print(quranic_char_list, ' has ', len(quranic_char_list), ' characters.')

    # Create the one-hot encodings.
    char_to_int_map, \
    int_to_char_map, \
    encode_char_as_one_hot, \
    decode_one_hot_as_string = create_one_hot_encoding(quranic_char_list)

    if args.verbose:
        print("encode!")
        x = encode_char_as_one_hot("".join(quranic_char_list))
        print(x)
        print("decode!")
        print(decode_one_hot_as_string(x))

    # Generate the Qur'anic text in one-hot encoding.
    one_hot_quran_encoding = generate_a_one_hot_encoded_script(
        quran_obj,
        lambda string: encode_char_as_one_hot(string, char_to_int_map))

    # Create an object with the encoding and the two functions.
    full_object = {
        QURAN_KEY: one_hot_quran_encoding,
        ENCODING_MAP_KEY: encode_char_as_one_hot,
        DECODING_MAP_KEY: decode_one_hot_as_string,
        CHAR_TO_INT_MAP_KEY: char_to_int_map,
        INT_TO_CHAR_MAP_KEY: int_to_char_map
    }

    with open(args.output_pickle_path, 'wb') as one_hot_quran_pickle_file:
        pickle.dump(full_object, one_hot_quran_pickle_file)



In [18]:
def load_data(pickle_file):
    """
    A sample function to demonstrate how to load the object.
    """
    try:
        with open(pickle_file, 'rb') as one_hot_pickle:
            one_hot_obj = pickle.load(one_hot_pickle)

            print('Now, we can do things with it! Keys: ', one_hot_obj.keys())

    except:
        print("Pickle file failed to open. Exiting...")
        return


if __name__ == "__main__":
    run_script(args)

quranic_char_list:  [' ', 'ء', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ـ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي', 'ً', 'ٌ', 'ٍ', 'َ', 'ُ', 'ِ', 'ّ', 'ْ', 'ٓ', 'ٔ', 'ٰ', 'ٱ', 'ۜ', '۟', 'ۢ', 'ۣ', 'ۥ', 'ۦ', 'ۨ', '۪', '۫', '۬']
char_to_int:  {' ': 0, 'ء': 1, 'أ': 2, 'ؤ': 3, 'إ': 4, 'ئ': 5, 'ا': 6, 'ب': 7, 'ة': 8, 'ت': 9, 'ث': 10, 'ج': 11, 'ح': 12, 'خ': 13, 'د': 14, 'ذ': 15, 'ر': 16, 'ز': 17, 'س': 18, 'ش': 19, 'ص': 20, 'ض': 21, 'ط': 22, 'ظ': 23, 'ع': 24, 'غ': 25, 'ـ': 26, 'ف': 27, 'ق': 28, 'ك': 29, 'ل': 30, 'م': 31, 'ن': 32, 'ه': 33, 'و': 34, 'ى': 35, 'ي': 36, 'ً': 37, 'ٌ': 38, 'ٍ': 39, 'َ': 40, 'ُ': 41, 'ِ': 42, 'ّ': 43, 'ْ': 44, 'ٓ': 45, 'ٔ': 46, 'ٰ': 47, 'ٱ': 48, 'ۜ': 49, '۟': 50, 'ۢ': 51, 'ۣ': 52, 'ۥ': 53, 'ۦ': 54, 'ۨ': 55, '۪': 56, '۫': 57, '۬': 58}
int_to_char:  {0: ' ', 1: 'ء', 2: 'أ', 3: 'ؤ', 4: 'إ', 5: 'ئ', 6: 'ا', 7: 'ب', 8: 'ة', 9: 'ت', 10: 'ث', 11: 'ج', 12: 'ح', 13: 'خ', 14: 'د', 15: 'ذ',

In [19]:
try:
    with open('one-hot.pkl', 'rb') as one_hot_pickle:
        one_hot_obj = pickle.load(one_hot_pickle)
except:
    print("Pickle file failed to open. Exiting...")


one_hot_quran    = one_hot_obj[QURAN_KEY]
str_to_onehot_fn = one_hot_obj[ENCODING_MAP_KEY]
onehot_to_str_fn = one_hot_obj[DECODING_MAP_KEY]
char_to_int_map  = one_hot_obj[CHAR_TO_INT_MAP_KEY]
int_to_char_map  = one_hot_obj[INT_TO_CHAR_MAP_KEY]

encoding_fn = lambda string: str_to_onehot_fn(string, char_to_int_map)
decoding_fn = lambda one_hot: onehot_to_str_fn(one_hot, int_to_char_map)

In [20]:
def get_verse_in_quran_obj(one_hot_quran, surah_num, ayah_num):
    """
    Looks up and returns the (encoded or decoded) string in the quran object.
    """
    return one_hot_quran[SURAHS_KEY][surah_num][AYAHS_KEY][ayah_num][TEXT_KEY]

In [21]:
one_hot_ayah = get_verse_in_quran_obj(one_hot_quran, 1, 1)
string_ayah = decoding_fn(one_hot_ayah)
print(string_ayah)

['ذ', 'َ', 'ٰ', 'ل', 'ِ', 'ك', 'َ', ' ', 'ٱ', 'ل', 'ْ', 'ك', 'ِ', 'ت', 'َ', 'ٰ', 'ب', 'ُ', ' ', 'ل', 'َ', 'ا', ' ', 'ر', 'َ', 'ي', 'ْ', 'ب', 'َ', ' ', 'ف', 'ِ', 'ي', 'ه', 'ِ', ' ', 'ه', 'ُ', 'د', 'ً', 'ى', ' ', 'ل', 'ّ', 'ِ', 'ل', 'ْ', 'م', 'ُ', 'ت', 'ّ', 'َ', 'ق', 'ِ', 'ي', 'ن', 'َ']


In [22]:
split_unique_ayahs = True
ayah_nums = []
if split_unique_ayahs:
    unique_ayahs = defaultdict(list)
    for surah_num in range(1):
        for ayah_num in range(len(one_hot_quran[SURAHS_KEY][surah_num][AYAHS_KEY])):
            one_hot_ayah = get_verse_in_quran_obj(one_hot_quran, surah_num, ayah_num)
            string_ayah = decoding_fn(one_hot_ayah)
            unique_ayahs[string_ayah].append((surah_num, ayah_num))

    for ayah_string in unique_ayahs:
        identical_ayah_list = unique_ayahs[ayah_string]
        ayah_nums.append(identical_ayah_list)

else:
    for surah_num in range(1):
        for ayah_num in range(len(one_hot_quran[SURAHS_KEY][surah_num][AYAHS_KEY])):
            ayah_nums.append(ayah_num)

In [23]:
ayah_nums

[[(0, 0)], [(0, 1)], [(0, 2)], [(0, 3)], [(0, 4)], [(0, 5)], [(0, 6)]]

In [24]:
# This gives us a 60-20-20 split.
RANDOM_SEED = 1
TRAIN_SPLIT = 0.6
TEST_SPLIT  = 0.2
VALIDATION_SPLIT = 0.2
# Logic for splits
split1_percent = TRAIN_SPLIT + VALIDATION_SPLIT
split2_percent = 1.0 - (VALIDATION_SPLIT / split1_percent)

X_train_valid, X_test = train_test_split(ayah_nums,
                                          train_size=split1_percent,
                                          random_state=RANDOM_SEED,
                                          shuffle=True)
X_train, X_valid = train_test_split(X_train_valid,
                                    train_size=split2_percent,
                                    random_state=RANDOM_SEED,
                                    shuffle=True)
print(X_train)
print(X_test)
print(X_valid)

[[(0, 5)], [(0, 1)], [(0, 3)]]
[[(0, 6)], [(0, 2)]]
[[(0, 4)], [(0, 0)]]
