In [581]:
import pandas as pd
import random

In [582]:
def prepare_mcq_outputs(text_en, text_tl, mcq_options, row={}, kwargs={}):
    outputs = {
        "prompts": [{
            "text_en": text_en.format(**row, **kwargs),
            "text_tl": text_tl.format(**row, **kwargs),
            "mcq_options": mcq_options,
        }],
    }
    return outputs

In [583]:
def string2chars(string):
    return list(string)

def chars2string(char_list, add_space=False):
    if add_space:
        return ' '.join(char_list)
    return ''.join(char_list)

def get_random_char(string):
    return random.choice(list(string))

def same_string(string):
    return string

def delete_char(string, char_to_delete=None):
    if char_to_delete is None:
        char_to_delete = get_random_char(string)

    char_list = string2chars(string)
    return chars2string([char for char in char_list if char != char_to_delete])

def insert_char(string, preceding_char=None, char_to_insert=None):
    if preceding_char is None:
        preceding_char = get_random_char(string)
    if char_to_insert is None:
        char_to_insert = random.choice('abcdefghijklmnopqrstuvwxyz')

    result = []
    char_list = string2chars(string)
    for char in char_list:
        result.append(char)
        if char == preceding_char:
            result.append(char_to_insert)
    return chars2string(result)

def substitute_char(string, char_to_replace=None, char_to_substitute=None):
    if char_to_replace is None:
        char_to_replace = get_random_char(string)
    if char_to_substitute is None:
        remaining_chars = 'abcdefghijklmnopqrstuvwxyz'.replace(char_to_replace, '')
        char_to_substitute = get_random_char(remaining_chars)

    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        if char == char_to_replace:
            result.append(char_to_substitute)
        else:
            result.append(char)
    return chars2string(result)

def permute_char(string, char1=None, char2=None):
    if char1 is None:
        char1 = get_random_char(string)
    if char2 is None:
        remaining_string = string.replace(char1, '')
        if remaining_string:
            char2 = get_random_char(remaining_string)
        else:
            char2 = char1

    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        if char == char1:
            result.append(char2)
        elif char == char2:
            result.append(char1)
        else:
            result.append(char)
    return chars2string(result)

def duplicate_char(string, char_to_duplicate=None):
    if char_to_duplicate is None:
        char_to_duplicate = get_random_char(string)

    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        result.append(char)
        if char == char_to_duplicate:
            result.append(char)
    return chars2string(result)

def uppercase_char(string):
    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        result.append(char.upper())
    return chars2string(result)

def lowercase_char(string):
    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        result.append(char.lower())
    return chars2string(result)

def randomly_uppercase_char(string):
    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        if random.random() < 0.5:
            result.append(char.upper())
        else:
            result.append(char.lower())
    return chars2string(result)

def uppercase_first_half_char(string):
    result = [] 
    char_list = string2chars(string)
    half_index = len(char_list) // 2
    for i, char in enumerate(char_list):
        if i < half_index:
            result.append(char.upper())
        else:
            result.append(char.lower())
    return chars2string(result)

In [584]:
diacritic_map = {
    'á': 'a', 'à': 'a', 'â': 'a',
    'é': 'e', 'è': 'e', 'ê': 'e',
    'í': 'i', 'ì': 'i', 'î': 'i',
    'ó': 'o', 'ò': 'o', 'ô': 'o',
    'ú': 'u', 'ù': 'u', 'û': 'u',
    'ñ': 'n',
    'Á': 'A', 'À': 'A', 'Â': 'A',
    'É': 'E', 'È': 'E', 'Ê': 'E',
    'Í': 'I', 'Ì': 'I', 'Î': 'I',
    'Ó': 'O', 'Ò': 'O', 'Ô': 'O',
    'Ú': 'U', 'Ù': 'U', 'Û': 'U',
    'Ñ': 'N',
}

reverse_diacritic_map = {
    'a': ['á', 'à', 'â',],
    'e': ['é', 'è', 'ê',],
    'i': ['í', 'ì', 'î',],
    'o': ['ó', 'ò', 'ô',],
    'u': ['ú', 'ù', 'û',],
    'n': ['ñ'],
    'A': ['Á', 'À', 'Â',],
    'E': ['É', 'È', 'Ê',],
    'I': ['Í', 'Ì', 'Î',],
    'O': ['Ó', 'Ò', 'Ô',],
    'U': ['Ú', 'Ù', 'Û',],
    'N': ['Ñ'],
}

def normalize_diacritic(string):
    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        if char in diacritic_map:
            result.append(diacritic_map[char])
        else:
            result.append(char)
    return chars2string(result)

def diacritize(string):
    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        if char in reverse_diacritic_map:
            result.append(random.choice(reverse_diacritic_map[char]))
        else:
            result.append(char)
    return chars2string(result)

def randomly_diacritize(string):
    result = [] 
    char_list = string2chars(string)
    for char in char_list:
        if char in reverse_diacritic_map and random.random() < 0.5:
            result.append(random.choice(reverse_diacritic_map[char]))
        else:
            result.append(normalize_diacritic(char))
    return chars2string(result)

In [585]:
manipulations = {
    "none": same_string,
    "deletion": delete_char,
    "insertion": insert_char,
    "substitution": substitute_char,
    "permutation": permute_char,
    "duplication": duplicate_char,
}

def get_invalid_manipulations(target_manipulation="insertion"):
    return [(name, func) for name, func in manipulations.items() if name != target_manipulation]

def apply_manipulation_incorrectly(string, target_manipulation="deletion", kwargs={}):
    if target_manipulation == "deletion" and "char_to_delete" in kwargs:
        incorrect_char = kwargs["char_to_delete"]
        remaining_chars = string.replace(incorrect_char, '')
        if remaining_chars:
            incorrect_char = get_random_char(remaining_chars)
        return delete_char(string, char_to_delete=incorrect_char)
    elif target_manipulation == "insertion" and "preceding_char" in kwargs and "char_to_insert" in kwargs:
        preceding_char = kwargs["preceding_char"]
        char_to_insert = kwargs["char_to_insert"]
        remaining_chars = 'abcdefghijklmnopqrstuvwxyz'.replace(char_to_insert, '')
        incorrect_char_to_insert = get_random_char(remaining_chars)
        return insert_char(string, preceding_char=preceding_char, char_to_insert=incorrect_char_to_insert)
    elif target_manipulation == "substitution" and "char_to_replace" in kwargs and "char_to_substitute" in kwargs:
        char_to_replace = kwargs["char_to_replace"]
        char_to_substitute = kwargs["char_to_substitute"]
        remaining_chars = 'abcdefghijklmnopqrstuvwxyz'.replace(char_to_replace, '')
        remaining_chars = remaining_chars.replace(char_to_substitute, '')
        incorrect_char_to_substitute = get_random_char(remaining_chars)
        return substitute_char(string, char_to_replace=char_to_replace, char_to_substitute=incorrect_char_to_substitute)
    elif target_manipulation == "permutation" and "char1" in kwargs and "char2" in kwargs:
        char1 = kwargs["char1"]
        char2 = kwargs["char2"]
        remaining_chars = string.replace(char1, '')
        remaining_chars = remaining_chars.replace(char2, '')
        incorrect_char2 = get_random_char(remaining_chars)
        return permute_char(string, char1=char1, char2=incorrect_char2)
    elif target_manipulation == "duplication" and "char_to_duplicate" in kwargs:
        char_to_duplicate = kwargs["char_to_duplicate"]
        remaining_chars = string.replace(char_to_duplicate, '')
        if remaining_chars:
            incorrect_char_to_duplicate = get_random_char(remaining_chars)
        else:
            incorrect_char_to_duplicate = char_to_duplicate
        return duplicate_char(string, char_to_duplicate=incorrect_char_to_duplicate)

def manipulate_string(string, target_manipulation="deletion", kwargs={}):
    manipulation_functions = get_invalid_manipulations(target_manipulation=target_manipulation)
    chosen_functions = random.sample([func for name, func in manipulation_functions], 2)
    results = [func(string) for func in chosen_functions]

    incorrect_application = apply_manipulation_incorrectly(string, target_manipulation=target_manipulation, kwargs=kwargs)
    results.append(incorrect_application)
    return results

mutations = {
    "uppercase": uppercase_char,
    "lowercase": lowercase_char,
    "randomly_uppercase": randomly_uppercase_char,
    "uppercase_first_half": uppercase_first_half_char,
}

def get_invalid_mutations(target_mutation="uppercase"):
    return [(name, func) for name, func in mutations.items() if name != target_mutation]

def mutate_string(string, target_mutation="uppercase"):
    mutation_functions = get_invalid_mutations(target_mutation=target_mutation)
    chosen_functions = random.sample([func for name, func in mutation_functions], 3)
    results = [func(string) for func in chosen_functions]
    return results

def diacritize_string(string, correct_string):
    results = [
        same_string(string),
        diacritize(string),
        randomly_diacritize(string),
    ]

    if correct_string in results:
        results.remove(correct_string)
        results.append(chars2string(shuffle_chars(string2chars(correct_string))))
    return results

### MCQ samples

#### Single-row samples

In [586]:
def spell_string(string):
    return chars2string(string2chars(string), add_space=True)

def shuffle_chars(char_list):
    if len(char_list) <= 1:
        return char_list
    shuffled_list = char_list[:]
    while True:
        random.shuffle(shuffled_list)
        if shuffled_list != char_list:
            break
    return shuffled_list

def randomly_merge_chars(char_list):
    merged_list = []
    i = 0
    while i < len(char_list):
        if i < len(char_list) - 1 and random.random() < 0.5:
            # Merge two characters
            merged_list.append(char_list[i] + char_list[i + 1])
            i += 2
        elif i < len(char_list) - 2 and random.random() < 0.5:
            # Merge three characters
            merged_list.append(char_list[i] + char_list[i + 1] + char_list[i + 2])
            i += 3
        else:
            # Keep the character as-is
            merged_list.append(char_list[i])
            i += 1

    # Ensure at least one merge if the length is unchanged
    if len(merged_list) == len(char_list):
        idx = random.randint(0, len(char_list) - 2)
        merged_list = (
            char_list[:idx]
            + [char_list[idx] + char_list[idx + 1]]
            + char_list[idx + 2:]
        )
    return merged_list

def randomly_insert_char(char_list):
    idx = random.randint(0, len(char_list))
    char_to_insert = random.choice('abcdefghijklmnopqrstuvwxyz')
    return char_list[:idx] + [char_to_insert] + char_list[idx:]

def randomly_delete_char(char_list):
    if len(char_list) <= 1:
        return char_list
    idx = random.randint(0, len(char_list) - 1)
    return char_list[:idx] + char_list[idx + 1:]


def perturb_string(string):
    char_list = string2chars(string)

    perturbation_functions = [
        shuffle_chars,
        randomly_merge_chars,
        randomly_insert_char,
        randomly_delete_char,
    ]

    chosen_functions = random.sample(perturbation_functions, 3)
    results = [chars2string(func(char_list), add_space=True) for func in chosen_functions]
    return results

def create_mcq_spelling(row):
    text_en = 'Which option spells out "{normalized_word}"?'
    text_tl = 'Alin sa sumusunod ang nagbabaybay sa "{normalized_word}"?'

    mcq_correct = spell_string(row['normalized_word'])
    mcq_incorrect = perturb_string(row['normalized_word'])
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = {
        "prompts": [{
            "text_en": text_en.format(**row),
            "text_tl": text_tl.format(**row),
            "mcq_options": mcq_options,
        }],
    }
    return outputs

create_mcq_spelling({"normalized_word": "ako"})

{'prompts': [{'text_en': 'Which option spells out "ako"?',
   'text_tl': 'Alin sa sumusunod ang nagbabaybay sa "ako"?',
   'mcq_options': {'correct': 'a k o',
    'incorrect1': 'ak o',
    'incorrect2': 'o k a',
    'incorrect3': 'a o'}}]}

In [587]:
def create_mcq_deletion(row):
    text_en = 'Which option correctly removes every "{char_to_delete}" in "{normalized_word}"?'
    text_tl = 'Alin sa sumusunod ang nagtatanggal ng bawat "{char_to_delete}" sa "{normalized_word}"?'

    string = row['normalized_word']
    char_to_delete = get_random_char(string)
    kwargs = {"char_to_delete": char_to_delete}

    mcq_correct = delete_char(string, **kwargs)
    mcq_incorrect = manipulate_string(string, target_manipulation="deletion", kwargs=kwargs)
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = {
        "prompts": [{
            "text_en": text_en.format(**row, **kwargs),
            "text_tl": text_tl.format(**row, **kwargs),
            "mcq_options": mcq_options,
        }],
    }
    return outputs

create_mcq_deletion({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Which option correctly removes every "i" in "pinagkulangan"?',
   'text_tl': 'Alin sa sumusunod ang nagtatanggal ng bawat "i" sa "pinagkulangan"?',
   'mcq_options': {'correct': 'pnagkulangan',
    'incorrect1': 'pinaukglanuan',
    'incorrect2': 'pinaagkulaangaan',
    'incorrect3': 'pinagklangan'}}]}

In [588]:
def create_mcq_insertion(row):
    text_en = 'Which option correctly puts "{char_to_insert}" after every "{preceding_char}" in "{normalized_word}"?'
    text_tl = 'Alin sa sumusunod ang naglalagay ng "{char_to_insert}" pagkatapos ng bawat "{preceding_char}" sa "{normalized_word}"?'

    string = row['normalized_word']
    preceding_char = get_random_char(string)
    char_to_insert = random.choice('abcdefghijklmnopqrstuvwxyz')
    kwargs = {"preceding_char": preceding_char, "char_to_insert": char_to_insert}

    mcq_correct = insert_char(string, **kwargs)
    mcq_incorrect = manipulate_string(string, target_manipulation="insertion", kwargs=kwargs)
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs=kwargs)
    return outputs

create_mcq_insertion({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Which option correctly puts "e" after every "a" in "pinagkulangan"?',
   'text_tl': 'Alin sa sumusunod ang naglalagay ng "e" pagkatapos ng bawat "a" sa "pinagkulangan"?',
   'mcq_options': {'correct': 'pinaegkulaengaen',
    'incorrect1': 'pinaggkulanggan',
    'incorrect2': 'pinagkclangan',
    'incorrect3': 'pinabgkulabngabn'}}]}

In [589]:
def create_mcq_substitution(row):
    text_en = 'Which option correctly replaces every "{char_to_replace}" with "{char_to_substitute}" in "{normalized_word}"?'
    text_tl = 'Alin sa sumusunod ang pumapalit sa bawat "{char_to_replace}" gamit ng "{char_to_substitute}" sa "{normalized_word}"?'

    string = row['normalized_word']
    char_to_replace = get_random_char(string)
    remaining_chars = 'abcdefghijklmnopqrstuvwxyz'.replace(char_to_replace, '')
    char_to_substitute = get_random_char(remaining_chars)
    kwargs = {"char_to_replace": char_to_replace, "char_to_substitute": char_to_substitute}

    mcq_correct = substitute_char(string, **kwargs)
    mcq_incorrect = manipulate_string(string, target_manipulation="substitution", kwargs=kwargs)
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs=kwargs)
    return outputs

create_mcq_substitution({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Which option correctly replaces every "p" with "t" in "pinagkulangan"?',
   'text_tl': 'Alin sa sumusunod ang pumapalit sa bawat "p" gamit ng "t" sa "pinagkulangan"?',
   'mcq_options': {'correct': 'tinagkulangan',
    'incorrect1': 'pinagkulangan',
    'incorrect2': 'pinagkqulangan',
    'incorrect3': 'einagkulangan'}}]}

In [590]:
def create_mcq_permutation(row):
    text_en = 'Which option correctly swaps every "{char1}" with "{char2}" and vice versa in "{normalized_word}"?'
    text_tl = 'Alin sa sumusunod ang pumapalit sa bawat "{char1}" gamit ng "{char2}" at ang kabaligtarang din nito sa "{normalized_word}"?'

    string = row['normalized_word']
    char1 = get_random_char(string)
    remaining_string = string.replace(char1, '')
    char2 = get_random_char(remaining_string)
    kwargs = {"char1": char1, "char2": char2}

    mcq_correct = permute_char(string, **kwargs)
    mcq_incorrect = manipulate_string(string, target_manipulation="permutation", kwargs=kwargs)
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs=kwargs)
    return outputs

create_mcq_permutation({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Which option correctly swaps every "i" with "n" and vice versa in "pinagkulangan"?',
   'text_tl': 'Alin sa sumusunod ang pumapalit sa bawat "i" gamit ng "n" at ang kabaligtarang din nito sa "pinagkulangan"?',
   'mcq_options': {'correct': 'pniagkulaigai',
    'incorrect1': 'pinagkulangan',
    'incorrect2': 'pinagksulangan',
    'incorrect3': 'punagkilangan'}}]}

In [591]:
def create_mcq_duplication(row):
    text_en = 'Which option correctly duplicates every "{char_to_duplicate}" once in "{normalized_word}"?'
    text_tl = 'Alin sa sumusunod ang umuulit sa bawat "{char_to_duplicate}" nang isang beses sa "{normalized_word}"?'

    string = row['normalized_word']
    char_to_duplicate = get_random_char(string)
    kwargs = {"char_to_duplicate": char_to_duplicate}

    mcq_correct = duplicate_char(string, **kwargs)
    mcq_incorrect = manipulate_string(string, target_manipulation="duplication", kwargs=kwargs)
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs=kwargs)
    return outputs

create_mcq_duplication({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Which option correctly duplicates every "g" once in "pinagkulangan"?',
   'text_tl': 'Alin sa sumusunod ang umuulit sa bawat "g" nang isang beses sa "pinagkulangan"?',
   'mcq_options': {'correct': 'pinaggkulanggan',
    'incorrect1': 'pinakgulankan',
    'incorrect2': 'pingkulngn',
    'incorrect3': 'pinagkullangan'}}]}

In [592]:
def create_mcq_uppercasing(row):
    text_en = 'Which option correctly changes "{normalized_word}" to all uppercase?'
    text_tl = 'Alin sa sumusunod ang ginagawang malaki ang lahat ng titik sa "{normalized_word}"?'

    string = row['normalized_word']
    kwargs = {}

    mcq_correct = uppercase_char(string)
    mcq_incorrect = mutate_string(string, target_mutation="uppercase")
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs=kwargs)
    return outputs

create_mcq_uppercasing({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Which option correctly changes "pinagkulangan" to all uppercase?',
   'text_tl': 'Alin sa sumusunod ang ginagawang malaki ang lahat ng titik sa "pinagkulangan"?',
   'mcq_options': {'correct': 'PINAGKULANGAN',
    'incorrect1': 'pinagkulangan',
    'incorrect2': 'PINAGKulangan',
    'incorrect3': 'PinaGKUlangaN'}}]}

In [593]:
def create_mcq_lowercasing(row):
    text_en = 'Which option correctly changes "{normalized_word}" to all lowercase?'
    text_tl = 'Alin sa sumusunod ang ginagawang malaki ang lahat ng titik sa "{normalized_word}"?'

    string = row['normalized_word']
    kwargs = {}

    mcq_correct = lowercase_char(string)
    mcq_incorrect = mutate_string(string, target_mutation="lowercase")
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs=kwargs)
    return outputs

create_mcq_lowercasing({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Which option correctly changes "pinagkulangan" to all lowercase?',
   'text_tl': 'Alin sa sumusunod ang ginagawang malaki ang lahat ng titik sa "pinagkulangan"?',
   'mcq_options': {'correct': 'pinagkulangan',
    'incorrect1': 'PINAGKulangan',
    'incorrect2': 'pinagkulanGaN',
    'incorrect3': 'PINAGKULANGAN'}}]}

In [594]:
def create_mcq_diacritic_normalization(row):
    text_en = 'Which option correctly normalizes diacritics from "{word}"?'
    text_tl = 'Alin sa sumusunod ang nagtatanggal ng mga tuldik sa "{word}"?'

    string = row['word']
    kwargs = {}

    mcq_correct = normalize_diacritic(string)
    mcq_incorrect = diacritize_string(string, mcq_correct)
    mcq_options = {
        "correct": mcq_correct,
        "incorrect1": mcq_incorrect[0],
        "incorrect2": mcq_incorrect[1],
        "incorrect3": mcq_incorrect[2],
    }

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, row=row, kwargs=kwargs)
    return outputs

create_mcq_diacritic_normalization({"word": "kakúlangan"})

{'prompts': [{'text_en': 'Which option correctly normalizes diacritics from "kakúlangan"?',
   'text_tl': 'Alin sa sumusunod ang nagtatanggal ng mga tuldik sa "kakúlangan"?',
   'mcq_options': {'correct': 'kakulangan',
    'incorrect1': 'kakúlangan',
    'incorrect2': 'kâkúlâñgâñ',
    'incorrect3': 'kàkuláñgañ'}}]}

### Multi-row samples

In [595]:
def prepare_options(words, correct_word):
    incorrect_words = [word for word in words.keys() if word != correct_word]

    mcq_options = {
        "correct": correct_word,
        "incorrect1": incorrect_words[0],
        "incorrect2": incorrect_words[1],
        "incorrect3": incorrect_words[2],
    }

    return mcq_options

def extract_character_counts(rows, target, char):
    if char is not None:
        character_counts = {}
        for row in rows:
            character_counts[row[target]] = row[target].count(char)

        return character_counts

In [596]:
# def create_mcq_character(rows):
#     text_en = 'Which option contains exactly {count} "{character}"s?'
#     text_tl = 'Alin sa sumusunod ang naglalaman ng eksaktong {count} "{character}"?'

#     word1, word2, word3, word4 = rows[0]['normalized_word'], rows[1]['normalized_word'], rows[2]['normalized_word'], rows[3]['normalized_word']
#     character_list1 = list(word1)
#     character_list2 = list(word2)
#     character_list3 = list(word3)
#     character_list4 = list(word4)
#     word1_character_count = {char: word1.count(char) for char in set(character_list1)}
#     word2_character_count = {char: word2.count(char) for char in set(character_list2)}
#     word3_character_count = {char: word3.count(char) for char in set(character_list3)}
#     word4_character_count = {char: word4.count(char) for char in set(character_list4)}

#     # Choose a random character where the words have different counts, even if some of the words have zero count
#     possible_characters = list(set(character_list1 + character_list2 + character_list3 + character_list4))
#     word_list = [word1, word2, word3, word4]
#     random.shuffle(possible_characters)
#     character = None
#     word = None
#     count = 0
#     for character in possible_characters:
#         counts = [
#             word1_character_count.get(character, 0),
#             word2_character_count.get(character, 0),
#             word3_character_count.get(character, 0),
#             word4_character_count.get(character, 0),
#         ]
        
#         unique_counts = [count for count in counts if counts.count(count) == 1]
#         random.shuffle(unique_counts)
#         if len(unique_counts) > 0:
#             count = unique_counts[0]
#             index = counts.index(count)
#             word = word_list[index]
#             count = word_list[index].count(character)
#             break
#     if word is None:
#         # print("Could not find a character with different counts in the provided words.")
#         return None
    
#     correct_word = word
#     incorrect_words = [word for word in word_list if word != correct_word]
    
#     mcq_options = {
#         "correct": correct_word,
#         "incorrect1": incorrect_words[0],
#         "incorrect2": incorrect_words[1],
#         "incorrect3": incorrect_words[2],
#     }

#     kwargs = {"character": character, "count": count}
#     outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
#     return outputs

# create_mcq_character([
#     {"normalized_word": "lolomunan"},
#     {"normalized_word": "kuhol"},
#     {"normalized_word": "bulaga"},
#     {"normalized_word": "ako"},
# ])

In [597]:
def check_if_any_character_counts_are_unique(rows, target):
    words = {}
    for row in rows:
        words[row[target]] = {
            char: row[target].count(char) for char in set(list(row[target]))
        }
    
    possible_chars = list(set().union(*[set(counts.keys()) for counts in words.values()]))
    random.shuffle(possible_chars)

    for char in possible_chars:
        char_counts = []
        for char_count in words.values():
            char_counts.append(char_count.get(char, 0))

        if len(set(char_counts)) == 4:
            return char
        
    # print("Could not find a character with different counts in the provided words.")
    return None


data = [
    {"normalized_word": "alapaap"},
    {"normalized_word": "kuhol"},
    {"normalized_word": "bulaga"},
    {"normalized_word": "ako"},
]

character = check_if_any_character_counts_are_unique(
    data, 
    target="normalized_word",
)

print("CHARACTER", character)

character_counts = extract_character_counts(
    data,
    target="normalized_word",
    char=character,
)

print("CHARACTER_COUNTS", character_counts)

CHARACTER a
CHARACTER_COUNTS {'alapaap': 4, 'kuhol': 0, 'bulaga': 2, 'ako': 1}


In [598]:
def create_mcq_char_exactly_one(rows, target, char):
    character_counts = extract_character_counts(rows, target=target, char=char)

    target_count = 1
    correct_word = [word for word, count in character_counts.items() if count == target_count][0]
    mcq_options = prepare_options(character_counts, correct_word)
    kwargs = {"target_count": target_count, "char": char}

    text_en = 'Which option contains exactly {target_count} "{char}"s?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng eksaktong {target_count} "{char}"?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

def create_mcq_char_exactly(rows, target, char):
    character_counts = extract_character_counts(rows, target=target, char=char)

    correct_word = random.choice(list(character_counts.keys()))
    target_count = character_counts[correct_word]
    mcq_options = prepare_options(character_counts, correct_word)
    kwargs = {"target_count": target_count, "char": char}

    text_en = 'Which option contains exactly {target_count} "{char}"s?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng eksaktong {target_count} "{char}"?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

def create_mcq_char_most(rows, target, char):
    character_counts = extract_character_counts(rows, target=target, char=char)

    target_count = max(character_counts.values())
    correct_word = [word for word, count in character_counts.items() if count == target_count][0]
    mcq_options = prepare_options(character_counts, correct_word)
    kwargs = {"char": char}

    text_en = 'Which option contains the most number of "{char}"?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng pinakamaraming "{char}"?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

def create_mcq_char_least(rows, target, char):
    character_counts = extract_character_counts(rows, target=target, char=char)

    target_count = min(character_counts.values())
    correct_word = [word for word, count in character_counts.items() if count == target_count][0]
    mcq_options = prepare_options(character_counts, correct_word)
    kwargs = {"char": char}

    text_en = 'Which option contains the least number of "{char}"?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng pinakakaunting "{char}"?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

In [599]:
data = [
    {"normalized_word": "alapaap"},
    {"normalized_word": "kuhol"},
    {"normalized_word": "bulaga"},
    {"normalized_word": "ako"},
]

create_mcq_char_exactly(data, target="normalized_word", char='a')
create_mcq_char_most(data, target="normalized_word", char='a')
create_mcq_char_least(data, target="normalized_word", char='a')

{'prompts': [{'text_en': 'Which option contains the least number of "a"?',
   'text_tl': 'Alin sa sumusunod ang naglalaman ng pinakakaunting "a"?',
   'mcq_options': {'correct': 'kuhol',
    'incorrect1': 'alapaap',
    'incorrect2': 'bulaga',
    'incorrect3': 'ako'}}]}

In [600]:
diacritics = set("ÀÁÂÈÉÊÌÍÎÒÓÔÙÚÛàáâèéêìíîòóôùúûÑñ")

def check_if_diacritic(char):
    return 1 if char in diacritics else 0

In [601]:
def check_if_any_diacritic_counts_are_unique(rows, target):
    words = {}
    for row in rows:
        words[row[target]] = {}
        for char in row[target]:
            if check_if_diacritic(char):
                count = row[target].count(char)
                words[row[target]][char] = count
    
    possible_chars = list(set().union(*[set(counts.keys()) for counts in words.values()]))
    random.shuffle(possible_chars)

    for char in possible_chars:
        char_counts = []
        for char_count in words.values():
            char_counts.append(char_count.get(char, 0))

        if sum(char_counts) == 1:
            return char

    return None

data = [
    {"word": "búsag"},
    {"word": "gálit"},
    # {"word": "gó"},
    {"word": "gigil"},
    {"word": "lámat"},
]

character = check_if_any_diacritic_counts_are_unique(
    data, 
    target="word",
)

print("CHARACTER", character)

character_counts = extract_character_counts(
    data,
    target="word",
    char=character,
)

print("CHARACTER_COUNTS", character_counts)

CHARACTER ú
CHARACTER_COUNTS {'búsag': 1, 'gálit': 0, 'gigil': 0, 'lámat': 0}


In [602]:
uppercase = set("ABCDEFGHIJKLMNÑOPQRSTUVWXYZ")
uppercase_diacritics = set("ÀÁÂÈÉÊÌÍÎÒÓÔÙÚÛ")

def check_if_uppercase(char):
    return 1 if (char in uppercase or char in uppercase_diacritics) else 0

def check_if_any_uppercase_counts_are_unique(rows, target):
    words = {}
    for row in rows:
        words[row[target]] = {}
        for char in row[target]:
            if check_if_uppercase(char):
                count = row[target].count(char)
                words[row[target]][char] = count
    
    possible_chars = list(set().union(*[set(counts.keys()) for counts in words.values()]))
    random.shuffle(possible_chars)

    for char in possible_chars:
        char_counts = []
        for char_count in words.values():
            char_counts.append(char_count.get(char, 0))

        if sum(char_counts) == 1:
            return char

    return None

data = [
    {"word": "Búsag"},
    {"word": "gálit"},
    {"word": "Dr. Lap"},
    {"word": "lámat"},
]

character = check_if_any_uppercase_counts_are_unique(
    data, 
    target="word",
)

print("CHARACTER", character)

character_counts = extract_character_counts(
    data,
    target="word",
    char=character,
)

print("CHARACTER_COUNTS", character_counts)

CHARACTER D
CHARACTER_COUNTS {'Búsag': 0, 'gálit': 0, 'Dr. Lap': 1, 'lámat': 0}


In [603]:
x = 'a'
print(x)
x = 'a'.capitalize()
print(x)

a
A


In [604]:
def prepare_options_capitalize(words, correct_word):
    incorrect_words = [word for word in words.keys() if word != correct_word]

    mcq_options = {
        "correct": correct_word.capitalize(),
        "incorrect1": correct_word.lower(), # One incorrect option is the lowercase version of the correct word
        "incorrect2": incorrect_words[1].capitalize(),
        "incorrect3": incorrect_words[2].capitalize(),
    }

    return mcq_options

def create_mcq_uppercase_exactly_one(rows, target, char):
    character_counts = extract_character_counts(rows, target=target, char=char)

    target_count = 1
    correct_word = [word for word, count in character_counts.items() if count == target_count][0]
    mcq_options = prepare_options_capitalize(character_counts, correct_word)
    kwargs = {"target_count": target_count, "char": char}

    text_en = 'Which option contains exactly {target_count} "{char}"s?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng eksaktong {target_count} "{char}"?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

In [605]:
def check_if_row_lengths_are_unique(rows, target):
    string1, string2, string3, string4 = rows[0][target], rows[1][target], rows[2][target], rows[3][target]
    strings = {
        string1: len(string1), string2: len(string2), string3: len(string3), string4: len(string4)
    }

    lengths = list(strings.values())
    return len(lengths) == len(set(lengths))

def extract_length(rows, target):
    word1, word2, word3, word4 = rows[0][target], rows[1][target], rows[2][target], rows[3][target]
    words = {
        word1: len(word1), word2: len(word2), word3: len(word3), word4: len(word4)
    }
    return words

In [606]:
def create_mcq_length_exactly(rows):
    words = extract_length(rows, target="normalized_word")
    
    correct_word = random.choice(list(words.keys()))
    target_length = words[correct_word]
    mcq_options = prepare_options(words, correct_word)
    kwargs = {"target_length": target_length}

    text_en = 'Which option contains exactly {target_length} characters?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng eksaktong {target_length} titik?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

def create_mcq_length_most(rows):
    words = extract_length(rows, target="normalized_word")
    
    target_length = max(words.values())
    correct_word = [word for word, length in words.items() if length == target_length][0]
    mcq_options = prepare_options(words, correct_word)
    kwargs = {}

    text_en = 'Which option contains the most number of characters?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng pinakamaraming titik?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

def create_mcq_length_least(rows):
    words = extract_length(rows, target="normalized_word")
    
    target_length = min(words.values())
    correct_word = [word for word, length in words.items() if length == target_length][0]
    mcq_options = prepare_options(words, correct_word)
    kwargs = {}

    text_en = 'Which option contains the least number of characters?'
    text_tl = 'Alin sa sumusunod ang naglalaman ng pinakakaunting titik?'

    outputs = prepare_mcq_outputs(text_en, text_tl, mcq_options, kwargs=kwargs)
    
    return outputs

In [607]:
create_mcq_length_exactly([
    {"normalized_word": "lolomunan"},
    {"normalized_word": "kuhol"},
    {"normalized_word": "bulaga"},
    {"normalized_word": "ako"},
])

{'prompts': [{'text_en': 'Which option contains exactly 6 characters?',
   'text_tl': 'Alin sa sumusunod ang naglalaman ng eksaktong 6 titik?',
   'mcq_options': {'correct': 'bulaga',
    'incorrect1': 'lolomunan',
    'incorrect2': 'kuhol',
    'incorrect3': 'ako'}}]}

### GEN samples

In [608]:
def prepare_gen_outputs(text_en, text_tl, label, row={}, kwargs={}):
    outputs = {
        "prompts": [{
            "text_en": text_en.format(**row, **kwargs),
            "text_tl": text_tl.format(**row, **kwargs),
        }],
        "label": label
    }
    return outputs

In [None]:
def create_gen_spelling(row):
    text_en = 'Spell out the word "{normalized_word}".'
    text_tl = 'Baybayin ang salitang "{normalized_word}".'

    spelling = string2chars(row['normalized_word'])
    label = chars2string(spelling, add_space=True)
    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row)
    return outputs

create_gen_spelling({"normalized_word": "ako"})

{'prompts': [{'text_en': 'Spell out the word "ako".',
   'text_tl': 'Baybayin ang salitang "ako".'}],
 'label': 'a k o'}

In [None]:
def create_gen_character(row):
    text_en = 'How many "{character}"s are in "{normalized_word}"?'
    text_tl = 'Ilang "{character}" ang mayroon sa "{normalized_word}".'

    character_list = string2chars(row['normalized_word'])
    character_counts = {char: character_list.count(char) for char in set(character_list)}
    random_character = random.choice(list(character_counts.keys()))
    label = character_counts[random_character]
    kwargs = {"character": random_character}

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_character({"normalized_word": "kakulangan"})

{'prompts': [{'text_en': 'How many "k"s are in "kakulangan"?',
   'text_tl': 'Ilang "k" ang mayroon sa "kakulangan".'}],
 'label': 2}

In [None]:
def create_gen_diacritic(row):
    text_en = 'How many diacritics are in "{word}"?'
    text_tl = 'Ilang titik ang mayroong tuldik sa "{word}".'

    character_list = string2chars(row['word'])
    diacritic_counts = {char: character_list.count(char) for char in set(character_list) if check_if_diacritic(char)}
    label = sum(diacritic_counts.values()) if diacritic_counts else 0
    kwargs = {}

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_diacritic({"word": "espásiyo"})

{'prompts': [{'text_en': 'How many diacritics are in "espásiyo"?',
   'text_tl': 'Ilang titik ang mayroong tuldik sa "espásiyo".'}],
 'label': 1}

In [None]:
def create_gen_uppercase(row):
    text_en = 'How many uppercase characters are in "{normalized_word}"?'
    text_tl = 'Ilang malaking titik ang mayroon sa "{normalized_word}".'

    character_list = string2chars(row['normalized_word'])
    uppercase_counts = {char: character_list.count(char) for char in set(character_list) if char.isupper()}
    label = sum(uppercase_counts.values()) if uppercase_counts else 0
    kwargs = {}

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_uppercase({"normalized_word": "Español"})

{'prompts': [{'text_en': 'How many uppercase characters are in "Español"?',
   'text_tl': 'Ilang malaking titik ang mayroon sa "Español".'}],
 'label': 1}

In [None]:
def create_gen_lowercase(row):
    text_en = 'How many lowercase characters are in "{normalized_word}"?'
    text_tl = 'Ilang maliliit na titik ang mayroon sa "{normalized_word}".'

    character_list = string2chars(row['normalized_word'])
    lowercase_counts = {char: character_list.count(char) for char in set(character_list) if char.islower()}
    label = sum(lowercase_counts.values()) if lowercase_counts else 0
    kwargs = {}

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_lowercase({"normalized_word": "Español"})

{'prompts': [{'text_en': 'How many lowercase characters are in "Español"?',
   'text_tl': 'Ilang maliliit na titik ang mayroon sa "Español".'}],
 'label': 6}

In [None]:
def create_gen_length(row):
    text_en = 'How many characters are in the "{normalized_word}"?'
    text_tl = 'Ilan ang titik sa "{normalized_word}"?'

    label = len(row['normalized_word'])
    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row)
    return outputs

create_gen_length({"normalized_word": "ako"})

{'prompts': [{'text_en': 'How many characters are in the "ako"?',
   'text_tl': 'Ilan ang titik sa "ako"?'}],
 'label': 3}

In [None]:
def create_gen_deletion(row):
    text_en = 'Remove every "{char_to_delete}" in "{normalized_word}".'
    text_tl = 'Tanggalin ang bawat "{char_to_delete}" sa "{normalized_word}".'

    string = row['normalized_word']
    char_to_delete = get_random_char(string)
    kwargs = {"char_to_delete": char_to_delete}
    label = delete_char(string, **kwargs)

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_deletion({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Remove every "a" in "pinagkulangan".',
   'text_tl': 'Tanggalin ang bawat "a" sa "pinagkulangan".'}],
 'label': 'pingkulngn'}

In [None]:
def create_gen_insertion(row):
    text_en = 'Put a "{char_to_insert}" after every "{preceding_char}" in "{normalized_word}"'
    text_tl = 'Maglagay ng "{char_to_insert}" pagkatapos ng bawat "{preceding_char}" sa "{normalized_word}"'

    string = row['normalized_word']
    preceding_char = get_random_char(string)
    char_to_insert = random.choice('abcdefghijklmnopqrstuvwxyz')
    kwargs = {"preceding_char": preceding_char, "char_to_insert": char_to_insert}
    label = insert_char(string, **kwargs)

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_insertion({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Put a "u" after every "u" in "pinagkulangan"',
   'text_tl': 'Maglagay ng "u" pagkatapos ng bawat "u" sa "pinagkulangan"'}],
 'label': 'pinagkuulangan'}

In [None]:
def create_gen_substitution(row):
    text_en = 'Replace every "{char_to_replace}" with "{char_to_substitute}" in "{normalized_word}".'
    text_tl = 'Palitan ang bawat "{char_to_replace}" gamit ng "{char_to_substitute}" sa "{normalized_word}".'

    string = row['normalized_word']
    char_to_replace = get_random_char(string)
    remaining_chars = 'abcdefghijklmnopqrstuvwxyz'.replace(char_to_replace, '')
    char_to_substitute = get_random_char(remaining_chars)
    kwargs = {"char_to_replace": char_to_replace, "char_to_substitute": char_to_substitute}
    label = substitute_char(string, **kwargs)

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_substitution({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Replace every "n" with "x" in "pinagkulangan".',
   'text_tl': 'Palitan ang bawat "n" gamit ng "x" sa "pinagkulangan".'}],
 'label': 'pixagkulaxgax'}

In [None]:
def create_gen_permutation(row):
    text_en = 'Swap every "{char1}" with "{char2}" in "{normalized_word}".'
    text_tl = 'Palitan ang bawat "{char1}" gamit ng "{char2}" at ang kabaligtarang din nito sa "{normalized_word}".'

    string = row['normalized_word']
    char1 = get_random_char(string)
    remaining_string = string.replace(char1, '')
    char2 = get_random_char(remaining_string)
    kwargs = {"char1": char1, "char2": char2}
    label = permute_char(string, **kwargs)

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_permutation({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Swap every "n" with "p" in "pinagkulangan".',
   'text_tl': 'Palitan ang bawat "n" gamit ng "p" at ang kabaligtarang din nito sa "pinagkulangan".'}],
 'label': 'nipagkulapgap'}

In [None]:
def create_gen_duplication(row):
    text_en = 'Duplicate every "{char_to_duplicate}" once in "{normalized_word}".'
    text_tl = 'Ulitin ang bawat "{char_to_duplicate}" nang isang beses sa "{normalized_word}".'

    string = row['normalized_word']
    char_to_duplicate = get_random_char(string)
    kwargs = {"char_to_duplicate": char_to_duplicate}
    label = duplicate_char(string, **kwargs)

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs=kwargs)
    return outputs

create_gen_duplication({"normalized_word": "pinagkulangan"})

{'prompts': [{'text_en': 'Duplicate every "g" once in "pinagkulangan".',
   'text_tl': 'Ulitin ang bawat "g" nang isang beses sa "pinagkulangan".'}],
 'label': 'pinaggkulanggan'}

In [None]:
def create_gen_uppercasing(row):
    text_en = 'Change "{normalized_word}" into uppercase.'
    text_tl = 'Gawing malaki ang lahat ng titik sa "{normalized_word}".'
    
    label = row["normalized_word"].upper()

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs={})
    return outputs

create_gen_uppercasing({"normalized_word": "Español"})

{'prompts': [{'text_en': 'Change "Español" into uppercase.',
   'text_tl': 'Gawing malaki ang lahat ng titik sa "Español".'}],
 'label': 'ESPAÑOL'}

In [None]:
def create_gen_lowercasing(row):
    text_en = 'Change "{normalized_word}" into lowercase.'
    text_tl = 'Gawing maliit ang lahat ng titik sa "{normalized_word}".'

    label = row["normalized_word"].lower()

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs={})
    return outputs

create_gen_lowercasing({"normalized_word": "Español"})

{'prompts': [{'text_en': 'Change "Español" into lowercase.',
   'text_tl': 'Gawing maliit ang lahat ng titik sa "Español".'}],
 'label': 'español'}

In [None]:
def create_gen_diacritic_normalization(row):
    text_en = 'Normalize the diacritics from "{word}".'
    text_tl = 'Tanggalin ang lahat ng mga tuldik sa "{word}".'
    
    label = normalize_diacritic(row["word"])

    outputs = prepare_gen_outputs(text_en, text_tl, str(label), row=row, kwargs={})
    return outputs

create_gen_diacritic_normalization({"word": "Español"})

{'prompts': [{'text_en': 'Normalize the diacritics from "Español".',
   'text_tl': 'Tanggalin ang lahat ng mga tuldik sa "Español".'}],
 'label': 'Espanol'}

## Create data points

In [623]:
random.seed(100)
num_samples = 20
syllables = pd.read_json("data/syllables.jsonl", lines=True)

In [624]:
tasks = {
    "mcq": {
        "composition": {
            "spelling": create_mcq_spelling,
            "char_exactly": create_mcq_char_exactly,
            "char_least": create_mcq_char_least,
            "char_most": create_mcq_char_most,
            "diacritic_exactly": create_mcq_char_exactly_one,
            "uppercase_exactly": create_mcq_uppercase_exactly_one,
            "length_exactly": create_mcq_length_exactly,
            "length_least": create_mcq_length_least,
            "length_most": create_mcq_length_most,
        },
        "manipulation": {
            "insertion": create_mcq_insertion,
            "deletion": create_mcq_deletion,
            "substitution": create_mcq_substitution,
            "permutation": create_mcq_permutation,
            "duplication": create_mcq_duplication,
            "uppercasing": create_mcq_uppercasing,
            "lowercasing": create_mcq_lowercasing,
            "diacritic_normalization": create_mcq_diacritic_normalization,
        }
    },
    "gen": {
        "composition": {
            "spelling": create_gen_spelling,
            "character": create_gen_character,
            "diacritic": create_gen_diacritic,
            "uppercase": create_gen_uppercase,
            "length": create_gen_length,
        },
        "manipulation": {
            "insertion": create_gen_insertion,
            "deletion": create_gen_deletion,
            "substitution": create_gen_substitution,
            "permutation": create_gen_permutation,
            "duplication": create_gen_duplication,
            "uppercasing": create_gen_uppercasing,
            "lowercasing": create_gen_lowercasing,
            "diacritic_normalization": create_gen_diacritic_normalization,
        },
    }
}

int2label = {0: "A", 1: "B", 2: "C", 3: "D"}

In [625]:
def create_mcq_multiple_row_dataset(dataset, category_name, subcategory_name, subcategory_function, num_samples):
    processed_data = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
    shuffled_dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
    for _, rows in shuffled_dataset.groupby(lambda x: x // 4):
        if len(processed_data) >= num_samples:
            break

        samples = rows.to_dict(orient="records")
        mcq_row = None
        if "length" in subcategory_name:
            valid_length = check_if_row_lengths_are_unique(samples, target="normalized_word")
            if valid_length:
                mcq_row = subcategory_function(samples)
        elif "char" in subcategory_name:
            valid_char = check_if_any_character_counts_are_unique(samples, target="normalized_word")
            if valid_char is not None:
                mcq_row = subcategory_function(samples, target="normalized_word", char=valid_char)
        elif "diacritic_exactly" in subcategory_name:
            valid_diacritic = check_if_any_diacritic_counts_are_unique(samples, target="word")
            if valid_diacritic:
                mcq_row = subcategory_function(samples, target="word", char=valid_diacritic)
        elif "diacritic_exactly" in subcategory_name:
            valid_uppercase = check_if_any_uppercase_counts_are_unique(samples, target="word")
            if valid_uppercase:
                mcq_row = subcategory_function(samples, target="word", char=valid_uppercase)

        if mcq_row is not None:
            processed_data = pd.concat([processed_data, pd.DataFrame({
                "category": [category_name],
                "subcategory": [subcategory_name],
                "prompts": [mcq_row["prompts"]],
            })], ignore_index=True)
    
    return processed_data

In [626]:
def create_mcq_single_row_dataset(dataset, category_name, subcategory_name, subcategory_function, num_samples):
    processed_data = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
    for _, row in dataset.sample(num_samples).iterrows():
        mcq_row = subcategory_function(row)
        processed_data = pd.concat([processed_data, pd.DataFrame({
            "category": [category_name],
            "subcategory": [subcategory_name],
            "prompts": [mcq_row["prompts"]],
        })], ignore_index=True)
    
    return processed_data

In [627]:
def create_mcq_dataset(dataset, num_samples):
    mcq_dataset = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
    for category_name, category_value in tasks['mcq'].items():
        mcq_dataset_cat = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
        for subcategory_name, subcategory_function in category_value.items():
            mcq_dataset_subcat = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
            if 'char' in subcategory_name or 'length' in subcategory_name:
                mcq_dataset_subcat = create_mcq_multiple_row_dataset(dataset, category_name, subcategory_name, subcategory_function, num_samples)
            elif 'diacritic_exactly' in subcategory_name or 'uppercase_exactly' in subcategory_name:
                mcq_dataset_subcat = create_mcq_multiple_row_dataset(dataset, category_name, subcategory_name, subcategory_function, num_samples)
            else:
                mcq_dataset_subcat = create_mcq_single_row_dataset(dataset, category_name, subcategory_name, subcategory_function, num_samples)

            # Recreate options so that they are stratified/distributed evenly based on the number of samples
            # This is to avoid having too many correct options being the same choice (e.g., choice1)
            # {"label": "A/B/C/D", "prompts": [{"text_en": "...", "mcq_option": {...}, "choice1": "...", ... "choice4": "..."}]}
            for i in range(len(mcq_dataset_subcat)):
                label_index = i % 4
                correct = mcq_dataset_subcat.iloc[i]['prompts'][0]["mcq_options"]['correct']
                options = [
                    mcq_dataset_subcat.iloc[i]['prompts'][0]["mcq_options"]['incorrect1'],
                    mcq_dataset_subcat.iloc[i]['prompts'][0]["mcq_options"]['incorrect2'],
                    mcq_dataset_subcat.iloc[i]['prompts'][0]["mcq_options"]['incorrect3'],
                ]
                random.shuffle(options)

                options.insert(label_index, correct)
                choices = {
                    "choice1": options[0],
                    "choice2": options[1],
                    "choice3": options[2],
                    "choice4": options[3],
                }
                label = int2label[label_index]
                mcq_dataset_subcat.at[i, 'prompts'][0].update(choices)
                mcq_dataset_subcat.at[i, 'label'] = label

            mcq_dataset_cat = pd.concat([mcq_dataset_cat, mcq_dataset_subcat], ignore_index=True)
        mcq_dataset  = pd.concat([mcq_dataset, mcq_dataset_cat], ignore_index=True)
    return mcq_dataset

In [638]:
def create_gen_dataset_samples(dataset, category_name, subcategory_name, subcategory_function, num_samples, filter_function=None):
    processed_data = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
    for _, row in dataset.sample(num_samples * 100).iterrows():
        if len(processed_data) >= num_samples:
            break

        if filter_function is not None:
            if not filter_function(row):
                continue
        gen_row = subcategory_function(row)
        processed_data = pd.concat([processed_data, pd.DataFrame({
            "category": [category_name],
            "subcategory": [subcategory_name],
            "prompts": [gen_row["prompts"]],
            "label": [gen_row["label"]],
        })], ignore_index=True)
    return processed_data

In [639]:
def create_gen_dataset(dataset, num_samples): 
    gen_dataset = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
    for category_name, category_value in tasks['gen'].items():
        gen_dataset_cat = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])
        for subcategory_name, subcategory_function in category_value.items():
            gen_dataset_subcat = pd.DataFrame(columns=["category", "subcategory", "prompts", "label"])

            if "diacritic" in subcategory_name:
                gen_dataset_subcat = create_gen_dataset_samples(
                    dataset, 
                    category_name, 
                    subcategory_name, 
                    subcategory_function, 
                    num_samples, 
                    filter_function=lambda row: any(check_if_diacritic(char) for char in row['word'])
                )
            elif "uppercase" in subcategory_name:
                gen_dataset_subcat = create_gen_dataset_samples(
                    dataset, 
                    category_name, 
                    subcategory_name, 
                    subcategory_function, 
                    num_samples, 
                    filter_function=lambda row: any(char.isupper() for char in row['normalized_word'])
                )
            elif "lowercasing" in subcategory_name:
                gen_dataset_subcat = create_gen_dataset_samples(
                    dataset, 
                    category_name, 
                    subcategory_name, 
                    subcategory_function, 
                    num_samples, 
                    filter_function=lambda row: any(char.isupper() for char in row['normalized_word'])
                )
            elif "uppercasing" in subcategory_name:
                gen_dataset_subcat = create_gen_dataset_samples(
                    dataset, 
                    category_name, 
                    subcategory_name, 
                    subcategory_function, 
                    num_samples, 
                    filter_function=lambda row: any(char.islower() for char in row['normalized_word'])
                )
            else:
                gen_dataset_subcat = create_gen_dataset_samples(dataset, category_name, subcategory_name, subcategory_function, num_samples)

            gen_dataset_cat = pd.concat([gen_dataset_cat, gen_dataset_subcat], ignore_index=True)
        gen_dataset  = pd.concat([gen_dataset, gen_dataset_cat], ignore_index=True)
    return gen_dataset

In [640]:
mcq_dataset = create_mcq_dataset(syllables, num_samples)
mcq_dataset.to_json("data/mcq_composition_manipulation_dataset.jsonl", lines=True, orient="records", force_ascii=False)
print(mcq_dataset['subcategory'].value_counts())
mcq_dataset.tail()

subcategory
spelling                   20
char_exactly               20
char_least                 20
char_most                  20
diacritic_exactly          20
length_exactly             20
length_least               20
length_most                20
insertion                  20
deletion                   20
substitution               20
permutation                20
duplication                20
uppercasing                20
lowercasing                20
diacritic_normalization    20
Name: count, dtype: int64


Unnamed: 0,category,subcategory,prompts,label
315,manipulation,diacritic_normalization,[{'text_en': 'Which option correctly normalize...,D
316,manipulation,diacritic_normalization,[{'text_en': 'Which option correctly normalize...,A
317,manipulation,diacritic_normalization,[{'text_en': 'Which option correctly normalize...,B
318,manipulation,diacritic_normalization,[{'text_en': 'Which option correctly normalize...,C
319,manipulation,diacritic_normalization,[{'text_en': 'Which option correctly normalize...,D


In [641]:
i = 123
print(mcq_dataset.iloc[i]['label'])
mcq_dataset.iloc[i]['prompts'][0]

D


{'text_en': 'Which option contains the least number of characters?',
 'text_tl': 'Alin sa sumusunod ang naglalaman ng pinakakaunting titik?',
 'mcq_options': {'correct': 'lunan',
  'incorrect1': 'makyat',
  'incorrect2': 'kalimbahin',
  'incorrect3': 'talibugso'},
 'choice1': 'talibugso',
 'choice2': 'kalimbahin',
 'choice3': 'makyat',
 'choice4': 'lunan'}

In [642]:
gen_dataset = create_gen_dataset(syllables, num_samples)
gen_dataset.to_json("data/gen_composition_manipulation_dataset.jsonl", lines=True, orient="records", force_ascii=False)
print(gen_dataset['subcategory'].value_counts())
gen_dataset.tail()

subcategory
spelling                   20
character                  20
diacritic                  20
uppercase                  20
length                     20
insertion                  20
deletion                   20
substitution               20
permutation                20
duplication                20
uppercasing                20
lowercasing                20
diacritic_normalization    20
Name: count, dtype: int64


Unnamed: 0,category,subcategory,prompts,label
255,manipulation,diacritic_normalization,"[{'text_en': 'Normalize the diacritics from ""d...",dayag
256,manipulation,diacritic_normalization,"[{'text_en': 'Normalize the diacritics from ""b...",bayubay
257,manipulation,diacritic_normalization,"[{'text_en': 'Normalize the diacritics from ""l...",liki
258,manipulation,diacritic_normalization,"[{'text_en': 'Normalize the diacritics from ""i...",ingolot
259,manipulation,diacritic_normalization,"[{'text_en': 'Normalize the diacritics from ""s...",sapola
