# HomeAssistant BERT Training Data generation

This notebook is used to generate data to train the BERT model for using sentences in Catalan.

It is based in the existing intent definition in catalan in:
https://github.com/home-assistant/intents/tree/main/sentences/ca

Data from that repository is not to train a BERT system but for using it as a phrase structure to interpret the senteces to generate intents.

In this notebook, we will expand those phrases to be able to use them to train a BERT system.


## Install required dependencies

In [26]:
%pip install pyyaml pandas

Note: you may need to restart the kernel to use updated packages.


## Import required libs

In [27]:
import yaml
import os
import re
import itertools
import pandas as pd

In [79]:
def load_expansion_rules(common_file_path):
    """
    Load expansion rules from the _common.yaml file.
    """
    print(f"Loading expansion rules from {common_file_path}")
    with open(common_file_path, 'r', encoding='utf-8') as f:
        content = yaml.safe_load(f)
    return content.get('expansion_rules', {})

def expand_rules(sentence, expansion_rules):
    """
    Expand rules in the sentence using the provided expansion rules.
    """
    while '<' in sentence and '>' in sentence:
        match = re.search(r'<(.*?)>', sentence)
        if not match:
            break
        rule_name = match.group(1)
        rule_expansion = expansion_rules.get(rule_name, f"<{rule_name}>")
        print(f"Expanding rule: {rule_name} -> {rule_expansion}")
        old_sentence = sentence
        sentence = sentence.replace(f"<{rule_name}>", rule_expansion, 1)
        if sentence == old_sentence:
            print(f"Warning: No expansion found for {rule_name}. Keeping original.")
            break
    return sentence

def expand_sentence(sentence, expansion_rules):
    sentences = [sentence]
    outsentences=[]
    for sentence in sentences:
        outsentences.append(expand_rules(sentence, expansion_rules))

    sentences = outsentences
    
    expanded=True
    while expanded:
        print("sentences:",sentences)
        expanded=False
        outsentences = []
        for sentence in sentences:
            expanded_sentences,expanded_inner=expand_blocks(sentence)
            if expanded_inner:
                expanded=True
            for expanded_sentence in expanded_sentences:
                outsentences.append(expanded_sentence)
            print("outsentences:",outsentences)
        #remove duplicates
        for i in range(len(outsentences)):
            for j in range(i+1, len(outsentences)):
                if outsentences[i] == outsentences[j]:
                    outsentences.pop(j)
                    break
        
        sentences = outsentences

    return sentences


def expand_sentence_x(sentence, expansion_rules):
    """
    Expand phrases between [], (), and <> and maintain entities {name}.
    Handles nested expandable blocks in a single sentence.
    """
    # Primer, expandim els blocs entre parèntesis com un nivell superior
    if '(' in sentence and ')' in sentence:
        parts = re.split(r'(\(.*?\))', sentence)
        expanded_sentences = []
        
        for part in parts:
            if part.startswith('(') and part.endswith(')'):
                options = part[1:-1].split('|')
                if not expanded_sentences:
                    expanded_sentences = options
                else:
                    expanded_sentences = [
                        f"{prev}{opt}" for prev in expanded_sentences for opt in options
                    ]
            else:
                if not expanded_sentences:
                    expanded_sentences = [part]
                else:
                    expanded_sentences = [f"{prev}{part}" for prev in expanded_sentences]
    else:
        expanded_sentences = [sentence]

    # Ara, expandim els blocs entre claudàtors dins de cada frase generada
    final_sentences = []
    for expanded in expanded_sentences:
        parts = re.split(r'(\[.*?\])', expanded)
        tokens = []

        for part in parts:
            if part.startswith('[') and part.endswith(']'):
                options = part[1:-1].split('|')
                tokens.append(options)
            else:
                tokens.append([part])

        combinations = list(itertools.product(*tokens))
        final_sentences.extend([''.join(combo).strip() for combo in combinations])

    #return final_sentences

    # Finalment, expandim els blocs entre <rule> utilitzant les expansion_rules
    fully_expanded_sentences = []
    for sentence in final_sentences:
        while '<' in sentence and '>' in sentence:
            match = re.search(r'<(.*?)>', sentence)
            if not match:
                break
            rule_name = match.group(1)
            rule_expansion = expansion_rules.get(rule_name, f"<{rule_name}>")
            print(f"Expanding rule: {rule_name} -> {rule_expansion}")
            old_sentence = sentence
            sentence = sentence.replace(f"<{rule_name}>", rule_expansion, 1)
            if sentence == old_sentence:
                print(f"Warning: No expansion found for {rule_name}. Keeping original.")
                break
        fully_expanded_sentences.append(sentence)

    return fully_expanded_sentences

def load_sentences_from_yaml(file_path, expansion_rules):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = yaml.safe_load(f)

    data = []
    # Navigate through the YAML structure
    for intent_name, intent_data in content.get('intents', {}).items():
        for item in intent_data.get('data', []):
            sentences = item.get('sentences', [])
            for sentence in sentences:
                expanded = expand_sentence(sentence,expansion_rules)
                for s in expanded:
                    data.append({'sentence': s, 'intent': intent_name})
    return data

def process_directory(yaml_dir):
    all_data = []
    # Process general YAML files
    common_file_path = os.path.join(yaml_dir, "_common.yaml")
    expansion_rules = load_expansion_rules(common_file_path)

    # Process each YAML file in the directory
    for file_name in os.listdir(yaml_dir):
        print(file_name)
        if file_name.endswith('.yaml') or file_name.endswith('.yml'):
            path = os.path.join(yaml_dir, file_name)
            all_data.extend(load_sentences_from_yaml(path,expansion_rules))
    return all_data


def expand_blocks(sentence):
    """
    Expand blocks in the sentence between the specified initial and end characters.
    """
    initial_chars = ['(','[']
    end_chars = [')',']']
    #print(sentence)
    expanded_sentences = []
    expanded=False
    #if sentence contains any of the initial characters and end characters
    if any(char in sentence for char in initial_chars) and any(char in sentence for char in end_chars):
        end_char_pos_found= False
        initial_char_pos2_found = False
        for initial_char_pos in range(len(sentence)):
            if sentence[initial_char_pos] in initial_chars:
                break;
        for end_char_pos in range(initial_char_pos+1, len(sentence)):
            if sentence[end_char_pos] in end_chars:
                end_char_pos_found = True
                break;

        for initial_char_pos2 in range(initial_char_pos+1, len(sentence)):
            if sentence[initial_char_pos2] in initial_chars:
                initial_char_pos2_found = True
                break;
        
        #print("initial_char_pos:",initial_char_pos)
        #print("end_char_pos:",end_char_pos)
        #print("initial_char_pos2:",initial_char_pos2)

        if end_char_pos_found and initial_char_pos2_found and initial_char_pos2 < end_char_pos:
            #execute the expansion recursive between the initial2 and end characters                       
            generatedsubstrings,expanded = expand_blocks(sentence[initial_char_pos2:end_char_pos+1])
            #print("generatedsubstrings:",generatedsubstrings)
            for generatedsubstring in generatedsubstrings:
                #print("generatedsubstring:",generatedsubstring)
                expanded_sentences.append(sentence[:initial_char_pos2] + generatedsubstring + sentence[end_char_pos+1:])
        else:
            #expand the sentence between the initial and end characters generate as may sentences as values separeted by |
            
            options = sentence[initial_char_pos+1:end_char_pos].split('|')
            #print("options:",options)
            for option in options:
                #print("option:",option)
                expanded_sentences.append(sentence[:initial_char_pos] + option + sentence[end_char_pos+1:])
                expanded=True

            #print("expanded_sentences1:",expanded_sentences)
        
    else:
        expanded_sentences = [sentence]
    print("expanded_sentences2:",expanded_sentences)
    #print("expanded:",expanded)
    
    return expanded_sentences, expanded


if __name__ == "__main__":
    #yaml_directory = r".\intents\sentences\ca"
    yaml_directory = r".\test_ca"
    output_csv = "hass_intents_ca.csv"

    data = process_directory(yaml_directory)
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False)
    print(f"Dataset generat amb {len(df)} frases i desat a: {output_csv}")

Loading expansion rules from .\test_ca\_common.yaml
test1.yaml
sentences: ['Inici frase [[Hola|Hi]|Adeu] fi frase']
expanded_sentences2: ['Hola', 'Hi']
expanded_sentences2: ['Inici frase [Hola|Adeu] fi frase', 'Inici frase [Hi|Adeu] fi frase']
outsentences: ['Inici frase [Hola|Adeu] fi frase', 'Inici frase [Hi|Adeu] fi frase']
sentences: ['Inici frase [Hola|Adeu] fi frase', 'Inici frase [Hi|Adeu] fi frase']
expanded_sentences2: ['Inici frase Hola fi frase', 'Inici frase Adeu fi frase']
outsentences: ['Inici frase Hola fi frase', 'Inici frase Adeu fi frase']
expanded_sentences2: ['Inici frase Hi fi frase', 'Inici frase Adeu fi frase']
outsentences: ['Inici frase Hola fi frase', 'Inici frase Adeu fi frase', 'Inici frase Hi fi frase', 'Inici frase Adeu fi frase']
sentences: ['Inici frase Hola fi frase', 'Inici frase Adeu fi frase', 'Inici frase Hi fi frase']
expanded_sentences2: ['Inici frase Hola fi frase']
outsentences: ['Inici frase Hola fi frase']
expanded_sentences2: ['Inici frase Ad