# NLU data reader

In [1]:
pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import re
from typing import List, Dict, Tuple

import pandas as pd
import yaml

NORMAL_REGEX = "\[[^\[\]]+\]\([\w+\s*]+\)"
ENTITY_REGEX = "\[[^\[\]]+\]"
ENTITY_NAME_REGEX = "\([\w+\s*]+\)"
SYNONYM_REGEX = "\[[^\[\]]+\]\{[^{}]+\}"
DATA_REGEX = "\{[^{}]+\}"

In [3]:
def get_entity_new(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, set]]:

  entities_list = {}
  for _, row in df.iterrows():
    entity_data = row["entities"]
    if not entity_data:
      entity_data = []

    if isinstance(entity_data, str):
      try:
        entity_data = json.loads(entity_data)
      except Exception as ex:
        raise RuntimeError(f"Cannot convert entity_data to json: {entity_data}")
      
    example = row["example"]
    while True:
      # search for only-object annotation
      x = re.search(NORMAL_REGEX, example)
      if x is None:
        break

      start, end = x.span()
      entity = x.group()

      entity_text = re.search(ENTITY_REGEX, entity).group()[1:-1]
      entity_name_text = re.search(ENTITY_NAME_REGEX, entity).group()[1:-1]

      if entity_name_text not in entities_list:
        entities_list[entity_name_text] = set()
      entities_list[entity_name_text].add("O")

      example = example.replace(entity, entity_text)

      entity_data.append(dict(
          entity=entity_text,
          entity_name=entity_name_text,
          position=(start, end - (len(entity) - len(entity_text)))
      ))

    # row["entities"] = entity_data

    while True:
      #search for detailed annotation
      x = re.search(SYNONYM_REGEX, example)
      if x is None:
        break

      start, end = x.span()
      entity = x.group()

      entity_text = re.search(ENTITY_REGEX, entity).group()[1:-1]
      detailed_text = re.search(DATA_REGEX, entity).group()

      try:
        detailed_data = json.loads(detailed_text)
      except Exception as ex:
        raise ValueError(f"Synonym json is incorrect: {detailed_text}")

      entity_name_text = detailed_data.get("entity", None)
      role_value = detailed_data.get("role", None)

      if entity_name_text is None:
        a = row["intent"]
        b = detailed_data
        raise ValueError(f"this annotation is wrong: {a}; {b}")

      example = example.replace(entity, entity_text)

      if role_value is None:
        if entity_name_text not in entities_list:
          entities_list[entity_name_text] = set()
        entities_list[entity_name_text].add("O")
      else:
        if entity_name_text not in entities_list:
          entities_list[entity_name_text] = set()
        entities_list[entity_name_text].add(role_value)

        entity_name_text = entity_name_text + "|" + role_value

      entity_data.append(dict(
          entity=entity_text,
          entity_name=entity_name_text,
          position=(start, end - (len(entity) - len(entity_text)))
      ))

    row["example"] = example
    row["entities"] = entity_data

  return df, entities_list

In [4]:
def read_from_yaml(file: str) -> List[Dict[str, str]]:
  """
  params:
  file: file location

  return: list(dict(text, any))
  """
  try:
    f = open(file, "r")
  except Exception as ex:
    raise RuntimeError(f"Cannot read file {file} with error:\t{ex}")

  data = yaml.load(f, Loader=yaml.FullLoader)["nlu"]
  return data

def get_entity(df: pd.DataFrame) -> pd.DataFrame:

  for _, row in df.iterrows():
    entity_data = row["entities"]
    if not entity_data:
      entity_data = []

    while True:
      example = row["example"]
      x = re.search(NORMAL_REGEX, example)
      if x is None:
        break

      start, end = x.span()
      entity = x.group()

      entity_text = re.search(ENTITY_REGEX, entity).group()[1:-1]
      entity_name_text = re.search(ENTITY_NAME_REGEX, entity).group()[1:-1]

      row["example"] = example.replace(entity, entity_text)

      entity_data.append(dict(
          entity=entity_text,
          entity_name=entity_name_text,
          position=(start, end - (len(entity) - len(entity_text)))
      ))

    row["entities"] = entity_data

  return df

def get_entity_with_role(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:

  for _, row in df.iterrows():
    entity_data = row["entities"]
    if not entity_data:
      entity_data = []

    if isinstance(entity_data, str):
      try:
        entity_data = json.loads(entity_data)
      except Exception as ex:
        raise RuntimeError(f"Cannot convert entity_data to json: {entity_data}")

    while True:
      example = row["example"]
      x = re.search(SYNONYM_REGEX, example)
      if x is None:
        break

      start, end = x.span()
      entity = x.group()

      entity_text = re.search(ENTITY_REGEX, entity).group()[1:-1]
      role_text = re.search(DATA_REGEX, entity).group()

      try:
        role_data = json.loads(role_text)
      except Exception as ex:
        raise ValueError(f"role json is incorrect: {role_text}")

      entity_name_text = role_data.get("entity", None)
      role_value = role_data.get("role", None)

      if entity_name_text is None or role_value is None:
        raise ValueError(f"synonym data should have 'entity' and 'role' attributes")

      row["example"] = example.replace(entity, entity_text)
      entity_name_text = entity_name_text + "|" + role_value

      entity_data.append(dict(
          entity=entity_text,
          entity_name=entity_name_text,
          position=(start, end -(len(entity) - len(entity_text))),
      ))

    row["entities"] = entity_data

  return df


In [5]:
def make_dataframe(files: List[str]) -> Tuple[pd.DataFrame, List[str], List[str], Dict[str, str]]:
  """
  params:
  files: list of file location
  return tuple(dataframe, list of entities class name, list of intent class name, synonym dictionary)
  """

  data = []
  synonym_dict = {}
  for file in files:
    data += read_from_yaml(file=file)

  # print(len(data))
  df = pd.DataFrame(columns=["example", "intent", "entities"])

  for intent in data:
    if not intent.get("intent", None):
      if intent.get("synonym", None):
        target_entity = intent["synonym"]

        entities_list_text = intent["examples"]
        if entities_list_text[:2] == "- ":
          entities_list_text = entities_list_text[2:]
        # if entities_list_text[-1:] == "\n":
        #   entities_list_text = entities_list_text[:-1]
        entities_list_text = entities_list_text.rstrip("\n ")

        synonym_entities_list = entities_list_text.split("\n- ")
        for entity in synonym_entities_list:
          synonym_dict[entity] = target_entity

      continue

    intent_name = intent["intent"]
    examples_as_text = intent["examples"]

    if examples_as_text[:2] == "- ":
      examples_as_text = examples_as_text[2:]
    # if examples_as_text[-1:] == "\n":
    #   examples_as_text = examples_as_text[:-1]
    examples_as_text = examples_as_text.rstrip("\n ")

    examples = examples_as_text.split("\n- ")

    new_data = dict(
        intent=[intent_name for i in range(len(examples))],
        example=examples,
        entities=[None for i in range(len(examples))]
    )

    df = pd.concat([df, pd.DataFrame(data=new_data)], ignore_index=True)

  # df = get_entity(df=df)
  # df = get_entity_with_role(df=df)
    
  df, tmp_entities_list = get_entity_new(df=df)

  entities_list = []
  intents_list = []
  for _, row in df.iterrows():
    if row["intent"] not in intents_list:
      intents_list.append(row["intent"])

  for entity_name, roles_list in tmp_entities_list.items():
    if len(roles_list) == 0:
      entities_list.append(entity_name)
    else:
      for role in roles_list:
        if role != "O":
          entities_list.append(entity_name + "|" + role)
        else:
          entities_list.append(entity_name)
  return df, entities_list, intents_list, synonym_dict


In [18]:
files = ["/workspace/nlplab/quannd/scada-full-stack/chatbot-scada/data/nlu.yml"]
df, entities_list, intents_list, synonym_dict = make_dataframe(files)
print(df["example"])
print(entities_list)
print(intents_list)
print(synonym_dict)

0                                                     alo
1                                                chào bot
2                                                   hello
3                                                    chào
4            Xin chào, làm ơn cho phép tôi tự giới thiệu.
                              ...                        
2182    Tìm [tam giác]{"entity": "object", "role": "sh...
2183    Chọn [tam giác]{"entity": "object", "role": "s...
2184    Tìm [tam giác]{"entity": "object", "role": "sh...
2185    Chọn [tam giác]{"entity": "object", "role": "s...
2186    Tìm [tam giác]{"entity": "object", "role": "sh...
Name: example, Length: 2187, dtype: object
['object|length', 'object|height', 'object|size', 'object|angle', 'object|shape', 'object|orientation', 'object|width', 'object|color', 'object', 'object|thickness', 'comparison', 'aspect', 'position|destination_x', 'position|source_y', 'position|destination', 'position|center_x', 'position|destination_y', 'position|s

In [12]:
type(df.loc[df['intent'] == 'draw_line'])

pandas.core.frame.DataFrame

In [19]:
for intent in intents_list:
    print(f"- {intent}")

- greet
- stop
- affirm
- deny
- goodbye
- bot_challenge
- select_all_object
- exit_select
- delete_selected_objects
- select_area
- rotate_left
- rotate_right
- horizontal_flip
- vertical_flip
- move_left
- move_right
- move_up
- move_down
- color_background
- color_foreground
- change_width
- change_height
- change_length
- change_radius
- change_top
- change_left
- change_right
- change_bottom
- draw_line
- draw_circle
- draw_ellipse
- draw_rectangle
- draw_square
- draw_rhombus
- draw_parallelogram
- draw_trapezoid
- draw_arrow
- copy_selected_objects
- cut_selected_objects
- paste
- undo
- redo
- select_object


In [23]:
entity_type = "selected_area"
for entity in entities_list:
    if entity.startswith(entity_type):
        if len(entity.split("|")) > 1:
            print(entity.split("|")[1])

length
height
width


In [9]:
for intent in intents_list:
    print(intent, ": ", len(df.loc[df["intent"] == intent]))

greet :  3
stop :  6
affirm :  5
deny :  5
goodbye :  6
bot_challenge :  7
select_all_object :  50
exit_select :  50
delete_selected_objects :  44
select_area :  40
rotate_left :  54
rotate_right :  54
horizontal_flip :  54
vertical_flip :  54
move_left :  54
move_right :  54
move_up :  54
move_down :  54
color_background :  53
color_foreground :  55
change_width :  54
change_height :  54
change_length :  54
change_radius :  54
change_top :  54
change_left :  64
change_right :  54
change_bottom :  54
draw_line :  89
draw_circle :  70
draw_ellipse :  68
draw_rectangle :  68
draw_square :  67
draw_rhombus :  67
draw_parallelogram :  58
draw_trapezoid :  68
draw_arrow :  68
copy_selected_objects :  49
cut_selected_objects :  54
paste :  45
undo :  48
redo :  49
select_object :  71


In [18]:
df['intent'].value_counts()

intent
select_object              71
draw_line                  70
draw_parallelogram         58
cut_selected_objects       54
rotate_right               54
draw_circle                51
move_left                  51
select_all_object          50
exit_select                50
move_up                    50
draw_rectangle             49
draw_ellipse               49
change_bottom              49
change_right               49
change_top                 49
draw_trapezoid             49
draw_rhombus               49
move_right                 49
copy_selected_objects      49
redo                       49
draw_arrow                 48
undo                       48
draw_square                48
color_background           46
change_height              45
change_left                45
paste                      45
change_width               44
delete_selected_objects    44
move_down                  44
color_foreground           43
rotate_left                43
vertical_flip              41
cha

In [13]:
" ".join(vncore_segmenter.word_segment(df['example'][102]))

NameError: name 'vncore_segmenter' is not defined

In [14]:
for sent in df.loc[df['intent'] == 'draw_rectangle']['example']:
  print("- " + sent)

- Tạo một hình [chữ nhật]{"entity": "value", "role": "shape"} [nằm ngang]{"entity": "value", "role": "orientation"} có [kích thước]{"entity": "aspect"} ([43]{"entity": "value", "role": "length"}, [20]){"entity": "value", "role": "width"} ở [vị trí]{"entity": "aspect"} ([40]{"entity": "position", "role": "source_x"}, [60]{"entity": "position", "role": "source_y"}).
- Vẽ một hình [chữ nhật]{"entity": "value", "role": "shape"} màu [xám]{"entity": "value", "role": "color"} nằm ở [giữa]{"entity": "position", "role": "source"} màn hình.
- Tạo một hình [chữ nhật]{"entity": "value", "role": "shape"} [lớn]{"entity": "value", "role": "size"} màu [trắng]{"entity": "value", "role": "color"} ở [góc dưới bên phải]{"entity": "position", "role": "source"}.
- Vẽ một hình [chữ nhật]{"entity": "value", "role": "shape"} [nhỏ]{"entity": "value", "role": "size"} màu [đen]{"entity": "value", "role": "color"} ở [góc trên bên trái]{"entity": "position", "role": "source"}.
- Tạo một hình [chữ nhật]{"entity": "v

In [None]:
df['segmented_example'] = df.apply(lambda x: " ".join(vncore_segmenter.word_segment(x['example'])), axis=1)

In [15]:
entity_list = set()
for entity in entities_list:
  entity_list.add(entity.split("|")[0])
entity_list

{'aspect',
 'change_action',
 'comparison',
 'intent',
 'object',
 'position',
 'selected_area',
 'value'}

In [16]:
len(intents_list)

42

In [17]:
len(entities_list)

42

In [18]:
for intent in intents_list:
  print("-", intent)

- greet
- stop
- affirm
- goodbye
- bot_challenge
- rotate_left
- rotate_right
- horizontal_flip
- vertical_flip
- move_left
- move_right
- move_up
- move_down
- select_object
- select_all_object
- exit_select
- select_area
- delete_selected_objects
- color_background
- color_foreground
- change_width
- change_height
- change_length
- change_radius
- change_top
- change_left
- change_right
- change_bottom
- draw_circle
- draw_ellipse
- draw_rectangle
- draw_square
- draw_rhombus
- draw_parallelogram
- draw_trapezoid
- draw_line
- draw_arrow
- copy_selected_objects
- cut_selected_objects
- paste
- redo
- undo


In [19]:
print(df["example"][102])

quay phải một [góc nhọn]{"entity": "value", "role": "angle"}


In [20]:
from collections import defaultdict

In [21]:
for sent in df["example"]:
  print(sent)

alo
chào bot
hello
dừng thực hiện tác vụ tại đây
dừng lại
không làm nữa
hủy bỏ tác vụ
stop
tôi không muốn làm cái này nữa
ừ
đúng rồi
đúng là như vậy
chuẩn rùi
chính xác
tạm biệt
hẹn lần sau gặp lại
see ya
bye
pai
paipai
cu có phải bot real ko ?
cậu là sinh viên trường nào thế ?
Hãy giới thiệu bản thân cho tớ nghe đi.
huh ?
Cậu là bot hay là người thế ?
thế cậu là người hay là bot vậy ?
cậu là bot hay là người thế ?
Xoay hình [chữ nhật]{"entity": "object", "role": "shape"} ở [góc trái trên]{"entity": "position", "role": "source"} về bên trái.
Quay hình [ê-líp]{"entity": "object", "role": "shape"} ở [giữa]{"entity": "position", "role": "source"} sang trái
Làm xoay hình [tam giác]{"entity": "object", "role": "shape"} ở [vị trí]{"entity": "aspect"} ([20]{"entity": "position", "role": "source_x"}, [30]{"entity": "position", "role": "source_y"}) qua phía trái
Quay trái hình [vuông]{"entity": "object", "role": "shape"} tại [tọa độ]{"entity": "aspect"} ([50]{"entity": "position", "role": "sour

In [None]:
total = 0
for entities in df["entities"]:
  # print(len(entities))
  total += len(entities)
total

858

# load NLU dataset

In [1]:
pip install torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
from typing import List, Dict, Text, Any, Union

import pandas as pd
import torch

In [None]:
class DIETClassifierDataset:
  def __init__(self, dataframe: pd.Dataframe, tokenizer, entities: List[str], intents: List[str]):

    """
    params:
    - dataframe: dataframe contains ["example", "intent", "entities"] columns
    - tokenizer: pretrained tokenizer from transformers
    - entities: list of entities class names
    - intents: list of intents class names
    """

    dataframe = dataframe[dataframe["intent"].isin(intents)]

    self.entities = ["O"] + entities
    dataframe["entities"] = dataframe["entities"].apply(self._remove_entities)

    self.tokenizer = tokenizer
    self.num_entities = len(self.entities)
    self.intents = intents
    self.num_intents = len(intents)

    sentences = dict(
        sentences = [],
        entities = [],
        intent = []
    )

    for _, row in dataframe.iterrows():
      sentences["sentence"].append(row["example"])
      sentences["entities"].append(row["entities"])
      sentences["intent"].append(row["intent"])

    sentences.update(tokenizer(sentences["sentence"],
                               return_tensors="pt",
                               return_offsets_mapping=True,
                               padding="max_length",
                               truncation=True,
                               max_length=512))

    sentences["entites_labels"] = []

    for index in range(len(sentences["sentence"])):
      entities_labels = []
      for offset in sentences["offset_mapping"][index][1:]: # iterate in a list of dicts or tuples
        is_label = False
        if not (offset[0] == 0 and offset[1] == 0): # if that offet mapping is not in the beginning of sentence
          for entity in sentences["entities"][index]: # iterate in a list of dict entity. a dict entity contains position (tuple), entity_name (str), entity_value (str)
            if entity["possition"][0] <= offset[0] and entity["position"][1] >= offset[1]: # if entity contains >= 1 tokens
              entities_labels.append(self.entities.index(entity["entity_name"])) # assign these contained tokens with entity_name of dict entity
              is_label = True
        if not is_label:
          entities_labels.append(self.entities.index("O")) # assign other tokens with entity "O"

      sentences["entities_labels"].append(entities_labels)

    sentences["entities_labels"] = torch.tensor(sentences["entities_labels"])
    sentences["intent_labels"] = torch.tensor([self.intent.index(intent) for intent in sentences["intent"]])

    self.data = sentences

    def __len__(self, index) -> Dict[Text, Any]:
      item = dict(
          input_ids=self.data["input_ids"][index],
          token_type_ids=self.data["token_type_ids"][index],
          attention_mask=self.data["attention_mask"][index],
          entities_labels=self.data["entites_labels"][index],
          intent_labels=self.data["intent_labels"][index]
      )
      return item

    def _remove_entities(self, entities_list: Union[str, List[str, Any]]) -> List[Dict[str, Any]]:
       if isinstance(entities_list, str):
        try:
          entities_list = json.loads(entities_list)
        except Exception as ex:
          raise RuntimeError(f"Cannot convert entity {entities_list} by error: {ex}")

        entities_list = [entity for entity in entities_list if entity["entity_name"] in self.entities]
        return entities_list

In [None]:
- rotate_left
- rotate_right
- horizontal_flip
- vertical_flip
- move_left
- move_right
- move_up
- move_down
- select_object
- select_area
- delete_object
- delete_selected_objects
- color_all_background
- color_all_object
- color_selected_area
- change_width.
- change_height.
- change_length.
- change_radius.
- draw_circle
- draw_elip
- draw_rectangle
- draw_square
- draw_rhombus
- draw_parallelogram
- draw_trapezoid
- draw_line
- draw_cylinder
- draw_cube
- copy_object
- paste_object