Skip to content

Commit

Permalink
Clean dataset generation scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Adrien Ball committed May 14, 2018
1 parent 71fca0b commit a05ea9b
Show file tree
Hide file tree
Showing 7 changed files with 296 additions and 340 deletions.
2 changes: 0 additions & 2 deletions snips_nlu_dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
from __future__ import absolute_import

from snips_nlu_dataset.assistant_dataset import (
AssistantDataset, main_generate_dataset)
92 changes: 37 additions & 55 deletions snips_nlu_dataset/assistant_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,86 +3,68 @@

import argparse
import json
import os
from copy import deepcopy

from future.utils import iteritems

from snips_nlu_dataset.custom_entities import CustomEntity
from snips_nlu_dataset.entities import CustomEntity, create_entity
from snips_nlu_dataset.intent_dataset import IntentDataset
from snips_nlu.builtin_entities import is_builtin_entity


class AssistantDataset(object):
"""Dataset of an assistant
Merges a list of :class:AssistantDataset into a single dataset ready to be
used by Snips NLU
Merges a list of :class:`.AssistantDataset` into a single dataset ready to
be used by Snips NLU
Attributes:
:class:AssistantDataset.language: language of the dataset
:class:AssistantDataset.intent_datasets: list of :class:IntentDataset
:class:AssistantDataset.entities: dict of :class:CustomEntity
:class:AssistantDataset.json: The dataset in json format
language (str): language of the assistant
intents_datasets (list of :class:`.IntentDataset`): data of the
assistant intents
entities (list of :class:`.Entity`): data of the assistant entities
"""

def __init__(self, language, intent_datasets, entities):
self.language = language
self.intent_datasets = intent_datasets
self.intents_datasets = intent_datasets
self.entities = entities

@classmethod
def from_files(cls, language, intents_file_names=None,
entities_file_names=None):
"""Creates an :class:AssistantDataset from a language and a list of
text files
The assistant will associate each file to an intent, the name of the
file being the intent name.
"""Creates an :class:`.AssistantDataset` from a language and a list of
intent and entity files
Args:
language (str): language of the assistant
intents_file_names (list of str, optional): names of intent files.
The assistant will associate each file to an intent, the name
of the file being the intent name.
entities_file_names (list of str, optional): names of custom entity
files. The assistant will associate each file to an entity, the
name of the file being the entity name.
"""
if intents_file_names is None:
intents_file_names = []
datasets = [IntentDataset.from_file(language, f) for f in
intents_file_names]
intents_datasets = [IntentDataset.from_file(f)
for f in intents_file_names]

if entities_file_names is None:
entities_file_names = []
entities = {
os.path.splitext(os.path.basename(f))[0]: CustomEntity.from_file(f)
for f in entities_file_names
}
return cls(language, datasets, entities)
entities = [CustomEntity.from_file(f) for f in entities_file_names]
entity_names = set(e.name for e in entities)

# Add entities appearing only in the intents data
for intent_data in intents_datasets:
for entity_name in intent_data.entities_names:
if entity_name not in entity_names:
entity_names.add(entity_name)
entities.append(create_entity(entity_name))
return cls(language, intents_datasets, entities)

@property
def json(self):
intent_datasets_json = {d.intent_name: d.json
for d in self.intent_datasets}
intents = {
intent_name: {
"utterances": dataset_json["utterances"]
}
for intent_name, dataset_json in iteritems(intent_datasets_json)
}
ents = deepcopy(self.entities)
ents_values = dict()
for entity_name, entity in iteritems(self.entities):
ents_values[entity_name] = set(a.value for a in entity.utterances)
if entity.use_synonyms:
ents_values[entity_name].update(
set(t for s in entity.utterances for t in s.synonyms))

for dataset in self.intent_datasets:
for ent_name, ent in iteritems(dataset.entities):
if ent_name not in ents:
ents[ent_name] = ent
elif not is_builtin_entity(ent_name):
for u in ent.utterances:
if u.value not in ents_values:
ents[ent_name].utterances.append(u)
ents = {
entity_name: entity.json
for entity_name, entity in iteritems(ents)
}
return dict(language=self.language, intents=intents, entities=ents)
intents = {intent_data.intent_name: intent_data.json
for intent_data in self.intents_datasets}
entities = {entity.name: entity.json for entity in self.entities}
return dict(language=self.language, intents=intents, entities=entities)


def main_generate_dataset():
Expand All @@ -98,7 +80,7 @@ def main_generate_dataset():
args = parser.parse_args()
dataset = AssistantDataset.from_files(args.language, args.intent_files,
args.entity_files)
print(json.dumps(dataset.json, indent=2))
print(json.dumps(dataset.json, indent=2, sort_keys=True))


if __name__ == '__main__':
Expand Down
16 changes: 0 additions & 16 deletions snips_nlu_dataset/builtin_entities.py

This file was deleted.

80 changes: 0 additions & 80 deletions snips_nlu_dataset/custom_entities.py

This file was deleted.

116 changes: 116 additions & 0 deletions snips_nlu_dataset/entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# coding=utf-8
from __future__ import unicode_literals

import csv
import io
import os
from abc import ABCMeta, abstractmethod

import six
from future.utils import with_metaclass

from snips_nlu.builtin_entities import is_builtin_entity
from snips_nlu.constants import (
VALUE, SYNONYMS, AUTOMATICALLY_EXTENSIBLE, USE_SYNONYMS, DATA)


class Entity(with_metaclass(ABCMeta, object)):
    """Abstract base class for the entities of an assistant dataset

    Attributes:
        name (str): name of the entity
    """

    def __init__(self, name):
        self.name = name

    # Declared as a property to match the concrete implementations in
    # CustomEntity and BuiltinEntity, which both expose `json` as a property
    @property
    @abstractmethod
    def json(self):
        """Returns the entity in json format"""
        pass


class CustomEntity(Entity):
    """Custom entity of an :class:`.AssistantDataset`

    Attributes:
        utterances (list of :class:`.EntityUtterance`): entity utterances
        automatically_extensible (bool): whether or not the entity can be
            extended to values not present in the dataset
        use_synonyms (bool): whether or not to map entity values using
            synonyms
    """

    def __init__(self, name, utterances, automatically_extensible,
                 use_synonyms):
        super(CustomEntity, self).__init__(name)
        self.utterances = utterances
        self.automatically_extensible = automatically_extensible
        self.use_synonyms = use_synonyms

    @classmethod
    def from_file(cls, entity_file_name):
        """Creates a :class:`.CustomEntity` from a CSV file

        Each row of the file is an entity value: the first cell is the value
        itself and the remaining cells, if any, are its synonyms.

        Args:
            entity_file_name (str): path of the entity file. The entity name
                is the file base name stripped of its extension.
        """
        # Entity name = file base name without its extension
        entity_name = ".".join(
            os.path.basename(entity_file_name).split('.')[:-1])
        utterances = []
        with io.open(entity_file_name, "r", encoding="utf-8") as f:
            it = f
            if six.PY2:
                # The csv module only accepts bytes on Python 2
                it = list(utf_8_encoder(it))
            reader = csv.reader(list(it))
            for row in reader:
                if not row:
                    # csv.reader yields an empty list for blank lines;
                    # skip them instead of raising an IndexError on row[0]
                    continue
                if six.PY2:
                    row = [cell.decode("utf-8") for cell in row]
                value = row[0]
                synonyms = row[1:] if len(row) > 1 else []
                utterances.append(EntityUtterance(value, synonyms))
        return cls(entity_name, utterances, automatically_extensible=True,
                   use_synonyms=True)

    @property
    def json(self):
        """Returns the entity in json format"""
        return {
            AUTOMATICALLY_EXTENSIBLE: self.automatically_extensible,
            USE_SYNONYMS: self.use_synonyms,
            DATA: [u.json for u in self.utterances]
        }


class EntityUtterance(object):
    """A single value of a :class:`.CustomEntity`, with its synonyms

    Attributes:
        value (str): entity value
        synonyms (list of str): The values to remap to the utterance value
    """

    def __init__(self, value, synonyms=None):
        self.value = value
        self.synonyms = [] if synonyms is None else synonyms

    @property
    def json(self):
        """Returns the utterance in json format"""
        return {VALUE: self.value, SYNONYMS: self.synonyms}


class BuiltinEntity(Entity):
    """Builtin entity of an :class:`.AssistantDataset`"""

    @property
    def json(self):
        """Builtin entities carry no data: their json form is empty"""
        return {}


def utf_8_encoder(f):
    """Lazily encodes each line of *f* to UTF-8 bytes (Python 2 csv helper)"""
    return (line.encode("utf-8") for line in f)


def create_entity(entity_name, utterances=None, automatically_extensible=True,
                  use_synonyms=True):
    """Builds a :class:`.BuiltinEntity` when *entity_name* is a builtin
    entity name, and a :class:`.CustomEntity` otherwise
    """
    if is_builtin_entity(entity_name):
        return BuiltinEntity(entity_name)
    if utterances is None:
        utterances = []
    return CustomEntity(entity_name, utterances, automatically_extensible,
                        use_synonyms)

0 comments on commit a05ea9b

Please sign in to comment.