In [28]:
import csv
import json
import yaml
import os.path
from os import path
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Load dataset

In [29]:
# --- split configuration ---
TEST_EXAMPLE_PERCENTAGE = 0.33  # passed as test_size to train_test_split
VERSION = 8                     # embedded in output directory and file names
NUM_SPLITS = 10                 # number of train/test splits to generate

# --- input location ---
DATASET_FILE = '../datasets/nlu-home-domain.csv'

# --- NLU services to export for ---
NLU_NAMES = [
    'watson',
    'luis',
    'snips',
    'rasa-sklearn',
    'rasa-diet',
]

# --- intent labels kept from the dataset ---
INTENTS = [
    'factoid',
    'music',
    'negate',
    'query',
    'quirky',
    'remove',
    'set',
    'sendemail',
    'repeat',
    'explain',
]

In [30]:
# Read only the needed columns (header names matched case-insensitively), keep the
# rows whose intent is listed in INTENTS, and expose the utterance column as "text".
data = (
    pd.read_csv(
        DATASET_FILE,
        delimiter=';',
        usecols=lambda x: x.lower() in ['intent', 'scenario', 'answer'],
    )
    .loc[lambda frame: frame['intent'].isin(INTENTS), ['answer', 'intent']]
    .rename(columns={"answer": "text"})
)

data

Unnamed: 0,text,intent
0,wake me up at 5am this week,set
1,wake me up at 9am on Friday,set
2,set an alarm for two hours from now,set
24,check when the show starts,query
25,I WANT TO LISTEN ARIJIT SINGH SONG ONCE AGAIN.,music
...,...,...
24926,"s2, i couldn't catch that.",explain
24927,what do you mean by it.,explain
24928,can you clarify more on that.,explain
24929,"s1, rephrase me more on it.",explain


In [31]:
# Distinct intent labels present in the filtered data; the export loop at the
# end of the notebook iterates over these to build one entry per intent.
selected_intents = data['intent'].unique()
selected_intents

array(['set', 'query', 'music', 'quirky', 'remove', 'sendemail',
       'factoid', 'negate', 'repeat', 'explain'], dtype=object)

### Classes

In [32]:
class literal_unicode(str):
    """Marker ``str`` subclass for text that is dumped to YAML in literal block style."""


def literal_unicode_representer(dumper, data):
    """Represent ``data`` as a YAML string scalar using the literal (``|``) style."""
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')


# Register the representer so yaml.dump applies it to literal_unicode values.
yaml.add_representer(literal_unicode, literal_unicode_representer)

### Functions

In [33]:
def write_to_csv(data, output_csv):
    """Write an iterable of rows to ``output_csv`` as CSV, overwriting the file.

    Args:
        data: Iterable of rows, each row an iterable of cell values.
        output_csv: Path of the file to create or overwrite.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' on write end up with '\r\r\n' (blank rows).
    with open(output_csv, 'w', newline='') as f:
        csv.writer(f).writerows(data)

In [34]:
def write_to_json(data, output_file):
    """Serialize ``data`` as JSON with a 2-space indent into ``output_file`` (overwritten)."""
    with open(output_file, 'w') as handle:
        json.dump(data, handle, indent=2)

In [35]:
def write_to_yaml(data, output_yaml):
    """Dump ``data`` as YAML into ``output_yaml`` (overwritten), without sorting mapping keys."""
    with open(output_yaml, 'w') as stream:
        yaml.dump(data, stream, sort_keys=False)

In [36]:
def get_file_suffix(nlu):
    """Return the training-file extension used for ``nlu``.

    'snips' and 'luis' map to '.json', 'watson' to '.csv', any name containing
    'rasa' to '.yml', and everything else to an empty string.
    """
    if nlu in ('snips', 'luis'):
        return '.json'
    if nlu == 'watson':
        return '.csv'
    if 'rasa' in nlu:
        return '.yml'
    return ''

In [37]:
def get_params(nlu_name, split_num, version=VERSION):
    """Build the train and test output paths for one NLU and one split.

    Both files live in '../datasets/<nlu_name>/v<version>/'. The train file
    takes the NLU-specific suffix from get_file_suffix; the test file is
    always a '.csv'.

    Returns:
        Tuple ``(output_train_file, output_test_file)``.
    """
    version_tag = 'v' + str(version)
    folder = '../datasets/' + nlu_name + '/' + version_tag + '/'
    stem = nlu_name + '_split_' + str(split_num)

    output_train_file = folder + stem + '_train_' + version_tag + get_file_suffix(nlu_name)
    output_test_file = folder + stem + '_test_' + version_tag + '.csv'
    return output_train_file, output_test_file

In [38]:
def preprocess(utterances, nlu='snips', random_state=None):
    """Shuffle the examples, drop incomplete rows and, for Rasa, strip non-ASCII text.

    No case folding or punctuation removal is applied.

    Args:
        utterances: DataFrame with at least a 'text' column.
        nlu: Target NLU name. When it contains 'rasa', every character outside
            the ASCII range is removed from 'text'.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Shuffle first, then renumber so the shuffled order becomes the index.
    shuffled = utterances.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # dropna() removes rows holding None/NaN in any column. (The former
    # `!= None` filter was an elementwise comparison that never removed a row.)
    cleaned = shuffled.dropna()

    if 'rasa' in nlu:
        # Address the column by name rather than by position, and build a new
        # frame with assign() instead of writing into a derived frame.
        ascii_text = cleaned['text'].map(
            lambda text: text.encode('ascii', 'ignore').decode('ascii')
        )
        cleaned = cleaned.assign(text=ascii_text)
    return cleaned

In [39]:
def get_utterance_json_body(utterance):
    """Wrap one utterance as ``{"data": [{"text": utterance}]}``."""
    text_chunk = {"text": utterance}
    return {"data": [text_chunk]}

In [40]:
def get_intent_json_body(intent_examples):
    """Build the intent entry ``{"utterances": [...]}`` from an iterable of example texts.

    Each example is wrapped with get_utterance_json_body, preserving input order.
    """
    return {
        "utterances": [get_utterance_json_body(example) for example in intent_examples]
    }

In [41]:
def get_intent_yaml_body(intent_examples, intent=None):
    """Build the YAML training entry for a single intent.

    Args:
        intent_examples: DataFrame with a 'text' column holding the example
            utterances, plus an 'intent' column when ``intent`` is not given.
        intent: Name of the intent. When omitted it is read from the 'intent'
            column of ``intent_examples``, which must then contain exactly one
            distinct label.

    Returns:
        dict with keys 'intent' and 'examples'; 'examples' is a
        literal_unicode string with one "- <utterance>" line per example.

    Raises:
        KeyError: if ``intent`` is omitted and there is no 'intent' column.
        ValueError: if ``intent`` is omitted and the column does not hold
            exactly one distinct label.
    """
    if intent is None:
        # The name used to be picked up from a module-level `intent` variable
        # that happened to exist while the export loop was running. Derive it
        # from the data instead so the function works on its own.
        labels = intent_examples['intent'].unique()
        if len(labels) != 1:
            raise ValueError(
                "cannot infer the intent name: expected exactly one distinct "
                "label in the 'intent' column, found " + str(len(labels))
            )
        intent = labels[0]

    # One "- <example>" line per utterance, joined in a single pass.
    examples = "".join(
        "- " + str(example) + "\n"
        for example in intent_examples['text'].values.tolist()
    )
    return {'intent': intent, 'examples': literal_unicode(examples)}

In [42]:
def initialize_filtered_data(nlu='snips'):
    """Return the empty container that training data for ``nlu`` is accumulated into.

    'snips' gets an empty dict, 'watson' and 'luis' an empty DataFrame with
    'text' and 'intent' columns, names containing 'rasa' a ``{"nlu": []}``
    dict, and any other name ``None``.
    """
    if nlu == 'snips':
        return {}
    if nlu in ('watson', 'luis'):
        return pd.DataFrame([], columns=['text', 'intent'])
    if 'rasa' in nlu:
        return {"nlu": []}
    return None

In [43]:
def update_filtered_data(filtered_data, intent, intent_examples, nlu='snips'):
    """Add the examples of one intent to the container for ``nlu`` and return it.

    - 'snips': stores the JSON intent body under ``filtered_data[intent]``.
    - 'watson' / 'luis': returns a new DataFrame with the examples appended.
    - names containing 'rasa': appends the YAML intent entry to ``filtered_data["nlu"]``.
    - anything else: returns ``filtered_data`` unchanged.
    """
    if nlu == 'snips':
        texts = intent_examples['text'].values.tolist()
        filtered_data[intent] = get_intent_json_body(texts)
        return filtered_data

    if nlu in ('watson', 'luis'):
        return pd.concat([filtered_data, intent_examples], ignore_index=True, sort=False)

    if 'rasa' in nlu:
        filtered_data["nlu"].append(get_intent_yaml_body(intent_examples))

    return filtered_data

In [44]:
def build_luis_json(filtered_data, selected_intents):
    """Assemble the LUIS import document for the given training utterances.

    Args:
        filtered_data: DataFrame of training examples; each row becomes one
            entry of "utterances" with an added empty "entities" list.
        selected_intents: Iterable of intent names to declare under "intents".

    Returns:
        dict ready to be serialized as the LUIS JSON file.
    """
    # Work on a copy via assign(): the caller's frame must not gain an
    # 'entities' column as a side effect. Each row gets its own list object
    # rather than n references to one shared list.
    with_entities = filtered_data.assign(
        entities=[[] for _ in range(filtered_data.shape[0])]
    )
    # Round-trip through JSON so the records hold plain JSON-compatible values.
    parsed = json.loads(with_entities.to_json(orient="records"))

    intents = [{"name" : intent, "features" : []} for intent in selected_intents]
    json_data = {"luis_schema_version": "6.0.0",
      "intents": intents,
      "entities": [],
      "hierarchicals": [],
      "composites": [],
      "closedLists": [],
      "prebuiltEntities": [],
      "utterances": parsed,
      "versionId": "0.1",
      "name": "HomeAssistant",
      "desc": "",
      "culture": "en-us",
      "tokenizerVersion": "1.0.0",
      "patternAnyEntities": [],
      "regex_entities": [],
      "phraselists": [],
      "regex_features": [],
      "patterns": [],
      "settings": [
        {
          "name": "UseAllTrainingData",
          "value": "false"
        }]}
    return json_data

In [45]:
def build_snips_json(filtered_data):
    """Wrap the per-intent data in the top-level Snips dataset document.

    ``filtered_data`` is placed under "intents" as-is, next to an empty
    "entities" mapping and the language code "en".
    """
    return {"entities": {}, "intents": filtered_data, "language": "en"}

In [46]:
def write_training_data(filtered_data, output_file, nlu, selected_intents):
    """Write the accumulated training data to ``output_file`` in the format of ``nlu``.

    - 'snips' and 'luis': JSON document built by the matching build_*_json helper.
    - 'watson': headerless CSV without the index column.
    - names containing 'rasa': YAML dump of ``filtered_data``.
    - anything else: nothing is written.

    ``selected_intents`` is only used for the 'luis' document.
    """
    if nlu in ('snips', 'luis'):
        if nlu == 'snips':
            document = build_snips_json(filtered_data)
        else:
            document = build_luis_json(filtered_data, selected_intents)
        write_to_json(document, output_file)
        return

    if nlu == 'watson':
        filtered_data.to_csv(output_file, index=False, header=False)
        return

    if 'rasa' in nlu:
        write_to_yaml(filtered_data, output_file)

### Re-structure dataset

In [47]:
# Generate NUM_SPLITS stratified train/test splits and export each split once
# per NLU, in that NLU's own training-file format.
for split_num in range(NUM_SPLITS):
    # One distinct, deterministic seed per split. Drawing random.randint(0, 42)
    # made runs unrepeatable and could hand the same seed (hence an identical
    # split) to several of the splits.
    seed = split_num

    # The split does not depend on the NLU, so compute it once per split.
    train_data, test_data = train_test_split(
        data,
        stratify=data['intent'],
        test_size=TEST_EXAMPLE_PERCENTAGE,
        random_state=seed,
    )

    for nlu in NLU_NAMES:
        # preprocess() shuffles through NumPy's global random state; re-seeding
        # per NLU gives every NLU the same, reproducible example order.
        np.random.seed(seed)
        filtered_data = initialize_filtered_data(nlu=nlu)

        for intent in selected_intents:
            # get all training examples for this intent
            intent_examples = train_data.loc[train_data['intent'] == intent]
            intent_examples = preprocess(intent_examples, nlu=nlu)

            filtered_data = update_filtered_data(filtered_data, intent, intent_examples, nlu=nlu)

        output_train_file, output_test_file = get_params(nlu, (split_num + 1))
        # Create '../datasets/<nlu>/v<VERSION>/' (and any missing parents) in one call.
        os.makedirs(path.dirname(output_train_file), exist_ok=True)
        write_training_data(filtered_data, output_train_file, nlu, selected_intents)
        test_data.to_csv(output_test_file, index=False)