In [1]:
import pandas as pd
import ipysheet
from utils.nlu_engine import NLUEngine, LR

# Data set cleaning

## Load and overview of data set

In [2]:
def load_data(file_name):
    """Read a semicolon-separated CSV and keep only the annotated rows.

    Args:
        file_name: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A DataFrame without the rows whose ``answer_annotation`` is missing.
        The original index labels of the remaining rows are preserved.
    """
    raw_df = pd.read_csv(file_name, sep=';')
    annotated_df = raw_df.dropna(subset=['answer_annotation'])
    return annotated_df

In [3]:
# Read the annotated corpus; load_data drops rows without an answer_annotation.
nlu_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')

In some rows the `answer_normalised` text does not match the `answer_annotation` text. Therefore, we will regenerate `answer_normalised` from `answer_annotation`.

In [4]:
# Rebind nlu_data_df to whatever the engine returns. Per the markdown above this
# is meant to rebuild answer_normalised from answer_annotation
# (NOTE(review): confirm against NLUEngine, its implementation is not shown here).
nlu_data_df = NLUEngine.convert_annotated_utterances_to_normalised_utterances(
    nlu_data_df)

In [6]:
# Distinct label values per column. len(unique()) counts NaN as a value,
# whereas nunique() ignores NaN.
list_of_domains = nlu_data_df['scenario'].unique()
number_of_domains = len(list_of_domains)

list_of_intents = nlu_data_df['intent'].unique()
number_of_intents = nlu_data_df['intent'].nunique()

# This is the number of distinct normalised utterances, not the number of rows.
number_of_utterances = nlu_data_df['answer_normalised'].nunique()

print(f'From a total of {number_of_utterances} utterances, there are {number_of_domains} domains, and {number_of_intents} intents\n')

print(f'List of domains: {list_of_domains}\n')

print(f'List of intents: {list_of_intents}\n')

From a total of 25673 utterances, there are 18 domains, and 54 intents

List of domains: ['alarm' 'audio' 'iot' 'calendar' 'play' 'general' 'datetime' 'takeaway'
 'news' 'music' 'weather' 'qa' 'social' 'recommendation' 'cooking' 'email'
 'transport' 'lists']

List of intents: ['set' 'volume_mute' 'hue_lightchange' 'hue_lightoff' 'hue_lighton'
 'hue_lightdim' 'cleaning' 'query' 'music' 'quirky' 'greet' 'convert'
 'remove' 'likeness' 'hue_lightup' 'order' 'settings' 'volume_down' 'joke'
 'dislikeness' 'volume_other' 'coffee' 'volume_up' 'wemo_on' 'wemo_off'
 'stock' 'radio' 'post' 'locations' 'recipe' 'sendemail' 'factoid'
 'events' 'audiobook' 'podcasts' 'ticket' 'movies' 'game' 'traffic'
 'definition' 'querycontact' 'createoradd' 'addcontact' 'taxi' 'maths'
 'currency' 'negate' 'dontcare' 'repeat' 'affirm' 'commandstop' 'confirm'
 'explain' 'praise']



## Intent classification

### Example of a single utterance

Both the intents and the domains (scenarios/skills) can be used to label an utterance. In this example we will use the domain as the label to predict for each utterance.

In [7]:
# NOTE(review): `domains` is not referenced again anywhere in this notebook.
domains = nlu_data_df.scenario.values

# Train a classifier and keep the two returned objects for prediction below.
# NOTE(review): a DataFrame is passed to the parameter named `data_df_path`, and
# the label is given as 'domain' here while the evaluation cell further down
# passes the column name 'scenario' -- confirm which form the engine expects.
LR_domain_classifier_model, tfidf_vectorizer = NLUEngine.train_intent_classifier(
    data_df_path=nlu_data_df,
    labels_to_predict='domain',
    classifier=LR
)


Training LogisticRegression(random_state=0, solver='liblinear')


Example: Let's try to predict an utterance's label using the domain classifier.

In [9]:
utterance = "turn off the kitchen lights"

# Predict with the model/vectorizer pair returned by the training cell.
predicted_label = NLUEngine.predict_label(
    LR_domain_classifier_model,
    tfidf_vectorizer,
    utterance,
)
print(predicted_label)


Predicting label for utterance: wake me up at 10:30 am
alarm


### Create intent classifier report

In [10]:
# Name of the DataFrame column that holds the domain of each utterance.
domain_labels = 'scenario'

# Evaluate the same classifier type on that column. Going by the saved log output
# the engine cross-validates and then builds a per-label report
# (NOTE(review): number of folds and metrics are defined inside NLUEngine).
domain_report_df = NLUEngine.evaluate_intent_classifier(
    data_df_path=nlu_data_df,
    labels_to_predict=domain_labels,
    classifier=LR
)

Evaluating LogisticRegression(random_state=0, solver='liblinear')
Cross validating with LogisticRegression(random_state=0, solver='liblinear')
Time it took to cross validate LogisticRegression(random_state=0, solver='liblinear'): 6.18346095085144
Generating report for LogisticRegression(random_state=0, solver='liblinear')


  df['classifier'] = df['classifier'].str.replace(r"\([^()]*\)", "")


In [11]:
# Worst-performing domains first (ascending f1-score).
domain_report_df.sort_values(by='f1-score')

Unnamed: 0,domain,precision,recall,f1-score,support,classifier,encoding
12,qa,0.641065,0.883966,0.743171,2370.0,LogisticRegression,tfidf
9,music,0.897375,0.642735,0.749004,585.0,LogisticRegression,tfidf
13,recommendation,0.822414,0.688312,0.749411,693.0,LogisticRegression,tfidf
3,cooking,0.92953,0.657957,0.770515,421.0,LogisticRegression,tfidf
10,news,0.86,0.735462,0.79287,877.0,LogisticRegression,tfidf
6,general,0.807183,0.828745,0.817822,6102.0,LogisticRegression,tfidf
4,datetime,0.846834,0.795297,0.820257,723.0,LogisticRegression,tfidf
1,audio,0.93808,0.735437,0.82449,412.0,LogisticRegression,tfidf
18,accuracy,0.844021,0.844021,0.844021,0.844021,LogisticRegression,tfidf
19,macro avg,0.890284,0.812042,0.844926,25715.0,LogisticRegression,tfidf


## Entity extraction

In [12]:
from utils.nlu_engine import EntityExtractor

It is important to have the NLTK tokenizer to be able to extract entities.

In [None]:
import nltk

# Download the Punkt tokenizer data only when a local copy cannot be found.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

### Example: Extracting entities from an utterance

In [13]:
# Train the entity model on the full data set; the evaluation output further down
# identifies the estimator as a CRF.
crf_model = NLUEngine.train_entity_classifier(data_df=nlu_data_df)

Training entity classifier


Example: Let's try an example utterance for entity extraction.

In [14]:
# Rebinds `utterance` (previously the intent-classification example) to the
# sentence used for the entity-extraction examples below.
utterance = 'wake me up at five pm this week'

We can get the entity tags of a specific utterance with the EntityExtractor.

In [15]:
# In the saved output this returns a list of (entity_type, token) pairs, one
# pair per tagged token.
EntityExtractor.get_entity_tags(utterance, crf_model)

[('time', 'five'), ('time', 'pm'), ('date', 'this'), ('date', 'week')]

We can also get the entity tagged utterance with the NLUEngine.

In [16]:
# Produce the utterance with entities written inline as "[type : words]"
# (format as shown in the saved output of this cell).
entity_tagged_utterance = NLUEngine.create_entity_tagged_utterance(
    utterance, crf_model)

entity_tagged_utterance


'wake me up at [time : five pm] [date : this week]'

### Entity extraction report

Due to this error featured in [this git issue](https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/60) we have to use an older version of scikit learn (sklearn<0.24), otherwise the latest version would work. Hopefully this gets fixed one day..

In [17]:
# Slow cell: the saved output reports roughly 622 s spent cross-validating.
entity_report_df = NLUEngine.evaluate_entity_classifier(data_df=nlu_data_df)

Evaluating entity classifier




Cross validating with CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)




Time it took to cross validate CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100): 621.7124559879303


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Worst-performing entity types first (ascending f1-score).
entity_report_df.sort_values(by='f1-score')

Unnamed: 0,entity-type,precision,recall,f1-score,support
25,ingredient,0.0,0.0,0.0,52.0
32,music_album,0.0,0.0,0.0,8.0
53,transport_descriptor,0.0,0.0,0.0,35.0
22,game_type,0.0,0.0,0.0,3.0
21,game_name,0.0,0.0,0.0,331.0
54,transport_name,0.0,0.0,0.0,58.0
16,drink_type,0.0,0.0,0.0,23.0
30,movie_name,0.0,0.0,0.0,54.0
14,definition_word,0.0,0.0,0.0,558.0
43,podcast_name,0.0,0.0,0.0,207.0


## Cleaning the dataset
Now that we know what works and what doesn't, we can clean the dataset.

We don't want all of the columns, so we will drop some to review the data set

In [19]:
# Work on a narrower copy for manual review; nlu_data_df itself is left untouched.
nlu_scenario_df = nlu_data_df.drop(
    columns=['userid', 'notes', 'answer', 'answer_normalised', 'answerid'],
)


Pick a domain (scenario) to review

For this example we are going to pick 'alarm'. The intent classification isn't bad, but the entity extraction for alarm_type is terrible. Perhaps it overlaps with another entity type, like 'event_name'. We will try to fix this.

In [20]:
# Keep only the rows of the scenario under review.
is_alarm_row = nlu_scenario_df['scenario'].eq('alarm')
nlu_scenario_df = nlu_scenario_df.loc[is_alarm_row]


In [21]:
# Display the alarm-only frame (the rendered table elides the middle rows).
nlu_scenario_df

Unnamed: 0,scenario,intent,status,answer_annotation,suggested_entities,question
0,alarm,set,,wake me up at [time : five am] [date : this week],"date, time",Write what you would tell your PDA in the foll...
1,alarm,set,,wake me up at [time : nine am] on [date : friday],"date, time",Write what you would tell your PDA in the foll...
2,alarm,set,,set an alarm for [time : two hours from now],"date, time",Write what you would tell your PDA in the foll...
42,alarm,remove,,cancel my [time : seven am] alarm,"date, time",Write what you would tell your PDA in the foll...
43,alarm,remove,,remove the alarm set for [time : ten pm],"date, time",Write what you would tell your PDA in the foll...
...,...,...,...,...,...,...
10721,alarm,set,,alarm [time : five pm] [date : tuesday],"date, event_name, time",Write what you would tell your PDA in the foll...
10722,alarm,set,,set my calendar for an alarm at [time : five p...,"date, event_name, time",Write what you would tell your PDA in the foll...
10727,alarm,set,,set an alarm for [time : four pm] [date : tues...,"event_name, event_requency, time",Write what you would tell your PDA in the foll...
10818,alarm,set,,set alarm for this event reminder [event_name ...,"event_name, event_requency, time",Write what you would tell your PDA in the foll...


As we have seen from the entity extraction report, the entity extraction is not working for the alarm_type.

In [22]:
# Narrow the review set to annotations that mention the alarm_type entity.
mentions_alarm_type = nlu_scenario_df['answer_annotation'].str.contains(
    'alarm_type')
nlu_scenario_df = nlu_scenario_df[mentions_alarm_type]

## Convert to ipysheet and review

We shall make two buttons. 
* **review**: Either changes have been made or the entry should be further reviewed
* **remove**: We will drop the entry from the data set.

Look at each utterance, check the following:
* Is the utterance grammatically correct (and spelled correctly)?
* Is the utterance in the correct language?
* Is the utterance in the correct domain?
* Is the utterance in the correct format?
* Does the utterance actually make sense? (i.e. does it make sense to say it?)

If you are unsure, you are marking your changes as **review** anyway, so that's cool.

In [23]:
# Add the two reviewer flag columns as boolean columns that start out False.
# (Assigning False directly gives the same result as assigning None and then
# casting the column to bool.)
nlu_scenario_df = nlu_scenario_df.assign(review=False, remove=False)

# Build the editable sheet once; the reviewed values are read back from
# `nlu_scenario_sheet` later in the notebook.
nlu_scenario_sheet = ipysheet.from_dataframe(nlu_scenario_df)
nlu_scenario_sheet

Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)


Sheet(cells=(Cell(column_end=0, column_start=0, numeric_format=None, row_end=14, row_start=0, squeeze_row=Fals…

For the example with 'alarm' and the alarm_type: 
* We see that the alarm_type entities are really event_name (e.g. wake up, soccer practice), except for ID 5879. We will need to change them to event_name and remove ID 5879.
* The last one (ID 6320) is a mistake. Someone got confused with the prompt and assumed alarm is a security system. This is out of scope for the alarm domain, as the alarms are ones set on a phone or other device. We will drop this utterance.

Once you are done reviewing, you convert it back to a dataframe and check to make sure it looks okay.

In [24]:
# Read the (possibly edited) sheet back into a DataFrame.
reviewed_scenario_df = ipysheet.to_dataframe(nlu_scenario_sheet)
# Make the index numeric again so its labels can be matched against
# nlu_data_df's index in the merge step
# (presumably the sheet round-trip returns the labels as strings -- TODO confirm).
reviewed_scenario_df.index = pd.to_numeric(reviewed_scenario_df.index)
reviewed_scenario_df.tail(50)

Unnamed: 0,scenario,intent,status,answer_annotation,suggested_entities,question,review,remove
212,alarm,query,,did i set an alarm to [alarm_type : wake up] i...,,How would you ask your PDA to tell you about t...,False,False
621,alarm,set,,please ring the [alarm_type : wake up] alarm a...,"date, time",How would you ask your PDA to set an alarm?,False,False
877,alarm,remove,,remove [alarm_type : wake up] calls for [date ...,"date, time",How would you ask your PDA to remove an alarm ...,False,False
878,alarm,remove,,cancel [alarm_type : wake up] calls for [date ...,"date, time",How would you ask your PDA to remove an alarm ...,False,False
879,alarm,remove,,stop [alarm_type : wake up] calls for [date : ...,"date, time",How would you ask your PDA to remove an alarm ...,False,False
2297,alarm,query,,is my [alarm_type : wake up] alarm set a [time...,,Write what you would tell your PDA in the foll...,False,False
2298,alarm,query,,what time is set for my [alarm_type : wake up]...,,Write what you would tell your PDA in the foll...,False,False
3805,alarm,set,,send me a [alarm_type : wake up] alert at [tim...,"date, time",Write what you would tell your PDA in the foll...,False,False
3806,alarm,set,,i would like a [alarm_type : wake up] alarm at...,"date, time",Write what you would tell your PDA in the foll...,False,False
4291,alarm,query,,what is the [alarm_type : wake up] time for my...,,Write what you would tell your PDA in the foll...,False,False


Let's change all alarm_type entities to event_name.

In [25]:
# Relabel every alarm_type tag as event_name. regex=False makes this a plain
# substring replacement regardless of the pandas version's default for `regex`.
reviewed_scenario_df['answer_annotation'] = (
    reviewed_scenario_df['answer_annotation']
    .str.replace('alarm_type', 'event_name', regex=False)
)

In [26]:
# Display the reviewed frame to check the relabelled annotations by eye.
reviewed_scenario_df

Unnamed: 0,scenario,intent,status,answer_annotation,suggested_entities,question,review,remove
212,alarm,query,,did i set an alarm to [event_name : wake up] i...,,How would you ask your PDA to tell you about t...,False,False
621,alarm,set,,please ring the [event_name : wake up] alarm a...,"date, time",How would you ask your PDA to set an alarm?,False,False
877,alarm,remove,,remove [event_name : wake up] calls for [date ...,"date, time",How would you ask your PDA to remove an alarm ...,False,False
878,alarm,remove,,cancel [event_name : wake up] calls for [date ...,"date, time",How would you ask your PDA to remove an alarm ...,False,False
879,alarm,remove,,stop [event_name : wake up] calls for [date : ...,"date, time",How would you ask your PDA to remove an alarm ...,False,False
2297,alarm,query,,is my [event_name : wake up] alarm set a [time...,,Write what you would tell your PDA in the foll...,False,False
2298,alarm,query,,what time is set for my [event_name : wake up]...,,Write what you would tell your PDA in the foll...,False,False
3805,alarm,set,,send me a [event_name : wake up] alert at [tim...,"date, time",Write what you would tell your PDA in the foll...,False,False
3806,alarm,set,,i would like a [event_name : wake up] alarm at...,"date, time",Write what you would tell your PDA in the foll...,False,False
4291,alarm,query,,what is the [event_name : wake up] time for my...,,Write what you would tell your PDA in the foll...,False,False


Okey dokey, now we can merge this with the original data set and see if it has already made a difference (well, of course it did!).

In [27]:
# Rows the reviewer flagged for removal. Comparing first and negating afterwards
# keeps the mask boolean whatever dtype the sheet round-trip gave the column
# (the previous `~col == True` negated the column before comparing it).
marked_for_removal = reviewed_scenario_df['remove'] == True

# errors='ignore' lets the cell be re-run after a fresh sheet export without a
# KeyError for rows that were already dropped from nlu_data_df.
nlu_data_df.drop(
    index=reviewed_scenario_df.index[marked_for_removal],
    inplace=True,
    errors='ignore',
)

reviewed_scenario_df = reviewed_scenario_df[~marked_for_removal]

# Copy the corrected annotations back onto the rows both frames share.
shared_index = nlu_data_df.index.intersection(reviewed_scenario_df.index)
nlu_data_df.loc[shared_index, 'answer_annotation'] = reviewed_scenario_df.loc[
    shared_index, 'answer_annotation']


In [28]:
# Sanity check: alarm rows whose annotation now carries an event_name tag.
in_alarm_scenario = nlu_data_df['scenario'].str.contains('alarm')
tagged_event_name = nlu_data_df['answer_annotation'].str.contains('event_name')
nlu_data_df[in_alarm_scenario & tagged_event_name]


Unnamed: 0,userid,answerid,scenario,intent,status,answer_annotation,notes,suggested_entities,answer_normalised,answer,question
210,5.0,1722.0,alarm,set,,alert me at [time : three pm] to goto the [eve...,,"date, time",alert me at three pm to goto the concert,alert me at 3pm to goto the concert,Write what you would tell your PDA in the foll...
211,5.0,1726.0,alarm,query,,do i have an alarm set for [timeofday : mornin...,,,do i have an alarm set for morning flight,Do I have an alarm set for morning flight?,How would you ask your PDA to tell you about t...
212,5.0,1727.0,alarm,query,,did i set an alarm to [event_name : wake up] i...,,,did i set an alarm to wake up in the morning,Did I set an alarm to wake up in the morning,How would you ask your PDA to tell you about t...
621,68.0,2820.0,alarm,set,,please ring the [event_name : wake up] alarm a...,,"date, time",please ring the wake up alarm at eight am next...,please ring the wake up alarm at 8 am next sat...,How would you ask your PDA to set an alarm?
877,90.0,3532.0,alarm,remove,,remove [event_name : wake up] calls for [date ...,,"date, time",remove wake up calls for this week,remove wake-up calls for this week,How would you ask your PDA to remove an alarm ...
878,90.0,3533.0,alarm,remove,,cancel [event_name : wake up] calls for [date ...,,"date, time",cancel wake up calls for this week,cancel wake-up calls for this week,How would you ask your PDA to remove an alarm ...
879,90.0,3534.0,alarm,remove,,stop [event_name : wake up] calls for [date : ...,,"date, time",stop wake up calls for this week,stop wake-up calls for this week,How would you ask your PDA to remove an alarm ...
2297,311.0,8473.0,alarm,query,,is my [event_name : wake up] alarm set a [time...,,,is my wake up alarm set a six am,Is my wake-up alarm set a 6 am?,Write what you would tell your PDA in the foll...
2298,311.0,8474.0,alarm,query,,what time is set for my [event_name : wake up]...,,,what time is set for my wake up alarm,What time is set for my wake-up alarm?,Write what you would tell your PDA in the foll...
2964,399.0,10635.0,alarm,query,,did i remember to set a reminder alarm for my ...,,,did i remember to set a reminder alarm for my ...,Did I remember to set a reminder alarm for my ...,How would you ask your PDA to tell you about t...


### Benchmark changed data set
TODO: repeat reports for the changed data set for domain and entities.

In [29]:
# Re-run the entity evaluation on the edited data set.
# Slow cell: the saved output reports roughly 608 s spent cross-validating.
entity_reviewed_report_df = NLUEngine.evaluate_entity_classifier(data_df=nlu_data_df)
# Worst-performing entity types first (ascending f1-score).
entity_reviewed_report_df.sort_values(by=['f1-score'])

Evaluating entity classifier




Cross validating with CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)




Time it took to cross validate CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100): 608.2919518947601


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,entity-type,precision,recall,f1-score,support
29,movie_name,0.0,0.0,0.0,54.0
47,sport_type,0.0,0.0,0.0,17.0
44,radio_name,0.0,0.0,0.0,966.0
43,query_detail,0.0,0.0,0.0,8.0
42,podcast_name,0.0,0.0,0.0,207.0
41,podcast_descriptor,0.0,0.0,0.0,277.0
31,music_album,0.0,0.0,0.0,8.0
30,movie_type,0.0,0.0,0.0,24.0
52,transport_descriptor,0.0,0.0,0.0,35.0
21,game_type,0.0,0.0,0.0,3.0


If you are sure it is okay, you can save it as a csv file, make sure to name it correctly (i.e. `alarm_domain_first_review.csv`)

In [None]:
# The index is written as the first CSV column, so reading the file back needs
# index_col=0 to restore the original row labels.
reviewed_scenario_df.to_csv('alarm_domain_first_review.csv')


Load it back up and check to make sure it looks okay. Make sure to give it the right name!

In [None]:
# Reload the file written by the save cell (same file name); index_col=0
# restores the row labels that to_csv stored in the first column.
alarm_domain_first_review_df = pd.read_csv(
    'alarm_domain_first_review.csv', index_col=0)
alarm_domain_first_review_df.tail(50)

In [None]:
# TODO: implement the evaluate_classifier in the NLU engine to check f1 score for intents and entities in the domain vs original NLU data of domain!
# Value: benchmark!

In [None]:
#TODO: implement a flow for getting the domains with the lowest f1 scores by intent/domain and entities and cleaning them by the order of the lowest f1 scores

In [None]:
# TODO: concat all reviewed dfs and save to csv

In [None]:
# TODO: add benchmark for whole NLU data set before and after cleaning! (by intents and domains!)
# TODO: review the review marked entries
# TODO: add new column for notes
# TODO: change flow of review for only ones that should be reviewed, not all of the ones that have been changed (track changes by comparing against the original data set)
# TODO: do the changed utterances have to be changed in other fields too, or is it enough to change just the tagged utterance field?

In [None]:
# TODO: add visualizations of domains, their intents, keywords in utterances, and entities