In [1]:
import pandas as pd
import ipysheet
from utils.nlu_engine import NLUEngine, LR

# Data set cleaning

## Load the data set and get an overview

In [2]:
def load_data(file_name):
    """Read a semicolon-separated CSV file and keep only the annotated rows.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file; its fields are separated by ``;``.

    Returns
    -------
    pandas.DataFrame
        The file contents without the rows whose ``answer_annotation``
        value is missing.
    """
    raw_df = pd.read_csv(file_name, sep=';')
    # Discard every row that has no value in 'answer_annotation'.
    annotated_df = raw_df.dropna(subset=['answer_annotation'])
    return annotated_df

In [3]:
# Load the annotated data set; the path is relative to the working directory.
nlu_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')

The provided `answer_normalised` column does not always match the `answer_annotation` column. Therefore, we will build our own `answer_normalised` from `answer_annotation`.

In [4]:
# Derive the normalised utterances from the annotated ones; the converted
# frame replaces the one loaded from the CSV.
nlu_data_df = NLUEngine.convert_annotated_utterances_to_normalised_utterances(nlu_data_df)

In [5]:
# Overview of the data set: distinct domains and intents plus utterance counts.

# Compute the unique domains once and reuse the array for the count.
list_of_domains = nlu_data_df['scenario'].unique()
number_of_domains = len(list_of_domains)

list_of_intents = nlu_data_df['intent'].unique()
number_of_intents = nlu_data_df['intent'].nunique()

# Rows in the frame vs. distinct normalised utterances: the summary used to
# print the unique count twice and call it the total.
total_number_of_utterances = len(nlu_data_df)
number_of_utterances = nlu_data_df['answer_normalised'].nunique()

print(
    f'From a total of {total_number_of_utterances} utterances '
    f'({number_of_utterances} unique), there are {number_of_domains} domains '
    f'and {number_of_intents} intents.\n'
)

print(f'List of domains: {list_of_domains}\n')

print(f'List of intents: {list_of_intents}\n')

From a total of 25673 utterances, there are 18 domains, 54 intents and 25673 utterances.

List of domains: ['alarm' 'audio' 'iot' 'calendar' 'play' 'general' 'datetime' 'takeaway'
 'news' 'music' 'weather' 'qa' 'social' 'recommendation' 'cooking' 'email'
 'transport' 'lists']

List of intents: ['set' 'volume_mute' 'hue_lightchange' 'hue_lightoff' 'hue_lighton'
 'hue_lightdim' 'cleaning' 'query' 'music' 'quirky' 'greet' 'convert'
 'remove' 'likeness' 'hue_lightup' 'order' 'settings' 'volume_down' 'joke'
 'dislikeness' 'volume_other' 'coffee' 'volume_up' 'wemo_on' 'wemo_off'
 'stock' 'radio' 'post' 'locations' 'recipe' 'sendemail' 'factoid'
 'events' 'audiobook' 'podcasts' 'ticket' 'movies' 'game' 'traffic'
 'definition' 'querycontact' 'createoradd' 'addcontact' 'taxi' 'maths'
 'currency' 'negate' 'dontcare' 'repeat' 'affirm' 'commandstop' 'confirm'
 'explain' 'praise']



## Intent classification

Both the intents and the domains (scenarios/skills) can be used to label an utterance. In this example we will use domains to label the utterances' intents. 

In [6]:
# NOTE(review): `domains` is not referenced by any other cell shown in this
# notebook — confirm whether it is still needed.
domains = nlu_data_df.scenario.values

# Train a classifier (LR, imported from utils.nlu_engine) on the data set. The
# call returns the trained model and a vectorizer; both are passed to
# NLUEngine.predict_label in the example cell.
# NOTE(review): the labels are selected with 'domain' here but with 'scenario'
# in the evaluation cell — confirm that train_intent_classifier accepts both.
# NOTE(review): `data_df_path` receives a DataFrame, not a path.
LR_domain_classifier_model, tfidf_vectorizer = NLUEngine.train_intent_classifier(
    data_df_path=nlu_data_df,
    labels_to_predict='domain',
    classifier=LR
)


Training LogisticRegression(random_state=0, solver='liblinear')


Example: Let's try to predict an utterance's intent label using the domains.

In [7]:
# Classify one sample utterance with the model and vectorizer trained above.
utterance = "turn off the kitchen lights"

predicted_label = NLUEngine.predict_label(
    LR_domain_classifier_model,
    tfidf_vectorizer,
    utterance,
)
print(predicted_label)


Predicting label for utterance: turn off the kitchen lights
iot


### Create intent classifier report

In [8]:
# Name of the label column the evaluation is run against.
domain_labels = 'scenario'

# Evaluate the LR classifier on the domain labels and keep the resulting report.
domain_report_df = NLUEngine.evaluate_intent_classifier(
    data_df_path=nlu_data_df, labels_to_predict=domain_labels, classifier=LR
)

Evaluating LogisticRegression(random_state=0, solver='liblinear')
Cross validating with LogisticRegression(random_state=0, solver='liblinear')
Time it took to cross validate LogisticRegression(random_state=0, solver='liblinear'): 4.581929445266724
Generating report for LogisticRegression(random_state=0, solver='liblinear')


  df['classifier'] = df['classifier'].str.replace(r"\([^()]*\)", "")


In [9]:
# Display the per-domain report (precision, recall, f1-score, support).
domain_report_df

Unnamed: 0,domain,precision,recall,f1-score,support,classifier,encoding
0,alarm,0.987455,0.88443,0.933108,623.0,LogisticRegression,tfidf
1,audio,0.93808,0.735437,0.82449,412.0,LogisticRegression,tfidf
2,calendar,0.841411,0.91862,0.878322,2986.0,LogisticRegression,tfidf
3,cooking,0.92953,0.657957,0.770515,421.0,LogisticRegression,tfidf
4,datetime,0.846834,0.795297,0.820257,723.0,LogisticRegression,tfidf
5,email,0.94497,0.905329,0.924725,1764.0,LogisticRegression,tfidf
6,general,0.807183,0.828745,0.817822,6102.0,LogisticRegression,tfidf
7,iot,0.964623,0.924642,0.944209,1327.0,LogisticRegression,tfidf
8,lists,0.923243,0.86002,0.890511,993.0,LogisticRegression,tfidf
9,music,0.897375,0.642735,0.749004,585.0,LogisticRegression,tfidf


## Entity extraction

In [10]:
from utils.nlu_engine import EntityExtractor

It is important to have the NLTK tokenizer to be able to extract entities.

In [11]:
import nltk
# Look for the 'punkt' tokenizer data locally first and download it only when
# the lookup fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [12]:
# Train the entity classifier on the data set; the returned model is passed to
# the entity-extraction calls in the following cells.
crf_model = NLUEngine.train_entity_classifier(data_df=nlu_data_df)

Training entity classifier


Example: Let's try an example utterance for entity extraction.

In [13]:
# Sample utterance for the entity-extraction examples; this rebinds the
# `utterance` name used in the domain-prediction example.
utterance = 'wake me up at five pm this week'

We can get the entity tags of a specific utterance with the EntityExtractor.

In [14]:
# Tag the sample utterance; the recorded output is a list of
# (entity type, token) pairs.
EntityExtractor.get_entity_tags(utterance, crf_model)

[('time', 'five'), ('time', 'pm'), ('date', 'this'), ('date', 'week')]

We can also get the entity tagged utterance with the NLUEngine.

In [15]:
# Build the utterance with its entities written inline as "[type : words]".
entity_tagged_utterance = NLUEngine.create_entity_tagged_utterance(utterance, crf_model)
entity_tagged_utterance


'wake me up at [time : five pm] [date : this week]'

### Entity extraction report

Due to the error described in [this git issue](https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/60), we have to use an older version of scikit-learn (sklearn<0.24); without this error, the latest version would work. Hopefully this gets fixed one day.

In [16]:
# Cross-validate the entity classifier and show the per-entity-type report.
# NOTE: this cell is slow — the recorded run took about 452 seconds.
entity_report_df = NLUEngine.evaluate_entity_classifier(data_df=nlu_data_df)
entity_report_df

Evaluating entity classifier




Cross validating with CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)




Time it took to cross validate CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100): 452.1003019809723


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,entity-type,precision,recall,f1-score,support
0,0,0.879848,0.952758,0.914852,73769.0
1,alarm_type,0.0,0.0,0.0,30.0
2,app_name,0.7,0.460526,0.555556,76.0
3,artist_name,0.496656,0.458333,0.476726,648.0
4,audiobook_author,0.0,0.0,0.0,31.0
5,audiobook_name,0.0,0.0,0.0,301.0
6,business_name,0.372385,0.186192,0.248257,956.0
7,business_type,0.70892,0.404826,0.515358,373.0
8,change_amount,0.5,0.197605,0.283262,167.0
9,coffee_type,1.0,0.14,0.245614,50.0


## Cleaning the dataset
Now that we know what works and what doesn't, we can clean the dataset.

We don't want all of the columns, so we will drop some to review the data set

In [None]:
# Columns that are left out of the frame used for the review.
columns_left_out_of_review = ['userid', 'notes', 'answer', 'answer_normalised', 'answerid']

nlu_scenario_data_df = nlu_data_df.drop(columns=columns_left_out_of_review)


Pick a domain (scenario) to review

In [None]:
# Keep only the rows whose scenario is 'iot'.
belongs_to_iot_scenario = nlu_scenario_data_df['scenario'] == 'iot'
nlu_scenario_df = nlu_scenario_data_df[belongs_to_iot_scenario]


## Convert to ipysheet and review

We will add two boolean columns to the sheet:
* **review**: Either changes have been made or the entry should be further reviewed
* **remove**: We will drop the entry from the data set.

Look at each utterance, check the following:
* Is the utterance grammatically correct (and spelled correctly)?
* Is the utterance in the correct language?
* Is the utterance in the correct domain?
* Is the utterance in the correct format?
* Does the utterance actually make sense? (i.e. does it make sense to say it?)

If you are unsure about an entry, that is fine: mark it as **review** so that it gets a second look.

In [None]:
# Add the two reviewer columns as booleans, all starting out as False.
# Creating them directly as False gives the same bool columns as assigning
# None and casting with astype(bool).
nlu_scenario_df = nlu_scenario_df.assign(review=False, remove=False)

# Build the editable sheet once. The former second from_dataframe() call only
# produced an unused duplicate widget (`nlu_scenario_df_sheet`).
# NOTE(review): ipysheet is expected to render bool columns as checkboxes —
# confirm in the rendered sheet.
nlu_scenario_sheet = ipysheet.from_dataframe(nlu_scenario_df)
nlu_scenario_sheet

Once you are done reviewing, you convert it back to a dataframe and check to make sure it looks okay.

In [None]:
# Convert the (manually edited) sheet back into a DataFrame.
# NOTE(review): `reviwed` is a misspelling of `reviewed`; the name is kept
# because the cell that saves the CSV references it.
reviwed_scenario_df = ipysheet.to_dataframe(nlu_scenario_sheet)
# Make the index numeric again — presumably the sheet round trip returns the
# index labels as strings; TODO confirm.
reviwed_scenario_df.index = pd.to_numeric(reviwed_scenario_df.index)
# Show the last 50 rows as a visual sanity check.
reviwed_scenario_df.tail(50)

If you are sure it is okay, you can save it as a CSV file. Make sure to name it correctly (e.g. `alarm_domain_first_review.csv`).

In [None]:
# Save the reviewed rows to a CSV file in the working directory; the file name
# must match the domain that was reviewed.
reviwed_scenario_df.to_csv('iot_domain_first_review.csv')


Load it back up and check to make sure it looks okay. Make sure to give it the right name!

In [None]:
# Reload the saved review to check the round trip; the first CSV column is
# used as the index. The variable is named after the domain that the file
# actually contains (iot); it was previously called `audio_...`.
iot_domain_first_review_df = pd.read_csv('iot_domain_first_review.csv', index_col=0)
iot_domain_first_review_df.tail(50)

In [None]:
# TODO: implement the evaluate_classifier in the NLU engine to check f1 score for intents and entities in the domain vs original NLU data of domain!
# Value: benchmark!

In [None]:
# TODO: concat all reviewed dfs and save to csv

In [None]:
# TODO: add benchmark for whole NLU data set before and after cleaning! (by intents and domains!)
# TODO: review the review marked entries
# TODO: add new column for notes
# TODO: change flow of review for only ones that should be reviewed, not all of the ones that have been changed (track changes by comparing against the original data set)
# TODO: do the changed utterances have to be changed in other fields too, or is it enough to change the tagged utterance field?

In [None]:
# TODO: add visualizations of domains, their intents, keywords in utterances, and entities