In [None]:
from nlu_engine import NLUEngine
from nlu_engine import DataUtils
from nlu_engine import IntentMatcher, LR
from nlu_engine import EntityExtractor
import nltk

# Example of intent and entity classification with NLU engine class
This is just a small example notebook to help users understand how to use the NLU engine.

* Intent example
* Entity example

Load the dataset. For this example we will use the cleaned dataset, although you can load any dataset you like.

In [None]:
# Read the cleaned, annotated home-domain NLU dataset from its CSV file.
# NOTE(review): later cells read columns such as `scenario` and `intent` from
# the result, so this is presumably a pandas DataFrame — confirm in DataUtils.
nlu_data_df = DataUtils.load_data(
    'data/NLU-Data-Home-Domain-Annotated-All-Cleaned.csv'
)

## Intent classification: example of a single utterance

Both the intents and the domains (scenarios/skills) can be used to label an utterance. In this example we will use domains to label the utterances' intents. 

In [None]:
#TODO: refactor
# NOTE(review): `domains` is not read by any other cell visible in this notebook.
domains = nlu_data_df.scenario.values

# Train a classifier that predicts the `scenario` (domain) column, using the LR
# classifier imported from nlu_engine. The call returns two objects, bound here
# as the model and the vectorizer that the prediction cell passes along with it.
# NOTE(review): the keyword is named `data_df_path` but receives the loaded
# data object rather than a path string — confirm this is what it expects.
LR_domain_classifier_model, tfidf_vectorizer = NLUEngine.train_intent_classifier(
    data_df_path=nlu_data_df,
    labels_to_predict='scenario',
    classifier=LR
)


In [None]:
# NOTE(review): `intent` is not read by any other cell visible in this notebook.
intent = nlu_data_df.intent.values

# Train a second classifier that predicts the `intent` column.
# The results are bound to their own names on purpose: reusing
# `LR_domain_classifier_model` / `tfidf_vectorizer` here would overwrite the
# domain classifier trained in the previous cell, and the prediction example
# that follows (described as using the domains) would silently run the intent
# model instead.
# NOTE(review): the keyword is named `data_df_path` but receives the loaded
# data object rather than a path string — confirm this is what it expects.
LR_intent_classifier_model, intent_tfidf_vectorizer = NLUEngine.train_intent_classifier(
    data_df_path=nlu_data_df,
    labels_to_predict='intent',
    classifier=LR
)


Example: Let's try to predict an utterance's intent label using the domains.

In [None]:
utterance = "turn off the kitchen lights"

# Predict a label for the utterance with whichever classifier/vectorizer pair
# is currently bound to these two names, and print the result.
print(IntentMatcher.predict_label(
    LR_domain_classifier_model, tfidf_vectorizer, utterance))

## Entity extraction

The entity extraction could be greatly improved by improving the features it uses, and it would be great if someone took a look at this. CRF features similar to those Snips uses, such as Brown clustering, would probably work better.

The NLTK `punkt` tokenizer must be available to extract entities, so the next cell downloads it if it is missing.

In [None]:
# Download the NLTK 'punkt' tokenizer data only when it is not already
# available: nltk.data.find raises LookupError for a resource it cannot locate.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

### Example: Extracting entities from an utterance

In [None]:
# Train the entity classifier on the annotated dataset. `crf_model` is passed
# to the entity-tagging calls in the cells below.
crf_model = NLUEngine.train_entity_classifier(data_df=nlu_data_df)

In [None]:
# Sample utterance for the entity examples; this rebinds the `utterance` name
# used by the intent example earlier in the notebook.
utterance = 'wake me up at five pm this week'

We can get the entity tags of a specific utterance with the EntityExtractor.


In [None]:
# Tag the sample utterance with the trained entity model; the returned value is
# shown as the cell's output.
EntityExtractor.get_entity_tags(utterance, crf_model)

We can also get the entity tagged utterance with the NLUEngine.


In [None]:
# Build the entity-tagged form of the sample utterance from the same utterance
# and trained entity model used in the previous cell.
entity_tagged_utterance = NLUEngine.create_entity_tagged_utterance(
    utterance, crf_model)

# Bare expression so the notebook displays the result.
entity_tagged_utterance

In [None]:
#TODO remove everything from here (perhaps move it into another notebook?), this was just to quickly evaluate entity matching using spaCy for PoS.

In [None]:
import spacy

# Parse one example sentence with the spaCy pipeline so its per-token
# annotations can be inspected below.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Get busy living or get busy dying.")

# Header row; the column widths match the per-token rows printed in the loop.
print(f"{'text':8} {'POS':6} {'TAG':6} {'Dep':6} {'POS explained':20} {'tag explained'} ")
for token in doc:
    # spacy.explain() returns None for a label that is missing from its
    # glossary, and None cannot be formatted with a width spec (TypeError),
    # so fall back to an empty string for the padded column.
    pos_explained = spacy.explain(token.pos_) or ''
    print(f'{token.text:8} {token.pos_:6} {token.tag_:6} {token.dep_:6} {pos_explained:20} {spacy.explain(token.tag_)}')

In [None]:
# Pair every token of the parsed sentence with its `tag_` value.
list_of_words_and_tags = [(token.text, token.tag_) for token in doc]

# Bare expression so the notebook displays the list.
list_of_words_and_tags


In [None]:
# Run the same sentence through EntityExtractor's own PoS tagger so its output
# can be compared with the spaCy tags listed above.
EntityExtractor.pos_tag_utterance(
    utterance="Get busy living or get busy dying.")


In [None]:
# Evaluate the entity classifier as implemented inside NLUEngine on the full
# dataset and keep the resulting report.
entity_reviewed_report_df = NLUEngine.evaluate_entity_classifier(
    data_df=nlu_data_df)

In [None]:
# Persist the NLUEngine evaluation report as CSV under data/.
# NOTE(review): the file name suggests NLUEngine's tagger is NLTK-based — confirm.
entity_reviewed_report_df.to_csv('data/nltk_pos_entity_report.csv')

In [None]:
# Bare expression so the notebook displays the NLUEngine evaluation report.
entity_reviewed_report_df


In [None]:
from nlu_engine import Analytics
from nlu_engine.entity_extractor import crf

In [None]:
import spacy

# Load the spaCy "en_core_web_sm" pipeline; `nlp` is the global that
# spacy_pos_tag_utterance in the next cell calls.
nlp = spacy.load("en_core_web_sm")


In [None]:
def spacy_pos_tag_utterance(utterance):
    """
    Tags every token of an utterance using the module-level spaCy pipeline `nlp`.

    :param utterance: text handed to `nlp`
    :return: list of (token text, token `tag_`) tuples in token order
    """
    return [(tok.text, tok.tag_) for tok in nlp(utterance)]

def create_feature_dataset(data_df):
    """
    Builds one combined entity/PoS entry per annotated utterance.

    The 'answer_normalised' and 'answer_annotation' columns of data_df are read
    in parallel: entities are extracted from the annotated text, the normalised
    text is PoS-tagged with spaCy, and the two are merged with
    EntityExtractor.combine_pos_and_entity_tags.

    :param data_df: data providing the 'answer_normalised' and
        'answer_annotation' columns
    :return: list holding one combined entry per row, in row order
    """
    paired_rows = zip(data_df['answer_normalised'], data_df['answer_annotation'])
    return [
        EntityExtractor.combine_pos_and_entity_tags(
            EntityExtractor.extract_entities(annotated_text),
            spacy_pos_tag_utterance(plain_text),
        )
        for plain_text, annotated_text in paired_rows
    ]

def get_targets_and_labels(data_df):
    """
    Turns the annotated utterances into classifier inputs and labels.

    :param data_df: data accepted by create_feature_dataset
    :return: tuple (X, y) where X holds EntityExtractor.utterance2features and
        y holds EntityExtractor.utterance2labels of every feature-dataset entry
    """
    tagged_utterances = create_feature_dataset(data_df)
    X = list(map(EntityExtractor.utterance2features, tagged_utterances))
    y = list(map(EntityExtractor.utterance2labels, tagged_utterances))
    return X, y

def evaluate_entity_classifier(data_df):
    """
    Cross-validates the entity classifier on spaCy-tagged features and returns
    the resulting classification report.

    :param data_df: data accepted by get_targets_and_labels
    :return: whatever Analytics.generate_entity_classification_report produces
        for the cross-validated predictions and the true labels
    """
    print('Evaluating entity classifier')

    features, labels = get_targets_and_labels(data_df)
    return Analytics.generate_entity_classification_report(
        Analytics.cross_validate_classifier(crf, features, labels),
        labels,
    )


In [None]:
# Run the notebook-local evaluation defined above (spaCy-based PoS tags) on the
# full dataset, then display the report as the cell's output.
entity_spacy_report_df = evaluate_entity_classifier(nlu_data_df)
entity_spacy_report_df

In [None]:
# Persist the spaCy-based evaluation report as CSV under data/.
entity_spacy_report_df.to_csv('data/spacy_entity_report.csv')