In [None]:
import pandas as pd
import ipysheet
import nltk

from nlu_engine import NLUEngine
from nlu_engine import MacroDataRefinement
from nlu_engine import DataUtils
from nlu_engine import IntentMatcher, LR
from nlu_engine import EntityExtractor


# Data set cleaning

Load original dataset

In [None]:
# Location of the original annotated dataset.
# NOTE(review): the export at the end of this notebook writes under 'data/';
# confirm that this input path is intentionally relative to the working directory.
raw_dataset_path = 'NLU-Data-Home-Domain-Annotated-All.csv'
nlu_data_df = DataUtils.load_data(raw_dataset_path)

Remove all entries whose `status` column contains `IRR` or `IRR_XL`; these are all incorrect entries.

In [None]:
# Flag every row whose status mentions 'IRR' (this also matches 'IRR_XL');
# rows with a missing status are treated as not matching and are kept.
irr_mask = nlu_data_df['status'].str.contains('IRR', na=False)
nlu_data_df = nlu_data_df[~irr_mask]


In some entries, `answer_annotation` does not match `answer_normalised`. Therefore, we will regenerate `answer_normalised` from `answer_annotation`.

In [None]:
# Rebuild the normalised utterances from the annotated ones.
nlu_data_df = (
    DataUtils.convert_annotated_utterances_to_normalised_utterances(nlu_data_df)
)

In [None]:
# Compute summary information about the dataset; the result is passed to
# MacroDataRefinement.rename_overlapping_intents later in this notebook.
nlu_data_info_df = MacroDataRefinement.get_data_info(nlu_data_df)
# Bare last expression so the notebook renders the result.
nlu_data_info_df

If the same intent name appears in more than one domain of the dataset, we will rename those intents so that each one is unique.

In [None]:
# Rename intents that overlap across domains, using the dataset info
# computed by MacroDataRefinement.get_data_info.
nlu_data_df = MacroDataRefinement.rename_overlapping_intents(
    nlu_data_df,
    nlu_data_info_df,
)

In [None]:
# Per this notebook's description, the upgrade adds the `predicted_label`,
# `intent_refined`, `entity_refined` and `remove` columns needed by the next step.
nlu_data_df = DataUtils.upgrade_dataframe(nlu_data_df)

Let's upgrade the dataframe to include the following columns:
* `predicted_label`,
* `intent_refined`,
* `entity_refined`,
* `remove`.

We are going to need them for the next step.

Export the cleaned dataset to a CSV file and we're done with this notebook!

In [None]:
nlu_data_df = DataUtils.load_data('data/NLU-Data-Home-Domain-Annotated-All-Cleaned.csv')


In [None]:
nlu_data_df.to_csv('data/NLU-Data-Home-Domain-Annotated-All-Cleaned.csv', sep=';')