# JSON PARSER

In [None]:
def parse_output(df):
    """Expand the JSON strings stored in ``df.OUTPUT`` into one column per key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``OUTPUT`` column of JSON-object strings. Missing values
        (``None``/``NaN``) and empty strings are treated as an empty object.

    Returns
    -------
    pandas.DataFrame
        Columns ``OUTPUT_score_intent``, ``OUTPUT_intent``, ``OUTPUT_intents``
        and ``OUTPUT_entities`` (in that order), one row per row of ``df`` and
        in the same order. The index is a fresh ``RangeIndex`` (``df.index`` is
        not carried over). Keys missing from a record, or from every record,
        show up as ``NaN`` instead of raising ``KeyError``.
    """
    def _loads(raw):
        # pandas marks missing values with float NaN, which is truthy: `raw or '{}'`
        # alone would pass NaN straight to json.loads and raise TypeError.
        if not isinstance(raw, (str, bytes, bytearray)) and pd.isna(raw):
            return {}
        return json.loads(raw or '{}')

    # Convert string to JSON dict, then build a dataframe with each key as a new column
    records = df.OUTPUT.apply(_loads).to_list()
    output_df = pd.DataFrame(records)

    # reindex (rather than [[...]]) both re-orders and creates any absent column,
    # so empty input or records without a given key do not crash the selection
    output_df = output_df.reindex(columns=['score', 'intent', 'intents', 'entities'])

    return output_df.rename(columns={
        'score': 'OUTPUT_score_intent',
        'intent': 'OUTPUT_intent',
        'intents': 'OUTPUT_intents',
        'entities': 'OUTPUT_entities',
    })

In [None]:
def parse_entity(output_df):
    """Unstack ``OUTPUT_entities`` into one row per detected entity.

    Parameters
    ----------
    output_df : pandas.DataFrame
        Frame with an ``OUTPUT_entities`` column whose cells are lists of
        entity dicts (or empty lists / missing values).

    Returns
    -------
    pandas.DataFrame
        Columns ``ENTITY_type``, ``ENTITY_entity``, ``ENTITY_label``,
        ``ENTITY_canon``, ``ENTITY_start_index``, ``ENTITY_end_index`` and
        ``ENTITY_score`` (in that order). Each row keeps the index label of the
        utterance it came from, so an utterance with k entities contributes k
        rows sharing one label; utterances without entities contribute none.
        Keys absent from the entity dicts yield ``NaN`` columns.
    """
    # For each utterance, create a new entry per entity detected. explode() repeats
    # the utterance's index label per list element and turns empty lists into NaN,
    # so dropna() discards utterances with no entity (and missing cells).
    single_entities = output_df['OUTPUT_entities'].explode().dropna()

    # JSON to DataFrame: one column per key. The index is passed explicitly because
    # .to_list() loses it, and it is the only reference back to the source utterance.
    entities_df = pd.DataFrame(single_entities.to_list(), index=single_entities.index)

    # reindex re-orders and also creates any absent column (e.g. when there are no entities at all)
    entities_df = entities_df.reindex(
        columns=['type', 'entity', 'label', 'canon', 'start_index', 'end_index', 'score'])

    return entities_df.rename(columns={
        'type': 'ENTITY_type',
        'entity': 'ENTITY_entity',
        'label': 'ENTITY_label',
        'canon': 'ENTITY_canon',
        'start_index': 'ENTITY_start_index',
        'end_index': 'ENTITY_end_index',
        'score': 'ENTITY_score',
    })

In [None]:
# Delete entries with a repeated CORR_ID (caused by upstream bugs). Once those bugs are fixed, this function will no longer be necessary

def delete_repeated_entries(df):
    """Remove rows whose ``CORR_ID`` is repeated (``CORR_ID`` should be unique).

    Two workarounds are applied in order:

    1. every row whose ``CORR_ID`` appears exactly twice is deleted (both copies);
    2. for any ``CORR_ID`` still repeated, only the last occurrence is kept.

    Rows are selected with boolean masks rather than dropped by index label, so
    the result is correct even when ``df`` has repeated index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``CORR_ID`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, in their original order, with a fresh ``RangeIndex``.
    """
    # Fix (or bypass) for a bug that assigns the same CORR_ID to different samples:
    # find every CORR_ID that appears exactly twice and delete all of its rows
    corr_id_count = df['CORR_ID'].value_counts()  # missing CORR_IDs are not counted
    corr_ids_twice = corr_id_count.index[corr_id_count == 2]
    df = df.loc[~df['CORR_ID'].isin(corr_ids_twice)]

    # Fix (or bypass) for a bug that repeats an entry n times with the same CORR_ID,
    # where only the last one is valid: keep just the last row of each repetition
    df = df.loc[~df.duplicated('CORR_ID', keep='last')]

    # Reset indices to avoid problems in later list-to-dataframe conversions
    # (where indices are lost) and concatenations of dataframes
    return df.reset_index(drop=True)

In [None]:
# Standardizes the columns of all dataframes according to the latest version

def standarize_columns(df, filename):
    """Align ``df``'s columns with the canonical header list ``headers_canon``.

    Parameters
    ----------
    df : pandas.DataFrame
        Log dataframe to standardize.
    filename : str
        Path of the file ``df`` was read from, using ``/`` separators. The date
        token is the 4th ``-``-separated field of the base name, cut at its
        first ``.``; the comparison with ``'201905'`` suggests a ``YYYYMM``
        format. A base name with fewer than four fields raises ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``num_file`` renamed to ``numFile``, exactly the
        columns of ``headers_canon`` (a module-level name defined outside this
        function), and ``userType`` set to ``'S'`` on every row unless the
        date token is ``'201905'``.
    """
    base_name = filename.split("/")[-1]
    df_date = base_name.split("-")[3].split(".")[0]  # get dataframe's year-month

    df = df.rename(columns={'num_file': 'numFile'})  # rename num_file column to numFile
    df = df.reindex(columns=headers_canon)  # add missing columns to avoid columns mismatch when concatenating

    if df_date != '201905':  # for all log files but May (the most complete one according to header's canon)...
        # ...artificially fill all users as 'S' users (standard users). Item assignment is used
        # because attribute assignment (df.userType = ...) does not create the column when it is absent.
        df['userType'] = 'S'

    return df

In [None]:
from sklearn import preprocessing
import pandas as pd
import os


def _load_recognizer_log(file_path):
    """Read one concatenated recognizer log and add the DOMAIN / local-time columns."""
    # dtype is given explicitly because the first values of these columns are
    # empty, so pandas cannot infer their type properly
    log_df = pd.read_csv(file_path, encoding='utf-8', sep=',', parse_dates=['RECOGNIZER_DT'],
                         dtype={'REASON': str, 'USER_ID_GLOBAL': str, 'INTENT_RAW': str})
    log_df['DOMAIN'] = log_df.INTENT.str.split('.').str[0]  # domain of the query/utterance: text before the first '.'
    log_df['RECOGNIZER_DT'] = log_df['RECOGNIZER_DT'].dt.tz_convert(tz='Europe/Madrid')  # convert UTC time to local time
    return log_df


def _drop_unusable_entries(log_df):
    """Keep rows that have an intent and/or an entity, and a non-null USER_ID_GLOBAL."""
    has_content = ~(log_df.INTENT.isnull() & log_df.ENTITIES.isnull())
    has_user = ~log_df.USER_ID_GLOBAL.isna()
    # copy() so that later column assignments do not target a view of the unfiltered frame
    return log_df.loc[has_content & has_user].copy()


def _build_date_frame(log_df, label_column, prefix):
    """Build the ``<prefix>``-datetime frame (label, domain, timestamp parts, ids) for one log."""
    # Transform non-numerical labels to numerical labels (one encoder per column)
    numeric_label = preprocessing.LabelEncoder().fit_transform(log_df[label_column])
    numeric_dom = preprocessing.LabelEncoder().fit_transform(log_df.DOMAIN)

    stamps = log_df.RECOGNIZER_DT  # timestamp for each entry

    # Keys are listed in the final column order
    return pd.DataFrame({
        'recognizer_id': log_df.RECOGNIZER_ID,
        'channel': log_df.CHANNEL_CD,
        'corr_id': log_df.CORR_ID,
        'datetime': stamps,
        'year_week': stamps.dt.strftime('%Y-%W'),  # %W: days of a new year before its first Monday are in week 0
        'year_month': stamps.dt.strftime('%Y-%m'),
        'year': stamps.dt.year,
        'month': stamps.dt.month,
        'weekday': stamps.dt.weekday,
        'hour': stamps.dt.hour,
        'user_id_global': log_df.USER_ID_GLOBAL,
        'domain_label': log_df.DOMAIN,
        'domain_num': numeric_dom,
        prefix + '_label': log_df[label_column],
        prefix + '_num': numeric_label,
        'user_type': log_df.userType,
    }, index=log_df.index)


def format_logs(data_path):
    """Load the concatenated recognizer logs and build per-entity and per-intent frames.

    Reads ``PARSED-ES-RECOGNIZER-NLP-concat.csv.bz2`` (entities) and
    ``ES-RECOGNIZER-NLP-concat.csv.bz2`` (intents) from ``data_path``, drops
    rows with neither intent nor entity or without ``USER_ID_GLOBAL``, and
    label-encodes the entity type / intent and the domain.

    The user id is read from ``USER_ID_GLOBAL`` throughout. Earlier code mixed
    it with ``AURA_ID_GLOBAL``, which left no ``user_id_global`` column to select.
    NOTE(review): confirm the CSVs expose ``USER_ID_GLOBAL``.

    Parameters
    ----------
    data_path : str
        Directory containing the two log files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ent_date_df, int_date_df)``. Both have the columns ``recognizer_id,
        channel, corr_id, datetime, year_week, year_month, year, month,
        weekday, hour, user_id_global, domain_label, domain_num``, followed by
        ``entity_label, entity_num`` or ``intent_label, intent_num``, then
        ``user_type``.
    """
    # Load both logs up front (entities first, then intents)
    dfEnt = _load_recognizer_log(os.path.join(data_path, 'PARSED-ES-RECOGNIZER-NLP-concat.csv.bz2'))
    dfInt = _load_recognizer_log(os.path.join(data_path, 'ES-RECOGNIZER-NLP-concat.csv.bz2'))

    ## ENTITIES
    dfEnt = _drop_unusable_entries(dfEnt)
    # Entries with an intent but no entity (tv.on, common.greetings, ...) get a placeholder
    # so the label encoder does not fail, and so they can still be crossed with dfInt
    dfEnt['ENTITY_type'] = dfEnt.ENTITY_type.fillna('intent_but_no_entity')
    ent_date_df = _build_date_frame(dfEnt, 'ENTITY_type', 'entity')

    ## INTENTS
    dfInt = _drop_unusable_entries(dfInt)
    int_date_df = _build_date_frame(dfInt, 'INTENT', 'intent')

    return ent_date_df, int_date_df


# Directory holding the concatenated recognizer logs (machine-specific absolute path; adjust before running)
input_path = '/home/kike/Documentos/data/logs_recognizer/concat'
# Build the entity-level and intent-level datetime dataframes from the logs under input_path
ent_df, int_df = format_logs(input_path)

print("Done!")
