# NLP NORMALIZER

In [None]:
import os
import sys
import logging
from datetime import datetime
from unidecode import unidecode
import emoji
import re
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from ast import literal_eval as lit_eval

from config_utils import Config, Log
from phrase_modeling.phrase_modeling import PhraseModeling

In [None]:
class Normalizer(object):
    """
    Normalizes a DataFrame of utterances: filters rows by recognizer and intent score, extracts the
    recognized entities from the OUTPUT JSON, cleans the INPUT text and replaces entity strings by
    their entity type.
    """

    def __init__(self,
                 thr_entity=0.8,
                 thr_intent=0.8,
                 stopwords=None,
                 bg=False):
        """
        Punctuation symbols and the entity frequency filter are read from the config.ini file, they are
        not constructor parameters.

        :param thr_entity: float, threshold value for filtering low entity classification scores
        :param thr_intent: float, threshold value for filtering low intent classification scores
        :param stopwords: List of strings, stopwords. When None (default) the Spanish stopwords provided
         by NLTK are loaded
        :param bg: boolean, whether or not to compute phrase modeling over the utterances
        """
        if stopwords is None:
            # Resolved lazily so that the corpus is not loaded when the class is defined and the list is
            # not shared between instances. Aliased because the parameter shadows the module-level name.
            from nltk.corpus import stopwords as nltk_stopwords
            stopwords = nltk_stopwords.words('spanish')

        config_path = "/".join(os.path.abspath(__file__).split("/")[:-3]) + "/config/config.ini"
        # NOTE(review): get_params returns None when the config cannot be loaded, so this unpacking
        # raises TypeError in that case.
        paths, params, logs = self.get_params(config_path)
        self.thr_entity = thr_entity
        self.thr_intent = thr_intent
        self.stopwords = stopwords
        self.detokenizer = TreebankWordDetokenizer()
        # Translation table mapping every punctuation symbol to a blank space
        self.punctuation = str.maketrans({key: " " for key in params["punctuation"]})
        self.entity_filter = params["entity_filter"]
        self.bg = bg
        # Values of the 'index' column of the rows whose entities could not be replaced
        self.not_matched_idx = []

        # From here on the instance attribute shadows the static method on this instance
        self.log_level = Normalizer.log_level(logs["log_level"])
        self.logger, self.log_normalizer, self.path_logs = self.__run_log(logs,
                                                                          paths,
                                                                          self.log_level)

    @staticmethod
    def get_params(config_path):
        """
        This function loads parameters from config.ini file

        :param config_path: string, path to the config.ini file
        :return: tuple (paths, params, logs) of dictionaries. If any exception is raised it is printed
         and None is returned
        """
        try:
            config = Config(config_path)

            paths = dict()
            params = dict()
            logs = dict()

            paths["general_path"] = "/".join(os.path.abspath(__file__).split("/")[:-1])
            paths["log_path"] = "/".join(os.path.abspath(__file__).split("/")[:-2]) +\
                                "/logs"

            # NOTE(review): eval executes whatever expression is stored in the config file. Keep the
            # config file trusted, or switch to ast.literal_eval if the values are plain literals.
            params["punctuation"] = eval(config.get_config("FILTER_PARAMS", "PUNCTUATION"))
            params["entity_filter"] = eval(config.get_config("FILTER_PARAMS", "ENTITY_FILTER"))

            logs["log_level"] = config.get_config("LOGS", "LOG_LEVEL").replace('"', '')

            return paths, params, logs
        except Exception as e:
            print(e)

    def __run_log(self,
                  logs,
                  paths,
                  log_level):
        """
        This function loads the log for the process.
        Creates the log directory if it does not exist and builds a timestamped log file name.

        :param logs: dict, updated in place with the key "log_normalizer" (full path of the log file)
        :param paths: dict, must contain "log_path" and "general_path". Updated in place with the key
         "log_file_name"
        :param log_level: logging level passed to Log
        :return: tuple (logger, log file path, log directory). If any exception is raised it is printed
         and None is returned
        """
        try:
            paths["log_file_name"] = datetime.now().strftime('%Y-%m-%d_%H_%M_%S') + "_normalizer.log"
            logs["log_normalizer"] = paths["log_path"] + "/" + paths["log_file_name"]

            if not os.path.exists(paths["log_path"]):
                os.chmod(paths["general_path"], 0o777)
                os.makedirs(paths["log_path"])
                os.chmod(paths["log_path"], 0o777)

            log = Log("Normalizer_EmbeddingsEvaluation", log_level, logs["log_normalizer"])

            logger = log.set_log()
            return logger, logs["log_normalizer"], paths["log_path"]
        except Exception as e:
            print(e)

    @staticmethod
    def log_level(log_level):
        """
        Parses log level from config file.
        Now possible values are INFO and DEBUG.

        :param log_level: value read from the config file, surrounding double quotes are ignored
        :return: logging.DEBUG if the value is "debug" (case insensitive), logging.INFO otherwise
        """
        try:
            if str(log_level).replace('"', '').lower() == "debug":
                return logging.DEBUG
            return logging.INFO
        except Exception as e:
            print(e)

    def normalize(self, df, tqdm_call=None):
        """
        Runs the whole normalization over the DataFrame. The input DataFrame is modified in place
        (rows are dropped, the index is reset and new columns are added).

        :param df: pandas DataFrame, the columns RECOGNIZER_ID, OUTPUT, SCORE_NU and INPUT are read
        :param tqdm_call: tqdm class used for the progress bars. When None (default) tqdm.tqdm is used
        :return df_proc: pandas DataFrame
        """
        if tqdm_call is None:
            from tqdm import tqdm as tqdm_call
        tqdm_call().pandas()

        # Start from a clean list, otherwise the indexes stored by a previous call would flag unrelated
        # rows of this DataFrame
        self.not_matched_idx = []

        # Filter rows with column RECOGNIZER_ID == 'private-command-recognizer'
        self.logger.info("Filter rows with column RECOGNIZER_ID == 'private-command-recognizer'")
        discard = (df.RECOGNIZER_ID == 'private-command-recognizer') | (df['OUTPUT'].astype(str) == 'nan')
        df.drop(df.loc[discard].index, axis=0, inplace=True)
        # The previous index is kept as the column 'index', it identifies the rows from now on
        df.reset_index(inplace=True)

        # Filter rows by Intent classifier score
        # Create UNDER_THR column where 1 means that the thr has not been exceeded (candidates for being removed) and
        # 0 otherwise
        self.logger.info("Filter rows by Intent classifier score")
        low_score = (df['SCORE_NU'] < self.thr_intent) | np.isnan(df['SCORE_NU'])
        df['UNDER_THR'] = np.where(low_score, 1, 0)
        df.drop(df.loc[low_score].index, axis=0, inplace=True)
        df.reset_index(inplace=True)

        # Create entity map column
        self.logger.info("Creating Entity_map")
        df['ENTITY_MAP'] = df['OUTPUT'].progress_apply(self.__get_extraction_entities)

        # Apply entity frequency filter
        self.logger.info("Entity frequency filter")
        df['ENTITY_MAP'] = self.__entity_freq_filt(df)

        # Lowercase and remove left and right spaces
        self.logger.info("Lowercasing and removing spaces")
        df['INPUT_PROC'] = df['INPUT'].str.strip().str.lower()

        # Remove diacritical marks, remove punctuation, replace emojis by special token
        self.logger.info("Remove diacritical marks, remove punctuation, replace emojis by special token")
        df['INPUT_PROC'] = df['INPUT_PROC'].progress_apply(self.__clean_text)

        # Create INPUT_ENTITY column by replacing the entity_string by its entity_type
        # Create INPUT_PROC_BG_1 aggregating multiple-word entities in a single token (bigram, trigram, ...)
        # based on those recognized by NER model
        self.logger.info("Create INPUT_ENTITY ")
        df[['INPUT_ENTITY', 'INPUT_PROC_BG_1']] = df.progress_apply(self.__replace_entities, axis=1)

        # If the entity was not matched in the input string because some exception occurred, update the UNDER_THR value
        # to 1
        df.loc[df['index'].isin(self.not_matched_idx), 'UNDER_THR'] = 1

        # Phrase modeling if required
        if self.bg:
            self.logger.info("Applying Phrase Modeling.")
            bgm = PhraseModeling()
            df = bgm.fit(df)

        return df

    def __clean_text(self, text):
        """
        Remove diacritical marks and punctuation from a single utterance and replace emojis by the
        special token <emoji>.

        :param text: utterance, converted with str() before being processed
        :return: string, cleaned utterance
        """
        text = str(text)
        if emoji.emoji_count(text) == 0:
            return re.sub(" +", " ", unidecode(text).translate(self.punctuation))
        # Punctuation is removed before demojize so that the ':' delimiters that demojize adds are kept
        # for the substitution
        return unidecode(re.sub(r'\:(.+?)\:', '<emoji>', emoji.demojize(text.translate(self.punctuation))))

    def __replace_entities(self, df):
        """
        Match entity strings with those stored in the entity map for replacing the strings by its entity_type in the
         utterance.
        Any exception (for instance an entity string that is not found in the utterance) is logged and the
        value of the field 'index' is appended to self.not_matched_idx.

        :param df: pandas Series, one row of the DataFrame. The fields INPUT_PROC and ENTITY_MAP are read,
         and the field 'index' when the replacement fails
        :return: pandas Series of two elements: the utterance where the entities have been replaced by
         "[entity_type]" and the utterance where the spaces inside every entity have been replaced by "_".
         Both elements are the unchanged utterance if the entity map is empty and None if the replacement
         failed
        """
        # Processed utterance where we are going to search the entities stored as values in the ent_map
        input_proc = df['INPUT_PROC']
        # Dictionary of ent_type:[ent_str]
        ent_map = df['ENTITY_MAP']
        replace_dict = {}
        replace_dict_2 = {}
        try:
            # Append in a list every ent_str that match with the input_proc
            # TODO: re.sub("_", " ", v) --> v
            matches = [re.search(re.escape(re.sub("_", " ", v)), input_proc)
                       for vlist in ent_map.values() for v in vlist]
            if not matches:
                return pd.Series([input_proc, input_proc])

            # Construct the dictionary escaped(ent_str):ent_type
            for match in matches:
                # match is None when the entity is not found, the AttributeError flags the row
                ent_str = match.group(0)
                ent_token = ent_str.replace(" ", "_")
                # Reset for every match so that the tag of a previous entity is never reused.
                # The last entity type which contains the string wins.
                ent_tag = None
                for key, values in ent_map.items():
                    if ent_token in values:
                        ent_tag = key
                if ent_tag is None:
                    raise KeyError(ent_token)
                replace_dict[re.escape(ent_str)] = "[" + ent_tag + "]"
                # TODO: replace_dict_2[re.escape(ent_str)] = ent_str
                replace_dict_2[re.escape(ent_str)] = ent_token

            # Compile the patterns to find (ent_str)
            # NOTE(review): the alternatives are tried in insertion order, an entity that is a prefix of
            # another one shadows it if it was inserted first.
            pattern = re.compile("|".join(replace_dict.keys()))
            pattern_2 = re.compile("|".join(replace_dict_2.keys()))
            # Replace in single step multiple matches
            input_entity = pattern.sub(lambda m: replace_dict[re.escape(m.group(0))], input_proc)
            input_proc_bg_1 = pattern_2.sub(lambda m: replace_dict_2[re.escape(m.group(0))], input_proc)
            return pd.Series([input_entity, input_proc_bg_1])
        except Exception as e:
            self.logger.exception(e)
            # Store in a list the indexes which raised an exception for removing them
            self.not_matched_idx.append(df['index'])
            # Log the raised exceptions for future corrections. Series.get is used because the row may
            # not have all these fields and a failure here would abort the whole normalization.
            self.logger.debug("{}\t{}\n{}\n{}\n{}\n{}\n\n".format(df.get('idx', df.get('index')),
                                                                 df.get('AURA_ID'),
                                                                 df.get('INPUT'),
                                                                 df.get('OUTPUT'),
                                                                 df.get('ENTITY_MAP'),
                                                                 df.get('INPUT_PROC')))
            # Always return two elements so that the result of apply can be expanded into two columns
            # even if the first row fails
            return pd.Series([None, None])

    def __get_extraction_entities(self, json_str):
        """
        Generate dictionary with pairs: entity_type (key): list of entity_str (value)
        Entities whose score is lower than self.thr_entity are discarded.
        If any exception is raised it is logged and the entities collected so far are returned.

        :param json_str: string, JSON document with the key 'entities' (list of objects with the keys
         'score', 'entity' and 'type')
        :return ent_map: dict, entity_tag(key) : list of entity strings (value)
        """
        ent_map = {}
        try:
            json_obj = json.loads(json_str)
            for d in json_obj['entities']:
                # Filter by entity score. If the thr is not exceeded, the token is not considered as a recognized entity
                if d['score'] < self.thr_entity:
                    continue
                # Normalize the string of the entity
                value = unidecode(
                    self.detokenizer.detokenize(d['entity'].split(" ")).strip().lower()).translate(
                    self.punctuation)
                # The prefix 'tef' of the entity type is renamed to 'ent'
                ent_type = d['type'].split(".")
                if ent_type[0] == 'tef':
                    ent_type[0] = 'ent'
                ent_type = ".".join(ent_type)
                # TODO: Convert numeric strings to words avoiding the problem when it's treated as float
                # TODO: ent_map[ent_type].append(value)
                ent_map.setdefault(ent_type, []).append(value.replace(" ", "_"))
        except Exception as e:
            self.logger.exception(e)

        return ent_map

    def build_entity_freq_dict(self, df):
        """
        Construct entity frequency dictionary

        :param df: pandas DataFrame with the column ENTITY_MAP. Every value is a dictionary
         entity_type: list of entity strings, or its string representation
        :return entities_dict: Dictionary, entity_type: {entity string: number of occurrences}
        """
        entities_dict = {}
        for d in df.ENTITY_MAP:
            if isinstance(d, str):
                d = lit_eval(d)
            for k, v in d.items():
                counts = entities_dict.setdefault(k, {})
                for val in v:
                    counts[val] = counts.get(val, 0) + 1
        return entities_dict

    def filter_entity_dict(self, entity_dict):
        """
        Keep only the entity strings whose number of occurrences is greater than the value configured
        in self.entity_filter for its entity type.

        :param entity_dict: Dictionary, entity_type: {entity string: number of occurrences}
        :return: Dictionary with the same structure without the entity strings under the threshold
        :raises KeyError: if an entity type is not a key of self.entity_filter
        """
        return {k: {k2: v2 for k2, v2 in v.items() if v2 > self.entity_filter[k]}
                for k, v in entity_dict.items()}

    def __entity_freq_filt(self, df):
        """
        Construct frequency dictionary of entities and filter by a min_count parameter.
        The filtered frequency dictionary is stored in self.entities_dict_filt.

        :param df: pandas DataFrame with the column ENTITY_MAP (dictionaries)
        :return new_entity_map: list of dictionaries, one per row, copy of every entity map without the
         entities which have not reached min_count threshold
        """
        # Build entity frequency dictionary
        entities_dict = self.build_entity_freq_dict(df)

        # Filter entity dict
        self.entities_dict_filt = self.filter_entity_dict(entities_dict)

        # Remove entities which have not reached min_count appearances
        new_entity_map = []
        for d in df['ENTITY_MAP']:
            new_d = {}
            for k, v in d.items():
                kept = [e for e in v if e in self.entities_dict_filt[k]]
                # Entity types without any remaining entity are not included
                if kept:
                    new_d[k] = kept
            new_entity_map.append(new_d)

        return new_entity_map
