In [None]:
import pandas as pd
import numpy as np
from typing import Text, Dict, List
import tqdm
import logging
from privatecog_suggestions.utils import get_intent_code_mapping

# Module-level logger; logging.getLogger() called without a name returns the root logger.
logger = logging.getLogger()

# Default value of RecognizerLogsFormatter's `session_lapses_min` argument (lapses expressed in minutes).
_SESSION_LAPSES_MIN = [1, 2, 5, 10, 30, 60]
# Default value of IntentEncoder's `num_max_intents_per_domain` argument.
_NUM_MAX_INTENTS_PER_DOMAIN = 20
# Default `translation_dict` of RecognizerLogsFormatter.format_logs; computed once, when this cell runs.
INTENT_CODES = get_intent_code_mapping()


In [None]:

def _get_domain(intent_name: Text) -> Text:
    """
    Get the domain name from an intent name
    """
    if "." not in intent_name:
        return intent_name
    return intent_name[:intent_name.index(".")]


In [None]:
class RecognizerLogsFormatter(object):
    """
    Formats a Recognizer logs CSV file into a pandas DataFrame, adding numeric user ids, timestamps,
    domains, encoded intents and, for every configured lapse, a per-user session id column.
    """

    def __init__(self, session_lapses_min: List[int] = _SESSION_LAPSES_MIN):
        """
        :param session_lapses_min: List of lapses in minutes to group requests into the same session.
        """
        # Copy the argument so the instance never aliases the shared module-level default list
        self.session_lapses_min = list(session_lapses_min)

    def format_logs(self, file_path: Text, output_path: Text = None, output_format="csv", run_in_notebook=False,
                    translation_dict: Dict[Text, int] = INTENT_CODES) -> pd.DataFrame:
        """
        Format a Recognizer logs file into a pandas Dataframe.
        :param file_path: path of the CSV file to read. The columns "USER_ID", "USER_ID_GLOBAL",
        "RECOGNIZER_DT" and "INTENT" are used.
        :param output_path: path to save the formatted data. Nothing is saved when it is None.
        :param output_format: Possible values: "csv"|"pick" ("pickle" is accepted too, case-insensitive).
        Only checked when output_path is given.
        :param run_in_notebook: True if this method is intended to be called within a jupyter notebook. False
        otherwise.
        :param translation_dict: dictionary (intent_name -> numerical_value) handed to IntentEncoder.
        :return Pandas Dataframe
        :raises ValueError: if output_path is given and output_format is not one of the supported values.
        """
        # Validate the output format up front so a typo does not waste the whole formatting run
        save_format = None
        if output_path is not None:
            save_format = output_format.lower()
            if save_format not in {"csv", "pick", "pickle"}:
                raise ValueError(
                    "Wrong value for argument 'output_format': {}. Valid values are 'csv' | 'pick'".format(
                        output_format))

        df = pd.read_csv(file_path)

        # To numeric categorical values...
        df["n_USER_ID"] = df["USER_ID"].astype("category").cat.codes.astype(float)
        df["n_USER_ID_GLOBAL"] = df["USER_ID_GLOBAL"].astype("category").cat.codes.astype(float)
        # Convert dates. Timestamps parsed without an offset are tz-naive and tz_convert() would reject
        # them, so they are taken to be UTC (the source time zone this method converts from).
        recognizer_dt = pd.to_datetime(df["RECOGNIZER_DT"])
        if recognizer_dt.dt.tz is None:
            recognizer_dt = recognizer_dt.dt.tz_localize("UTC")
        df["RECOGNIZER_DT"] = recognizer_dt
        # Change UTC time to Spain/Madrid UTC time
        df["RECOGNIZER_DT_CONVERT"] = recognizer_dt.dt.tz_convert(tz="Europe/Madrid")
        # Build numerical timestamps. The resolution is pinned to nanoseconds because
        # _get_sessions_per_user compares differences of this column against a nanosecond threshold.
        df["TIMESTAMP"] = df["RECOGNIZER_DT_CONVERT"].values.astype("datetime64[ns]").astype(np.int64)
        # Add auxiliary ones column to make counts easier
        df["__ONES"] = np.ones(len(df))
        # Add domain column
        df["DOMAIN"] = df["INTENT"].apply(lambda s: _get_domain(str(s)))

        # Encode intents
        intent_encoder = IntentEncoder(df, translation_dict=translation_dict)
        df["INTENT_ENCODED"] = df.INTENT.apply(
            lambda s: intent_encoder.encode_intent(str(s)))

        # Order by date
        df = df.sort_values(by=["RECOGNIZER_DT_CONVERT"])

        # Add sessions data; -1 marks rows that never receive a session id
        # (rows that do not fall in any "USER_ID_GLOBAL" group)
        session_columns = {lapse: "SESSION_{}_MIN".format(lapse) for lapse in self.session_lapses_min}
        for column in session_columns.values():
            df[column] = np.full(len(df), -1.0)

        if run_in_notebook:
            _tqdm = tqdm.tqdm_notebook
        else:
            _tqdm = tqdm.tqdm
        # Groups keep the date ordering established above, so consecutive rows of a group are
        # consecutive requests of the same user.
        for _g_name, user_df in _tqdm(df.groupby("USER_ID_GLOBAL")):
            for lapse, column in session_columns.items():
                sessions = self._get_sessions_per_user(user_df, lapse * 60)
                # One label-based assignment per user instead of one df.at call per row
                df.loc[user_df.index, column] = np.asarray(sessions, dtype=float)

        # Save dataframe
        if save_format == "csv":
            df.to_csv(output_path)
        elif save_format is not None:
            df.to_pickle(output_path)
        return df

    def _get_sessions_per_user(self, df: pd.DataFrame, max_seconds: float = 1 * 60) -> List[int]:
        """
        Get an array of indexes to group related entries. Related entries are those whose time difference is
        less than or equal to max_seconds.
        :param df: rows to split into sessions, in chronological order; its "TIMESTAMP" column (nanoseconds)
        is the only one read.
        :param max_seconds: largest gap, in seconds, between two consecutive rows of the same session.
        :return list with one session index per row of df, starting at 0 and incremented every time the gap
        to the previous row exceeds max_seconds.
        """
        max_nano_seconds = max_seconds * 1e09
        # The first diff is NaN, which never compares greater, so the first row always opens session 0
        starts_new_session = df["TIMESTAMP"].diff() > max_nano_seconds
        return starts_new_session.cumsum().tolist()


In [None]:
class IntentEncoder(object):
    """
    This class encodes intent names into numerical values.
    The translation dictionary in use (the one given to the constructor or, if none was given, the one
    built from the data) can be retrieved from the translation_dict attribute.
    """

    def __init__(self, df: pd.DataFrame, translation_dict: Dict[Text, int] = None,
                 num_max_intents_per_domain=_NUM_MAX_INTENTS_PER_DOMAIN):
        """
        :param df: DataFrame with "DOMAIN" and "INTENT" columns.
        :param translation_dict: Dictionary used for translation (intent_name -> numerical_value). When it is
        None a dictionary is built from df.
        :param num_max_intents_per_domain: size of the block of codes reserved for each domain when the
        translation dictionary is built from df.
        """
        self.num_max_intents_per_domain = num_max_intents_per_domain
        # Sorted domains
        self.domains = sorted(df.DOMAIN.unique())
        # Dictionary of related (ordered) intents grouped by domain
        self.intents_dict = {}
        for domain in df.DOMAIN.unique():
            self.intents_dict[domain] = sorted(df[df.DOMAIN == domain].INTENT.unique())

        if translation_dict is not None:
            self.translation_dict = translation_dict
        else:
            # Build translation dictionary: the domain in sorted position d gets the codes
            # d * num_max_intents_per_domain + 1, d * num_max_intents_per_domain + 2, ...
            first_code_offset = {domain: position * self.num_max_intents_per_domain
                                 for position, domain in enumerate(self.domains)}
            self.translation_dict = {}
            for domain, intent_list in self.intents_dict.items():
                if len(intent_list) > self.num_max_intents_per_domain:
                    # Codes past the reserved block overlap the block of the following domain
                    logger.warning(
                        "Domain %r has %d intents, more than num_max_intents_per_domain=%d: "
                        "generated intent codes may collide with those of another domain",
                        domain, len(intent_list), self.num_max_intents_per_domain)
                for rank, intent in enumerate(intent_list, start=1):
                    self.translation_dict[intent] = first_code_offset[domain] + rank

        logger.info("Intent translation dictionary: %s", self.translation_dict)

    def encode_intent(self, intent_name: str):
        """
        Assigns a numeric value to an intent name
        :param intent_name: name to look up in translation_dict.
        :return the value stored in translation_dict for intent_name, or 0 when the name is not in it.
        """
        return self.translation_dict.get(intent_name, 0)


In [None]:
def encode_intent(self, intent_name: str):
    """
    Translate an intent name into its numeric value.

    This is a module-level function: the object passed as ``self`` only needs a ``translation_dict``
    mapping. Names that are not in the mapping translate to 0.
    """
    code_by_intent = self.translation_dict
    return code_by_intent.get(intent_name, 0)

In [None]:
# TEST
# TODO: move to a unit test
# Manual smoke test: formats one logs file, saves the result as CSV and prints the resulting DataFrame.
if __name__ == "__main__":
    # Input logs CSV (relative path, resolved against the current working directory)
    file_path = "data/RECOGNIZER-NLP-201812.csv"
    # Destination of the formatted CSV
    output_path = "/tmp/formatted.csv"

    formatter = RecognizerLogsFormatter()
    df = formatter.format_logs(file_path, output_path=output_path, output_format="csv", run_in_notebook=False)

    print(df)