# NLP

In [None]:
# Load Libraries

import re
import unidecode
import num2words
import unicodedata
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from lemmatizer.token_lemmatizer import TokenLemmatizer
import json
import constants as co

### Functions

In [None]:
def tokenize_text(text, word=True, tokenizer=None):
    """
    Split a text into tokens with an NLTK-style tokenizer.
    :param text: input text
    :param word: only consulted when no tokenizer is supplied:
        if truthy, a TreebankWordTokenizer is used (word-level tokens)
        otherwise a PunktSentenceTokenizer is used (sentence-level tokens)
    :param tokenizer: any object exposing ``tokenize(text)``; None by default
    :return: whatever ``tokenizer.tokenize(text)`` returns
    Examples:
        >>> text = 'This is an example sentence'
        >>> tokenize_text(text, word=True)
        ['This', 'is', 'an', 'example', 'sentence']

        >>> text = 'First example sentence. Second example sentence'
        >>> tokenize_text(text, word=False)
        ['First example sentence.', 'Second example sentence']
    """
    if tokenizer is None:
        # Pick the default tokenizer matching the requested granularity.
        tokenizer = TreebankWordTokenizer() if word else PunktSentenceTokenizer()
    return tokenizer.tokenize(text)


In [None]:

def remove_blanks(tokens):
    """
    Strip leading and trailing whitespace from every token.
    Tokens made only of whitespace become empty strings; they are kept, not dropped.
    :param tokens: list of tokens
    :return: new list with each token stripped
    Example:
        >>> tokens = ['This', ' is', '   an   ', 'example ', 'sentence']
        >>> remove_blanks(tokens)
        ['This', 'is', 'an', 'example', 'sentence']
    """
    stripped = []
    for item in tokens:
        stripped.append(item.strip())
    return stripped



In [None]:


def remove_accents(sentence):
    """
    Transliterate a sentence to plain ASCII with the unidecode package,
    which replaces accented characters by their unaccented counterparts.
    :param sentence: input sentence
    :return: value returned by ``unidecode.unidecode(sentence)``
    Example:
        >>> remove_accents('Thís ïs àn éxample')
        'This is an example'
    """
    transliterated = unidecode.unidecode(sentence)
    return transliterated


In [None]:

def remove_punctuation(sentence, regex=None):
    """
    Replace every match of a pattern in the sentence with a single space.
    With the default pattern, each character that is not an ASCII letter or
    digit (punctuation, accented letters, and whitespace itself) becomes a space.
    :param sentence: input sentence
    :param regex: pattern of the characters to replace; r'[^a-zA-Z0-9]' when None
    :return: sentence with every match replaced by ' '
    Example:
        >>> remove_punctuation('This is an example ```*sentence')
        'This is an example     sentence'
    """
    pattern = r'[^a-zA-Z0-9]' if regex is None else regex
    return re.sub(pattern, r' ', sentence)

In [None]:

def normalize_unicode_data(sentence):
    """
    Lowercase a sentence and reduce it to its ASCII characters.
    The text is NFKD-normalized first, so accented letters split into a base
    letter plus combining marks; every non-ASCII character is then dropped.
    :param sentence: input sentence
    :return: lowercase, ASCII-only sentence
    Example:
        >>> normalize_unicode_data('Niño €5')
        'nino 5'
    """
    decomposed = unicodedata.normalize('NFKD', sentence)
    ascii_bytes = decomposed.lower().encode('ascii', errors='ignore')
    return ascii_bytes.decode('utf-8')

In [None]:
def remove_stopwords(tokens, stopwords_list=None):
    """
    Remove the stopwords in a list of tokens. The comparison is case-sensitive.
    :param tokens: list of tokens
    :param stopwords_list: iterable of stopwords. When None (default), the
        Spanish stopword list from NLTK is loaded at call time
    :return: list of tokens without stopwords
    Example:
        >>> tokens = ['Esta', 'es', 'una', 'frase', 'de', 'ejemplo']
        >>> remove_stopwords(tokens, ['es', 'una', 'de'])
        ['Esta', 'frase', 'ejemplo']
    """
    if stopwords_list is None:
        # Resolved lazily: a default evaluated in the signature would load the
        # corpus at import time and share one mutable list across all calls.
        stopwords_list = stopwords.words('spanish')
    if not isinstance(stopwords_list, (set, frozenset)):
        # Hash-based lookup instead of a linear scan for every token.
        stopwords_list = set(stopwords_list)
    return [token for token in tokens if token not in stopwords_list]

In [None]:
def num_to_words(tokens, lang='es', to='cardinal', with_decimals=False):
    """
    Converts numbers (like 42) to words (like forty-two) using the num2words package.
    It supports multiple languages and also different converters
    (see the documentation: https://pypi.org/project/num2words/).
    Tokens that are not plain unsigned numbers (e.g. '-1', 'a1') are returned unchanged.
    :param tokens: list of string tokens
    :param lang: see the documentation to see the supported values
    :param to: Supported values: 'cardinal', 'ordinal', 'ordinal_num', 'year', 'currency'
    :param with_decimals: when True, tokens of the form '<digits>.<digits>'
        are converted as well; when False they are left untouched
    :return: list of tokens with the numbers converted to words
    Example:
        >>> tokens = ['1', '2', '100', '-1']
        >>> num_to_words(tokens)
        ['uno', 'dos', 'cien', '-1']

        >>> tokens = ['1992']
        >>> num_to_words(tokens, to='year')
        ['mil novecientos noventa y dos']
    """
    def convert(token):
        # isdecimal() (unlike isdigit()) only accepts characters int() can
        # parse, so tokens such as '²' pass through instead of raising.
        if token.isdecimal():
            return num2words.num2words(int(token), lang=lang, to=to)
        if with_decimals:
            # '1.5'.isdigit() is False, so decimals need their own check;
            # both sides of the single dot must be non-empty digit runs.
            whole, dot, fraction = token.partition('.')
            if dot and whole.isdecimal() and fraction.isdecimal():
                return num2words.num2words(float(token), lang=lang, to=to)
        return token

    return [convert(token) for token in tokens]

In [None]:
def tokens_stemming(tokens, stemmer=None):
    """
    Stem a list of tokens.
    :param tokens: list of tokens
    :param stemmer: object exposing ``stem(token)``. When None (default), a
        PorterStemmer is created at call time
    :return: list with the stem computed for each token
    Example:
        >>> tokens = ['pelicula', 'peliculas']
        >>> tokens_stemming(tokens)
        ['pelicula', 'pelicula']
    """
    if stemmer is None:
        # Built here rather than in the signature so that nothing is
        # instantiated at import time or shared between calls.
        stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [None]:
def normalize_censured_swearwords(sentence, alias='insult'):
    """
    Standardize all the censored swearwords with an alias.
    A token counts as censored when it contains '*' and starts with a letter.
    The sentence is tokenized with NLTK's word_tokenize and re-joined with
    single spaces, so the original spacing is not preserved.
    :param sentence: input sentence
    :param alias: desired tag to replace the censored word
    :return: sentence with censored swearwords standardized with the alias
    """
    words = word_tokenize(sentence)
    # Build a new token list; assigning into the token itself (a str) is not
    # possible and raised TypeError on the first censored word.
    normalized = [
        alias if '*' in word and word[0].isalpha() else word
        for word in words
    ]
    return ' '.join(normalized)

In [None]:
def wordnet_token_lemmatization(tokens, lemmatizer=None):
    """
    Lemmatize a list of tokens. Does not work very well for Spanish.
    :param tokens: list of tokens
    :param lemmatizer: object exposing ``lemmatize(token)``. When None
        (default), a WordNetLemmatizer is created at call time
    :return: list with the lemma computed for each token
    Example:
        >>> tokens = ['pelicula', 'peliculas']
        >>> wordnet_token_lemmatization(tokens)
        ['pelicula', 'peliculas']
    """
    if lemmatizer is None:
        # Built here rather than in the signature so that nothing is
        # instantiated at import time or shared between calls.
        lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]



In [None]:
def token_lemmatization(sentence, language):
    """
    Lemmatize the words of a sentence with a TokenLemmatizer.
    A new TokenLemmatizer is built for ``language`` on every call.
    :param sentence: input sentence
    :param language: desired language. Only available language Spanish for the moment
    :return: whatever ``TokenLemmatizer.lemmatize`` returns for the sentence
    """
    return TokenLemmatizer(language).lemmatize(sentence)

In [None]:

def extract_entities_from_recognizer_data(json_str):
    """
    Extract the entities found in a recognizer response serialized as JSON.
    Each word of the entity text goes through ``ut.convert_num_string``, the
    canon has its whitespace replaced by '_', and the type keeps only its
    second dot-separated component.
    >>> extract_entities_from_recognizer_data('{"score":0.4292148,"intent":"tef.int.es_ES.mp.tv.search","intents":[{"intent":"tef.int.es_ES.mp.tv.search","score":0.4292148}],"entities":[{"type":"tef.audiovisual_tvseries_title","entity":"el joven Sheldon","label":"","canon":"el joven sheldon","start_index":0,"end_index":16,"score":0.9999999999950252}]}')
    :param json_str: JSON document with an "entities" list
    :return: sorted list of (string_entity, entity_canon, entity_type) tuples;
        an empty list when the input cannot be parsed or processed
    """
    # NOTE(review): `ut` is not imported in the visible part of this file; if
    # it is missing, the resulting NameError is swallowed and [] is returned.
    try:
        recognized = json.loads(json_str)
        return sorted(
            (
                ' '.join(ut.convert_num_string(word) for word in entity['entity'].split()),
                '_'.join(entity['canon'].split()),
                entity['type'].split('.')[1],
            )
            for entity in recognized['entities']
        )
    except Exception:
        # Deliberate best-effort: malformed input yields no entities. Unlike a
        # bare `except:`, KeyboardInterrupt and SystemExit still propagate.
        return []


In [None]:

def extract_entities_from_ner(phrase, pkg, ner):
    """
    Run a NER callable on a phrase and return the detected entities.
    The words of the entity text are passed through ``num_to_words``, the
    canon has its whitespace replaced by '_', and the type keeps only its
    second dot-separated component.
    >>> from privatecog_ner import Ner
    >>> from privatecog_ner import VERSION
    >>> from pkg_resources import parse_version
    >>> from privatecog_utils import DEFAULT_CONFIG_NAME
    >>> from privatecog_utils import VERSION as UTILS_VERSION
    >>> from privatecog_lib.config.auconfig import privateConfig
    >>> from privatecog_utils.misc.notifier import Notifier
    >>> from privatecog_utils.repo.model import privateNlpModelRepository
    >>> cfg = privateConfig(DEFAULT_CONFIG_NAME)
    >>> ntf = Notifier(level='info')
    >>> nlp_repo = privateNlpModelRepository(cfg, 'es-es', 'mh', notifier=ntf)
    >>> latest_version = nlp_repo.latest()
    >>> pkg = nlp_repo.open(latest_version)
    >>> metadata = pkg.release()
    >>> ner = Ner('crf', model_data=pkg)
    >>> #ner = Ner('gazetteer', model_data=pkg)
    >>> extract_entities_from_ner(sentence, pkg, ner)
    :param phrase: input phrase
    :param pkg: not used by this function; kept for interface compatibility
    :param ner: callable; ``ner(phrase)`` must return a mapping with an
        'entities' list whose items have 'entity', 'canon' and 'type' keys
    :return: list of (string_entity, entity_canon, entity_type) tuples
    """
    detected = ner(phrase)['entities']
    return [
        (
            # num_to_words expects a list of tokens; calling it once per word
            # made it iterate characters and broke the join with a TypeError.
            ' '.join(num_to_words(entity['entity'].split())),
            '_'.join(entity['canon'].split()),
            entity['type'].split('.')[1],
        )
        for entity in detected
    ]



In [None]:
def transform_input_canon(phrase, list_entities):
    """
    Lowercase a phrase and replace each entity text by its canonical form.
    >>> transform_input_canon('quiero ver el real madrid', [('real madrid', 'real_madrid', 'ent.audiovisual_sports_team')])
    'quiero ver el real_madrid'
    :param phrase: input phrase
    :param list_entities: sequence of entities where item [0] is the text to
        look for and item [1] is its canon
    :return: lowercased phrase where every occurrence of each entity text is
        replaced by the lowercased canon with its spaces turned into '_'
    """
    result = phrase.lower()
    for entity in list_entities:
        surface = entity[0].lower()
        canon = "_".join(entity[1].lower().split(" "))
        result = result.replace(surface, canon)
    return result



In [None]:
def transform_input_type(phrase, list_entities):
    """
    Replace entity mentions in a phrase by their entity type.
    For each entity, item [1] is passed through ``ut.convert_num_string`` and
    ``ut.delete_punctuation`` and lowercased; every occurrence of the
    resulting text in the phrase is replaced by item [2].
    The phrase itself is not lowercased.
    >>> transform_input_type('quiero ver el real madrid', [('real madrid', 'real_madrid', 'ent.audiovisual_sports_team')])
    :param phrase: input phrase
    :param list_entities: sequence of entities; items [1] and [2] are read
    :return: phrase with the matched texts replaced by the entity types
    """
    result = phrase
    for entity in list_entities:
        canon_text = ut.delete_punctuation(ut.convert_num_string(entity[1])).lower()
        result = result.replace(canon_text, entity[2])
    return result


In [None]:
def set_schedule_time(hour):
    """
    Map an hour of the day to a named time slot.
    >>> set_schedule_time(14)
    'noon'
    :param hour: hour of the day as a number
    :return: "early morning" for hour >= 23 or hour < 6, "morning" for
        [6, 12), "noon" for [12, 16), "afternoon" for [16, 20) and "night"
        for [20, 23)
    """
    # The slot boundaries are half-open intervals [start, end); the night
    # wraps around midnight, so it is the only slot expressed with `or`.
    if hour >= 23 or hour < 6:
        return "early morning"
    if 6 <= hour < 12:
        return "morning"
    if 12 <= hour < 16:
        return "noon"
    if 16 <= hour < 20:
        return "afternoon"
    if 20 <= hour < 23:
        return "night"

In [None]:

def extract_topic_from_entities(obj, key):
    """
    Find the topic under which one of the given entities is listed.
    ``obj`` is walked recursively: for every dict, a key is a match when the
    entity is contained in the value stored under it; list values and list
    items are descended into.
    >>> extract_topic_from_entities(co.dict_entities_topic, 'audiovisual_film_title')
    :param obj: nested structure of dicts and lists mapping topics to entities
    :param key: entity name, or iterable of entity names, to look for
    :return: the first matching dict key (entities are tried in the given
        order, dicts in iteration order), or the string 'None' when nothing matches
    """
    # A bare string would otherwise be iterated character by character.
    entities = [key] if isinstance(key, str) else key

    def collect(node, entity, found):
        """Append to `found` every dict key in `node` whose value contains `entity`."""
        if isinstance(node, dict):
            for topic, members in node.items():
                if isinstance(members, list):
                    collect(members, entity, found)
                if entity in members:
                    found.append(topic)
        elif isinstance(node, list):
            for item in node:
                collect(item, entity, found)

    matches = []
    for entity in entities:
        collect(obj, entity, matches)
    # Taking the first match keeps the result deterministic; picking an
    # element out of a set depended on hash ordering between runs.
    return matches[0] if matches else 'None'

In [None]:
import os
import lemmatizer.constants as const


class TokenLemmatizer:
    """
    Dictionary-based lemmatizer backed by a per-language lemmas file.
    The file is read once, when the instance is created.
    """

    def __init__(self, language):
        """
        :param language: key used to look up the lemmas file name in
            ``const.LANGUAGE_FILES_MAPPING``
        """
        self.language = language
        # Word -> replacement lookup table used by lemmatize().
        self.lemmatizer = self.get_lemmas_dict()

    def get_lemmas_dict(self):
        """
        Load the lemmas file of the configured language.
        Every line must hold exactly two whitespace-separated fields; the
        second field becomes the lookup key and the first one its value.
        :return: dict mapping the second field of each line to the first
        """
        # NOTE(review): a blank or malformed line makes the unpacking below
        # raise ValueError, and an unknown language makes the mapping lookup
        # return None, which os.path.join rejects - confirm this is intended.
        lemmas_dir = const.LEMMAS_PATH
        file_name = const.LANGUAGE_FILES_MAPPING.get(self.language)
        lemmas_file = os.path.join(lemmas_dir, file_name)
        table = {}
        with open(lemmas_file) as handle:
            for entry in handle:
                first, second = entry.split()
                table[str(second)] = first
        return table

    def lemmatize(self, sentence):
        """
        Replace each whitespace-separated word by its entry in the lemmas table.
        Words without an entry are kept as they are.
        :param sentence: input sentence
        :return: list with one item per word of the sentence
        """
        lemmas = []
        for word in sentence.split():
            lemmas.append(self.lemmatizer.get(word, word))
        return lemmas
