In [13]:
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from pattern.es import parsetree
import unidecode
import re
from textblob import TextBlob
import json

from urls import u_IESS, u_RDD, u_JP, u_CONS

In [14]:
# Spanish stop-word list from NLTK; token_and_clean skips any token found in it.
palabras_funcionales = nltk.corpus.stopwords.words("spanish")    
# Spanish Snowball stemmer, used by stem_lemma.
stemmer = SnowballStemmer('spanish')

def trim_sent(sentence):
    """Collapse every run of whitespace in ``sentence`` into one space.

    Because ``str.split()`` is called without a separator, leading and
    trailing whitespace disappears as well.
    """
    words = sentence.split()
    return ' '.join(words)

def prepare_text(text):
    """Normalize ``text``: collapse whitespace with ``trim_sent`` and lowercase.

    Returns:
        The normalized string, or ``None`` when any exception is raised
        (for example when ``text`` is not a string). The exception is printed,
        not re-raised.
    """
    try:
        collapsed = trim_sent(text)
        return collapsed.lower()
    except Exception as err:
        print('Exception en prepare_text: {0}'.format(err))
        return None


def hasNumbers(string):
    """Return ``True`` when ``string`` contains at least one digit character."""
    digit_match = re.search(r'\d', string)
    return digit_match is not None


def hasBC(string):
    """Return ``True`` when ``string`` contains a forward slash (``/``)."""
    # str.find yields -1 when the character is absent, otherwise its index.
    return string.find('/') >= 0


def other_check(token):
    """Return ``True`` only when ``token`` has neither a digit nor a ``/``."""
    # Equivalent to "(no digit) and (no slash)" by De Morgan's law.
    return not (hasNumbers(token) or hasBC(token))

def remove_accent(word):
    """Return ``word`` passed through ``unidecode.unidecode``.

    This transliterates the text to ASCII, so accented letters lose their
    diacritics.
    """
    ascii_word = unidecode.unidecode(word)
    return ascii_word

def stem_lemma(word):
    """Lemmatize ``word`` with ``pattern.es`` and stem the lemma with Snowball.

    Only the first lemma of the first parsed sentence is used.
    """
    first_sentence = parsetree(word, lemmata=True)[0]
    lemma = first_sentence.lemmata[0]
    return stemmer.stem(lemma)


def token_and_clean(texto):
    """Tokenize ``texto`` (Spanish) and return its cleaned, stemmed tokens.

    For every token that is not in ``palabras_funcionales`` the token is
    lemmatized/stemmed (``stem_lemma``) and stripped of accents
    (``remove_accent``). The result is kept only when it has at least two
    characters and passes ``other_check`` (no digit, no ``/``).
    """
    cleaned = []
    for raw in nltk.word_tokenize(texto, "spanish"):
        if raw in palabras_funcionales:
            continue
        candidate = remove_accent(stem_lemma(raw))
        if len(candidate) < 2 or not other_check(candidate):
            continue
        cleaned.append(candidate)
    return cleaned


def vectorize_phrase(texto, vocab):
    """Build a binary bag-of-words vector for ``texto`` over ``vocab``.

    Args:
        texto: Text to tokenize with ``token_and_clean``.
        vocab: Sequence of vocabulary terms; position ``i`` of the result
            corresponds to ``vocab[i]``.

    Returns:
        A float ``numpy`` array of length ``len(vocab)`` with ``1`` at the
        position of every vocabulary term present in ``texto`` and ``0``
        elsewhere, or ``None`` when any exception is raised (the exception is
        printed, not re-raised).
    """
    try:
        tokens = token_and_clean(texto)

        # Build the term -> position map once instead of scanning the
        # vocabulary twice per token (`in` + `.index`). setdefault keeps the
        # first occurrence, which is what list.index would have returned.
        position = {}
        for idx, term in enumerate(vocab):
            position.setdefault(term, idx)

        vector = np.zeros(len(vocab))
        for t in tokens:
            idx = position.get(t)
            if idx is not None:
                vector[idx] = 1
        return vector

    except Exception as e:
        print('Exception en vectorize_phrase: {0}'.format(e))
        return None
    

def n_token(sentence):
    """Return how many tokens ``token_and_clean`` keeps for ``sentence``."""
    return len(token_and_clean(sentence))


def polarity_and_lang(message):
    """Return ``(polarity, spanish_text)`` for ``message`` using TextBlob.

    Note: blob has a limit on api calls.

    * Messages of two characters or fewer are not analysed: a notice is
      printed and ``(0, message)`` is returned.
    * When the detected language is ``'es'`` the polarity is taken from the
      lowercased English translation and the text returned is ``message``
      itself.
    * Otherwise the polarity is taken from the lowercased message and the text
      returned is its lowercased Spanish translation.
    * On any exception the error is printed and ``(0, None)`` is returned.
    """
    try:
        if len(message) <= 2:
            print('Se paso a polarity_and_lang un texto menor que 3 caracters')
            return (0, message)

        original = TextBlob(message)
        if original.detect_language() == 'es':
            scored = original.translate(to='en').lower()
            spanish_text = message
        else:
            scored = original.lower()
            spanish_text = scored.translate(to='es').lower().raw

        # sentiment[0] is the polarity component of the sentiment tuple.
        return (scored.sentiment[0], spanish_text)

    except Exception as err:
        print('Exception en polarity_and_lang: {0}'.format(err))
        return (0, None)


def percent_greet(sentence):
    """Return the fraction of cleaned tokens that are greeting stems.

    The tokens come from ``token_and_clean``; the result is ``0`` when no
    token survives cleaning.
    """
    greeting_stems = ['hol', 'buen', 'tard', 'dia', 'noch']
    tokens = token_and_clean(sentence)
    if len(tokens) == 0:
        return 0
    hits = sum(1 for tok in tokens if tok in greeting_stems)
    return hits / len(tokens)


def pred_prob(text):
    """Compute topic probabilities for ``text`` with the stored two-layer model.

    The vocabulary and parameters are loaded from ``Data/vocab.npy`` and
    ``Data/param_dict.npy``. The feature vector is the bag-of-words vector of
    the text followed by its token count, greeting fraction and polarity. The
    model is ``softmax(tanh(x @ W0 + b0) @ W1 + b1)``.

    Args:
        text: Message to classify.

    Returns:
        ``(prob, pol, text)`` where ``prob`` is the probability array, ``pol``
        the polarity and ``text`` the text returned by ``polarity_and_lang``.
        ``(None, None, None)`` is returned when no text is available, when the
        text cannot be vectorized, when it shares no term with the vocabulary,
        or when any exception is raised (the exception is printed).
    """
    import os  # local import: only needed to build portable data paths

    try:
        # NOTE(security): allow_pickle=True unpickles the file contents, which
        # can execute arbitrary code. Only load files from a trusted source.
        # os.path.join keeps the original path on Windows and also works on
        # POSIX systems, where the literal 'Data\\...' would not resolve.
        vocab = np.load(os.path.join('Data', 'vocab.npy'), allow_pickle=True)
        vocab = list(vocab)

        ldata = np.load(os.path.join('Data', 'param_dict.npy'), allow_pickle=True)
        param_dict = ldata.item()
        W0 = param_dict[0].T
        b0 = param_dict[1]
        W1 = param_dict[2].T
        b1 = param_dict[3]

        pol, text = polarity_and_lang(text)
        if not text:
            return (None, None, None)

        x = vectorize_phrase(text, vocab)
        # vectorize_phrase returns None on failure (it prints its own error);
        # an all-zero vector means no vocabulary term was found.
        if x is None or not x.any():
            return (None, None, None)

        x = np.append(x, n_token(text))
        x = np.append(x, percent_greet(text))
        x = np.append(x, pol)

        h0 = np.matmul(x, W0) + b0
        h1 = np.tanh(h0)
        h2 = np.matmul(h1, W1) + b1
        # Subtracting the maximum leaves the softmax mathematically unchanged
        # but prevents np.exp from overflowing to inf (and the result to NaN).
        h3 = np.exp(h2 - np.max(h2))
        prob = h3 / np.sum(h3)

        return (prob, pol, text)

    except Exception as e:
        print('Exception en pred_prob: {0}'.format(e))
        return (None, None, None)
    
    
def predict_topic(sentence):
    """Predict the topic label of ``sentence``.

    Args:
        sentence: Raw user message.

    Returns:
        ``(topic, pol, text)`` where ``topic`` is the label with the highest
        probability from ``pred_prob`` and ``pol``/``text`` are the polarity
        and text it returned. ``('No Topic', 0, None)`` is returned when the
        sentence is empty after ``prepare_text``, when ``pred_prob`` yields no
        probabilities, or when any exception is raised (the exception is
        printed).
    """
    topics = ['Jubilacion Patronal', 'Consultoria', 'Renuncia/Despido/Desahucio', 'IESS',
              'Greeting', 'Contacto', 'No Topic', 'Queja', 'Otros servicios', 'Charlas/Capacitaciones',
              'Hi Five', 'job seeker', 'Facturacion/Retencion/Cobros']
    no_topic = ('No Topic', 0, None)

    sentence = prepare_text(sentence)

    try:
        if not sentence:
            return no_topic

        prob, pol, text = pred_prob(sentence)
        # pred_prob signals "no prediction" with None. Testing `prob.all()`
        # raised on None and also rejected valid predictions in which any
        # class probability was exactly 0.0.
        if prob is None:
            return no_topic

        return (topics[np.argmax(prob)], pol, text)

    except Exception as e:
        print('Exception en predict_topic: {0}'.format(e))
        return no_topic
    
    
def get_score(q, u, td):
    """Score how well question ``q`` matches the document stored for URL ``u``.

    Args:
        q: Question text; no topic analysis is needed because it already
            belongs to a topic (previous states).
        u: Key into ``td`` identifying the document.
        td: Mapping whose entry ``td[u]`` holds a term->weight mapping at
            index 1 and a second term collection at index 2 (used for the
            header bonus).

    Returns:
        The accumulated score: for each cleaned token, its weight in
        ``td[u][1]`` is added once if present there, and once more (times the
        header factor ``k``) if the token is also present in ``td[u][2]``.
        ``0`` is returned when ``q`` cannot be prepared or is empty.
    """
    dt = td[u][1]
    dt2 = td[u][2]

    q = prepare_text(q)
    if not q:
        # prepare_text returns None on failure; nothing can be scored.
        return 0

    puntaje = 0
    k = 1  # factor for header
    for t in token_and_clean(q):
        t = stem_lemma(t)
        if t in dt:
            puntaje += dt[t]
        if t in dt2:
            # A token may appear in the header collection without a body
            # weight; dt[t] raised KeyError there and aborted the ranking.
            # NOTE(review): the bonus reuses the body weight; if td[u][2]
            # stores its own weights, dt2[t] may have been intended - confirm
            # against the code that builds texts_data.
            puntaje += dt.get(t, 0) * k

    return puntaje



def suggest_url(question, topic):
    """Suggest the URL of ``topic`` that best matches ``question``.

    The candidate list is chosen by topic (``u_JP``, ``u_RDD``, ``u_IESS``,
    or ``u_CONS`` for any other topic). Every candidate is scored with
    ``get_score`` against the data in ``Data/texts_data.txt``.

    Args:
        question: User question text.
        topic: Topic label used to choose the candidate URL list.

    Returns:
        The highest-scoring URL, or ``None`` when there are no candidates, no
        candidate scores above zero, or any exception is raised (the exception
        is printed).
    """
    import os  # local import: only needed to build a portable data path

    try:
        # The context manager closes the file even if parsing fails.
        # os.path.join keeps the original path on Windows and also works on
        # POSIX systems.
        with open(os.path.join("Data", "texts_data.txt")) as data_file:
            texts_data = json.load(data_file)

        if topic == "Jubilacion Patronal":
            ulist = u_JP
        elif topic == "Renuncia/Despido/Desahucio":
            ulist = u_RDD
        elif topic == "IESS":
            ulist = u_IESS
        else:
            ulist = u_CONS

        ranking = [get_score(question, u, texts_data) for u in ulist]

        # np.max raises on an empty sequence, so check for candidates first.
        if ranking and np.max(ranking) > 0:
            return ulist[np.argmax(ranking)]
        return None

    except Exception as e:
        print('Exception en suggest_url: {0}'.format(e))
        return None

In [15]:
# Example inputs for suggest_url: a user question in Spanish and the topic
# label used to pick the candidate URL list.
q = 'Mi esposo se jubilo por invalidez total absoluta Que tipo de indemnización debe tener? O que fondos debería retirar'
topic = 'IESS'

In [16]:
# Display the best-scoring URL for the example question within its topic.
suggest_url(q, topic)

'https://actuaria.com.ec/coberturas-y-requisitos-del-seguro-de-invalidez-vejez-y-muerte-del-iess/'