## Predict Function

In [1]:
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from pattern.es import parsetree
import unidecode
import re
from textblob import TextBlob
#from googletrans import Translator
from translate import Translator
from classifier import SentimentClassifier as sf
               

In [2]:
# Spanish stop-word list from NLTK; token_and_clean drops any token found here.
palabras_funcionales = nltk.corpus.stopwords.words("spanish")    
# Spanish Snowball stemmer shared by stem_lemma.
stemmer = SnowballStemmer('spanish')
# Earlier translator set-ups, left disabled:
#translator = Translator()
#translator = Translator("spanish", "english")
# Sentiment classifier instance; polarity_and_lang calls clf.predict(message).
clf = sf() 

def trim_sent(sentence):
    """Collapse every run of whitespace in *sentence* into one space.

    Splitting on whitespace also discards leading and trailing blanks,
    so the result has no padding on either side.
    """
    words = sentence.split()
    return ' '.join(words)

def prepare_text(text):
    """Return *text* with whitespace normalised (trim_sent) and lower-cased.

    Any exception raised while cleaning (for example when *text* is not a
    string) is printed and None is returned instead.
    """
    try:
        return trim_sent(text).lower()
    except Exception as err:
        print('Exception en prepare_text: {0}'.format(err))
    return None


def hasNumbers(string):
    """Return True when *string* contains at least one digit character."""
    digit_match = re.search(r'\d', string)
    return digit_match is not None


def hasBC(string):
    """Return True when *string* contains a forward slash ('/')."""
    # str.find yields -1 when the character is absent, an index >= 0 otherwise.
    return string.find('/') >= 0


def other_check(token):
    """Return True only when *token* has neither a digit nor a '/'."""
    # Both checks are always evaluated, then combined.
    rejections = (hasNumbers(token), hasBC(token))
    return not any(rejections)

def remove_accent(word):
    """Return *word* transliterated by unidecode.unidecode."""
    transliterated = unidecode.unidecode(word)
    return transliterated

def stem_lemma(word):
    """Lemmatise *word*, then stem the lemma.

    The lemma is the first entry of ``lemmata`` on the first element that
    pattern.es ``parsetree`` returns for *word*; that lemma is then passed
    through the module-level Spanish Snowball ``stemmer``.
    """
    parsed = parsetree(word, lemmata=True)
    first_element = parsed[0]
    lemma = first_element.lemmata[0]
    return stemmer.stem(lemma)


def token_and_clean(texto):
    """Tokenise *texto* and return the list of normalised tokens.

    Each token produced by ``nltk.word_tokenize(texto, "spanish")`` is:
      1. skipped if it appears in ``palabras_funcionales``;
      2. lemmatised/stemmed with stem_lemma and stripped of accents with
         remove_accent;
      3. kept only if the result has at least 2 characters and passes
         other_check (no digits, no '/').
    """
    cleaned = []
    for raw in nltk.word_tokenize(texto, "spanish"):
        if raw in palabras_funcionales:
            continue
        candidate = remove_accent(stem_lemma(raw))
        if len(candidate) < 2 or not other_check(candidate):
            continue
        cleaned.append(candidate)
    return cleaned


def vectorize_phrase(texto, vocab):
    """Build a binary bag-of-words vector for *texto* over *vocab*.

    Returns a float array of length ``len(vocab)`` holding 1 at the position
    (``vocab.index``) of every cleaned token that occurs in *vocab* and 0
    elsewhere. If anything raises, the error is printed and None is returned.
    """
    try:
        terms = token_and_clean(texto)
        bag = np.zeros(len(vocab))
        for term in terms:
            if term not in vocab:
                continue
            position = vocab.index(term)
            bag[position] = 1
        return bag
    except Exception as err:
        print('Exception en vectorize_phrase: {0}'.format(err))
        return None
    

def n_token(sentence):
    """Return how many tokens token_and_clean keeps for *sentence*."""
    return len(token_and_clean(sentence))


def polarity_and_lang(message):
    """Return ``(polarity, lower-cased text)`` for *message*.

    The message is always treated as Spanish: the TextBlob-based language
    detection and translation this function once used are disabled (the
    original note says blob has a limit on API calls), so polarity comes
    straight from the module-level classifier ``clf``.

    Outcomes:
      * more than 2 characters -> ``(clf.predict(message), message.lower())``
      * 2 characters or fewer  -> a notice is printed and
        ``(0, message.lower())`` is returned
      * any exception          -> the error is printed and ``(0, None)`` is
        returned
    """
    try:
        if len(message) <= 2:
            print('Se paso a polarity_and_lang un texto menor que 3 caracters')
            return (0, message.lower())
        # The tuple is evaluated left to right: predict first, then lower().
        return (clf.predict(message), message.lower())
    except Exception as err:
        print('Exception en polarity_and_lang: {0}'.format(err))
        return (0, None)


def percent_greet(sentence):
    """Return the fraction of cleaned tokens that are greeting stems.

    Tokens come from token_and_clean; a sentence that yields no tokens
    gives 0.
    """
    tgreet = ['hol', 'buen', 'tard', 'dia', 'noch']
    tokens = token_and_clean(sentence)
    if not tokens:
        return 0
    greeting_hits = sum(1 for w in tokens if w in tgreet)
    return greeting_hits / len(tokens)
    
def ex_capac(sentence):
    """Return 1 if any cleaned token is a training/course stem, else 0."""
    tcap = ['charl', 'curs', 'capacit', 'seminari', 'formacion', 'capacitacion']
    mentions_training = any(w in tcap for w in token_and_clean(sentence))
    return int(mentions_training)


def pred_prob(text):
    """Return ``(probabilities, polarity, text)`` for a prepared message.

    The vocabulary and network parameters are loaded from ``Data/vocab.npy``
    and ``Data/param_dict.npy`` (relative to the working directory) on every
    call. ``param_dict`` is indexed with keys 0..3: 0 and 2 are weight
    matrices (used transposed), 1 and 3 are the matching biases.

    The input vector is the binary bag-of-words from vectorize_phrase with
    four extra features appended in this order: n_token, percent_greet,
    ex_capac and the polarity from polarity_and_lang. It goes through one
    tanh hidden layer followed by a softmax.

    Returns:
      * ``(prob, pol, text)`` with the softmax output when at least one
        vocabulary word is present;
      * ``(np.zeros(13), pol, text)`` when no vocabulary word matched;
      * ``(np.zeros(13), pol, None)`` when polarity_and_lang gave no text;
      * ``(np.zeros(13), None, None)`` after printing the error if anything
        raised (including missing data files).
    """
    # Local import so this block stays self-contained.
    import os

    try:
        # os.path.join keeps 'Data\\...' on Windows and makes the paths work
        # on POSIX systems too.
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
        # load these files from a trusted location.
        vocab = list(np.load(os.path.join('Data', 'vocab.npy'),
                             allow_pickle=True))

        ldata = np.load(os.path.join('Data', 'param_dict.npy'),
                        allow_pickle=True)
        param_dict = ldata.item()
        W0 = param_dict[0].T
        b0 = param_dict[1]
        W1 = param_dict[2].T
        b1 = param_dict[3]

        pol, text = polarity_and_lang(text)

        if not text:
            return (np.zeros(13), pol, None)

        x = vectorize_phrase(text, vocab)
        if not x.any():
            # No vocabulary word present: report "no prediction".
            return (np.zeros(13), pol, text)

        # Append the hand-crafted features in the order the model expects.
        for feature in (n_token(text), percent_greet(text),
                        ex_capac(text), pol):
            x = np.append(x, feature)

        h0 = np.matmul(x, W0) + b0
        h1 = np.tanh(h0)
        h2 = np.matmul(h1, W1) + b1
        # Subtracting the maximum logit leaves the softmax mathematically
        # unchanged but prevents exp() from overflowing to inf (-> NaN).
        h3 = np.exp(h2 - np.max(h2))
        prob = h3 / np.sum(h3)

        return (prob, pol, text)

    except Exception as e:
        print('Exception en pred_prob: {0}'.format(e))
        return (np.zeros(13), None, None)
    
    
def predict_topic(sentence):
    """Return ``(topic, polarity, text)`` for a raw customer message.

    *sentence* is cleaned with prepare_text and scored with pred_prob; the
    topic is the label at the arg-max of the probability vector.

    ``('No Topic', 0, text)`` is returned when pred_prob yields its all-zero
    "no prediction" vector, and ``('No Topic', 0, None)`` when the cleaned
    sentence is empty/None or an exception is raised (the error is printed).
    """
    topics = ['Jubilacion Patronal', 'Consultoria', 'Renuncia/Despido/Desahucio', 'IESS', 
                 'Greeting', 'Contacto', 'No Topic', 'Queja', 'Otros servicios', 'Charlas/Capacitaciones', 
                      'Hi Five', 'job seeker', 'Facturacion/Retencion/Cobros']

    sentence = prepare_text(sentence)

    try:
        if not sentence:
            return ('No Topic', 0, None)

        prob, pol, text = pred_prob(sentence)
        # pred_prob signals "no prediction" with an all-zero vector, so test
        # for ANY non-zero entry. Requiring ALL entries to be non-zero would
        # discard a valid softmax output whenever one class probability
        # underflows to exactly 0.0.
        if prob.any():
            return (topics[np.argmax(prob)], pol, text)
        return ('No Topic', 0, text)

    except Exception as e:
        print('Exception en predict_topic: {0}'.format(e))
        return ('No Topic', 0, None)
    

In [8]:
# Sample customer message used to exercise the model.
text = ('estoy llamando a sus numeros en guayaquil pero no me responden con quien puedo conversar')
# Print the topic labels (same list and order that predict_topic indexes with
# the arg-max) so the probabilities printed below can be read against them.
print(['Jubilacion Patronal', 'Consultoria', 'Renuncia/Despido/Desahucio', 'IESS', 
                 'Greeting', 'Contacto', 'No Topic', 'Queja', 'Otros servicios', 'Charlas/Capacitaciones', 
                      'Hi Five', 'job seeker', 'Facturacion/Retencion/Cobros'])
print('\n')
# Show the raw (probabilities, polarity, text) tuple for the sample message.
print(pred_prob(text))

['Jubilacion Patronal', 'Consultoria', 'Renuncia/Despido/Desahucio', 'IESS', 'Greeting', 'Contacto', 'No Topic', 'Queja', 'Otros servicios', 'Charlas/Capacitaciones', 'Hi Five', 'job seeker', 'Facturacion/Retencion/Cobros']


(array([1.06052497e-03, 1.11177672e-03, 1.83658504e-04, 3.65919019e-03,
       6.34876149e-05, 9.80936761e-01, 9.25394943e-03, 3.25372738e-04,
       3.80385562e-04, 5.00611990e-05, 1.25495621e-04, 2.48210813e-03,
       3.67228362e-04]), 0.06501434725554706, 'estoy llamando a sus numeros en guayaquil pero no me responden con quien puedo conversar')


In [6]:
# Sample customer message passed to predict_topic in the next cell.
text = 'estoy llamando a sus numeros en guayaquil pero no me responden con quien puedo conversar'


In [7]:
# Evaluate the sample message; the notebook echoes the (topic, polarity, text) tuple.
predict_topic(text)

('Contacto',
 0.06501434725554706,
 'estoy llamando a sus numeros en guayaquil pero no me responden con quien puedo conversar')