In [55]:
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from pattern.es import parsetree
import re
from collections import defaultdict
import numpy as np
import unidecode
from textblob import TextBlob
import time

In [56]:
# Load the labelled questions. The path uses "/" because Python's file APIs
# accept it on Windows too, whereas a literal "\\" only works on Windows.
data = pd.read_csv("Data/preguntas.csv", sep=',', encoding="ISO-8859-1")
data.head(3)

Unnamed: 0,Pregunta,respuesta,Tema,Pers/empresa,Polarity
0,Hola una pregunta en que perjudica el acta de ...,,Renuncia/Despido/Desahucio,persona,0
1,PARA CALCULAR EL DESAHUCIO SE DEBE TOMAR EN CU...,,Renuncia/Despido/Desahucio,persona,0
2,supuestamente Correa emitió el decreto 225 dod...,,Jubilacion Patronal,persona,0


In [57]:
# Number of questions per topic label.
data['Tema'].value_counts()

Jubilacion Patronal             129
Renuncia/Despido/Desahucio       82
Consultoria                      74
Greeting                         63
IESS                             53
Otros servicios                  44
Contacto                         42
No Topic                         31
Charlas/Capacitaciones           31
Facturacion/Retencion/Cobros     29
Hi Five                          26
Queja                            20
job seeker                       20
Name: Tema, dtype: int64

In [58]:
# Encode each topic label as an integer class id (the position in the list below).
# Series.map leaves NaN for any label that is not listed here.
data['target'] = data['Tema'].map({
    label: class_id
    for class_id, label in enumerate([
        'Jubilacion Patronal',
        'Consultoria',
        'Renuncia/Despido/Desahucio',
        'IESS',
        'Greeting',
        'Contacto',
        'No Topic',
        'Queja',
        'Otros servicios',
        'Charlas/Capacitaciones',
        'Hi Five',
        'job seeker',
        'Facturacion/Retencion/Cobros',
    ])
})

In [59]:
def hasNumbers(string):
    """Return True when ``string`` contains at least one digit character."""
    digit_match = re.search(r'\d', string)
    return digit_match is not None


def hasBC(string):
    """Return True when ``string`` contains a forward slash ('/')."""
    return '/' in string


def other_check(token):
    """Return True only for tokens that contain neither a digit nor a slash."""
    if hasNumbers(token):
        return False
    return not hasBC(token)

def remove_accent(word):
    """Return ``word`` transliterated by ``unidecode.unidecode``."""
    transliterated = unidecode.unidecode(word)
    return transliterated

# Holds the Spanish stemmer once it has been built. stem_lemma runs for every
# token, and the stemmer does not depend on the word being processed.
_STEMMER_CACHE = {}


def stem_lemma(word):
    """Lemmatize ``word`` with pattern.es and stem the lemma with Snowball.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The Spanish Snowball stem of the first lemma that ``parsetree``
        reports for ``word``.

    Any exception raised by ``parsetree`` or by indexing its result is
    propagated to the caller.
    """
    stemmer = _STEMMER_CACHE.get('spanish')
    if stemmer is None:
        # Built lazily on first use, then reused for every later token.
        stemmer = _STEMMER_CACHE['spanish'] = SnowballStemmer('spanish')
    lemma = parsetree(word, lemmata=True)[0].lemmata[0]
    return stemmer.stem(lemma)


# Filled on first use so the stop-word corpus is read once, not once per call.
_STOPWORD_CACHE = {}


def _spanish_stopwords():
    """Return NLTK's Spanish stop words as a frozenset, loading them only once."""
    if "spanish" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["spanish"] = frozenset(nltk.corpus.stopwords.words("spanish"))
    return _STOPWORD_CACHE["spanish"]


def token_and_clean(texto):
    """Tokenize ``texto`` and return the cleaned tokens as a list.

    Each token produced by ``nltk.word_tokenize`` is processed as follows:

    * a token that is not in the Spanish stop-word list is lemmatized/stemmed
      (``stem_lemma``) and transliterated (``remove_accent``);
    * a token that is in the stop-word list is left unchanged;
    * in both cases the token is kept only if it has at least two characters
      and passes ``other_check`` (no digit, no slash).

    NOTE(review): stop words are therefore kept, not removed, and the
    stop-word comparison is case-sensitive. This is the existing behaviour
    and downstream vocabularies depend on it - confirm it is intended.

    Parameters
    ----------
    texto : str
        The raw text to process.

    Returns
    -------
    list of str
        The cleaned tokens. An empty list is returned when processing fails,
        after printing the offending text, so callers always get a list.
    """
    try:
        palabras_funcionales = _spanish_stopwords()
        token_list = []
        for token in nltk.word_tokenize(texto, "spanish"):
            if token not in palabras_funcionales:
                token = remove_accent(stem_lemma(token))
            if len(token) >= 2 and other_check(token):
                token_list.append(token)
    except Exception:
        # Best effort: report the text and hand back a usable (empty) result
        # instead of an implicit None that breaks every caller.
        print('error en el texto: {0}'.format(texto))
        return []

    if not token_list:
        print('No Tokens')
    return token_list
                
    
def analyze_token(texto, tema, tokens_limpios):
    """Add the tokens of one labelled text to the running per-token counts.

    Parameters
    ----------
    texto : str
        The text to tokenize with ``token_and_clean``.
    tema : number
        Class id of the text; slot ``int(tema + 1)`` of each count vector is
        incremented.
    tokens_limpios : mapping
        Maps a token to a mutable numeric vector whose slot 0 is the total
        number of occurrences. It is updated in place.

    Returns
    -------
    mapping
        Always the same ``tokens_limpios`` object, including when an error
        occurs. Callers that rebind their accumulator to the return value
        therefore never lose it.
    """
    try:
        # Resolve the class slot once, before touching any count, so an
        # invalid label (e.g. NaN) cannot leave a half-updated entry.
        class_slot = int(tema + 1)
        for t in token_and_clean(texto) or ():
            t_list = tokens_limpios[t]
            t_list[0] += 1
            t_list[class_slot] += 1
            tokens_limpios[t] = t_list
    except Exception:
        print('error en el texto: {0}'.format(texto))
    return tokens_limpios

        
def vectorize_phrase(texto, vocab):
    """Return a binary bag-of-words vector for ``texto`` over ``vocab``.

    Parameters
    ----------
    texto : str
        The text to tokenize with ``token_and_clean``.
    vocab : sequence of str
        The vocabulary; position ``i`` of the result corresponds to
        ``vocab[i]``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab)`` with 1 at the position of every
        vocabulary token present in ``texto`` and 0 elsewhere. A text that
        yields no tokens gives an all-zero vector.
    """
    # One dictionary lookup per token instead of two linear scans of vocab.
    # setdefault keeps the first position of a repeated word, like list.index.
    position = {}
    for i, word in enumerate(vocab):
        position.setdefault(word, i)

    vector = np.zeros(len(vocab))
    for t in token_and_clean(texto) or ():
        i = position.get(t)
        if i is not None:
            vector[i] = 1
    return vector
    
    
def n_token(sentence):
    """Return how many tokens ``token_and_clean`` keeps for ``sentence``."""
    return len(token_and_clean(sentence))


def polarity(message):
    """Return the TextBlob sentiment polarity of ``message``.

    The text is translated to English when TextBlob detects another language.
    If detection or translation fails for any reason, the original text is
    scored instead. Author's note: the blob calls are subject to an API call
    limit.

    Parameters
    ----------
    message : str
        The text to score.

    Returns
    -------
    float
        The polarity component of ``TextBlob.sentiment`` for the lower-cased
        text.
    """
    blob = TextBlob(message)
    try:
        if blob.detect_language() != 'en':
            blob = blob.translate(to='en')
    except Exception:
        # Deliberate best effort: keep the untranslated text. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit stop a long-running loop.
        pass
    return blob.lower().sentiment.polarity


def percent_greet(sentence):
    """Return the fraction of cleaned tokens that are greeting stems.

    Parameters
    ----------
    sentence : str
        The text to tokenize with ``token_and_clean``.

    Returns
    -------
    float
        Number of tokens found in the greeting-stem set divided by the total
        number of tokens, or 0.0 when the sentence yields no tokens (which
        previously raised ZeroDivisionError).
    """
    tgreet = {'hol', 'buen', 'tard', 'dia', 'noch'}
    tokens = token_and_clean(sentence)
    if not tokens:
        return 0.0
    greeting_hits = sum(1 for w in tokens if w in tgreet)
    return greeting_hits / len(tokens)


In [60]:
# Per-token count vector with 14 integer slots, all starting at zero.
# analyze_token uses slot 0 for the total and slot (class id + 1) per class.
tokens_limpios = defaultdict(lambda: np.zeros(14, dtype=int))

In [61]:
# analyze_token updates tokens_limpios in place. Its return value is not
# rebound here, so a row that fails cannot replace the accumulator.
for sent, tema in zip(data.Pregunta, data.target):
    analyze_token(sent, tema, tokens_limpios)

# Reduce every count vector to [total occurrences, largest per-class share].
# NOTE: this overwrites the raw counts, so run the cell once per fresh
# tokens_limpios; running it again would reduce the already-reduced values.
for key in tokens_limpios:
    mlista = tokens_limpios[key]
    tokens_limpios[key] = np.array([mlista[0], np.max(mlista[1:] / mlista[0])])


In [62]:
# Display the reduced statistics: each token maps to [total count, largest per-class share].
tokens_limpios 

defaultdict(<function __main__.<lambda>()>,
            {'hol': array([67.        ,  0.26865672]),
             'una': array([158.        ,   0.40506329]),
             'pregunt': array([23.        ,  0.39130435]),
             'en': array([182.        ,   0.23626374]),
             'que': array([210.        ,   0.32380952]),
             'perjudic': array([3., 1.]),
             'el': array([297.        ,   0.32996633]),
             'acta': array([7., 1.]),
             'de': array([5.99000000e+02, 2.75459098e-01]),
             'finiquit': array([9.        , 0.88888889]),
             'por': array([166.        ,   0.31325301]),
             'liquid': array([15.        ,  0.86666667]),
             'par': array([14.        ,  0.28571429]),
             'calcul': array([58.        ,  0.51724138]),
             'desahuci': array([36.        ,  0.83333333]),
             'se': array([60.        ,  0.31666667]),
             'deb': array([37.        ,  0.35135135]),
             'tom': a

In [63]:
# Vocabulary thresholds used by the filter cell below:
#   counts   - minimum total occurrences of a token (compared with >=)
#   cap_pred - lower bound on the token's largest per-class share (compared with >)
counts, cap_pred = 3, 0.1

In [64]:
# Keep the tokens that occur at least `counts` times and whose largest
# per-class share is above `cap_pred`.
f_tokens = dict(
    filter(
        lambda item: item[1][0] >= counts and item[1][1] > cap_pred,
        tokens_limpios.items(),
    )
)
f_tokens

{'hol': array([67.        ,  0.26865672]),
 'una': array([158.        ,   0.40506329]),
 'pregunt': array([23.        ,  0.39130435]),
 'en': array([182.        ,   0.23626374]),
 'que': array([210.        ,   0.32380952]),
 'perjudic': array([3., 1.]),
 'el': array([297.        ,   0.32996633]),
 'acta': array([7., 1.]),
 'de': array([5.99000000e+02, 2.75459098e-01]),
 'finiquit': array([9.        , 0.88888889]),
 'por': array([166.        ,   0.31325301]),
 'liquid': array([15.        ,  0.86666667]),
 'par': array([14.        ,  0.28571429]),
 'calcul': array([58.        ,  0.51724138]),
 'desahuci': array([36.        ,  0.83333333]),
 'se': array([60.        ,  0.31666667]),
 'deb': array([37.        ,  0.35135135]),
 'tom': array([7.        , 0.42857143]),
 'cont': array([7.        , 0.42857143]),
 'valor': array([27.        ,  0.44444444]),
 'hor': array([5. , 0.6]),
 'extro': array([3., 1.]),
 'aliment': array([3., 1.]),
 'corre': array([35. ,  0.4]),
 'emit': array([4.  , 0.75]

In [65]:
# The vocabulary is the list of retained tokens; its order fixes the feature
# columns. Saved with a "/" path so the cell also runs outside Windows.
vocab = list(f_tokens)
np.save('Data/vocab.npy', vocab)

In [66]:
# One binary bag-of-words row per question. All rows are stacked in a single
# call instead of growing X with vstack inside a loop, which copied the whole
# matrix on every iteration. Iterating the column directly also avoids the
# label-based `data.Pregunta[0]` lookup and always yields a 2-D matrix.
X = np.vstack([vectorize_phrase(sent, vocab) for sent in data.Pregunta])

In [67]:
# Shape check: one row per question, one column per vocabulary token.
X.shape

(644, 465)

In [68]:
#tot_sum = np.sum(X, axis =1)
#X = np.column_stack((X, tot_sum))

In [69]:
# Extra feature: the number of cleaned tokens in each question, appended to X
# as a new last column.
nt = list(map(n_token, data.Pregunta))
X = np.column_stack([X, nt])

In [70]:
# Shape check after appending the token-count column.
X.shape

(644, 466)

In [71]:
# Extra feature: the share of greeting stems in each question, appended to X
# as a new last column.
pg = [percent_greet(pregunta) for pregunta in data['Pregunta']]
X = np.column_stack([X, pg])

In [72]:
# Shape check after appending the greeting-share column.
X.shape

(644, 467)

In [73]:
# Empty container for the polarity scores.
pol = list()

In [74]:
#counter = 0
#for sent in data.Pregunta:
#    time.sleep(0.01)
#    p = polarity(sent)
#    pol.append(p)
#    print('{0} of {1}'.format(counter+1,len(data.Pregunta)))
#    counter+=1

# Read the polarity scores from the cached file instead of running the
# commented-out scoring loop above. The path uses "/" so the cell also runs
# outside Windows.
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load a pol.npy file that you produced yourself.
pol = np.load('Data/pol.npy', allow_pickle=True)

In [75]:
#np.save('Data\\pol.npy', pol)

In [76]:
# Extra feature: the polarity score of each question, appended to X as a new
# last column.
X = np.column_stack([X, pol])

In [77]:
# Shape check after appending the polarity column.
X.shape

(644, 468)

In [78]:
# Persist the feature matrix. The path uses "/" so the cell also runs outside Windows.
np.save('Data/X.npy', X)

In [79]:
#X2 = np.load('X.npy', allow_pickle=True)


In [80]:
# Target vector: the integer class ids as a NumPy array.
y = data['target'].to_numpy()

In [81]:
# Persist the target vector. The path uses "/" so the cell also runs outside Windows.
np.save('Data/y.npy', y)