In [1]:
import urllib.request as urllib
from bs4 import BeautifulSoup
import os
from collections import defaultdict
import nltk
import re
import math
import json
from pattern.es import parsetree
from nltk.stem import SnowballStemmer
import numpy as np

from urls import u_IESS, u_RDD, u_JP, u_CONS, u_OS
from urls import h_IESS, h_RDD, h_JP, h_CONS, h_OS 

In [22]:
def get_text(url, begin_t, end_t):
    try:
        html = urllib.urlopen(url).read()
        soup = BeautifulSoup(html)

        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()    # rip it out

        # get text
        text = soup.get_text()

        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)

        begin = text.find(begin_t)   #the beginning and ending have common expressions
        end = text.find(end_t) + len(end_t)

        proc_text = text[begin:end]
        return proc_text
    
    except Exception as e:
        print('Exception en get_text, url:{0} y codigo: {1}'.format(url, e))
        return None     


def hasNumbers(string):
    return bool(re.search(r'\d', string))


def hasBC(string):
    i = string.find('/')
    return bool(i != -1)


def other_check(token):    
    b1 = not hasNumbers(token)
    b2 = not hasBC(token)
    return (b1 and b2)


def token_and_clean(texto):
    try:
        palabras_funcionales = nltk.corpus.stopwords.words("spanish")    
        tokens = nltk.word_tokenize(texto, "spanish")
        tokens_limpios=[]
        for token in tokens:        
            if token not in palabras_funcionales:
                if len(token) > 2 and token not in tokens_limpios:  #>2 helps in filtering out common
                    if other_check(token):
                        tokens_limpios.append(token)
        return tokens_limpios
    except Exception as e:
        print('Exception en token and clean: texto: {0} y codigo {1}'.format(texto, e))
        return None    


def stem_lemma(word):
    stemmer = SnowballStemmer('spanish')
    word = parsetree(word, lemmata=True)[0].lemmata[0]
    word = stemmer.stem(word) 
    return word



def get_score(q, u, td):
    try:
        dt = td[u][1]
        dt2 = td[u][2]    
        q_tokens =  token_and_clean(q.lower())
        puntaje = 0
        k = 10  #factor for header
        for t in q_tokens:  
            t = stem_lemma(t)
            if t in dt:
                puntaje += dt[t]
            if (t in dt2) and (t in dt):
                puntaje += dt[t] * k
        
        return puntaje/len(q_tokens)  
    except Exception as e:
        print('Exception en get_score: q: {0}, u: {1} y codigo: {2}'.format(q, u,  e))
        return None      



def suggest_url(question, topic):    
    
    try:    
        texts_data = json.load(open("Data\\texts_data.txt"))        
        ranking = []
        
        
        if topic == "Jubilacion Patronal":
            ulist = u_JP            
        elif topic == "Renuncia/Despido/Desahucio":
            ulist = u_RDD 
        elif topic == "IESS":
            ulist = u_IESS            
        elif topic == "Consultoria":
            ulist = u_CONS
        else: #otros servicios
            ulist = u_OS
        
        for u in ulist:
            score = get_score(question, u, texts_data)
            ranking.append(score)
            
        if topic == "Consultoria":
            np.max(ranking) >= 4:
            url_res = ulist[np.argmax(ranking)]            
            return url_res        
        elif np.max(ranking) > 0:
            url_res = ulist[np.argmax(ranking)]            
            return url_res
        else:
            return None           
        
    except Exception as e:
        print('Exception en suggest_url: {0}'.format(e))
        return None      
    
    




In [23]:
from urls import u_IESS, u_RDD, u_JP, u_CONS, u_OS
from urls import h_IESS, h_RDD, h_JP, h_CONS, h_OS 


topics = [u_IESS, u_RDD, u_JP, u_CONS, u_OS]
headers = [h_IESS, h_RDD, h_JP, h_CONS, h_OS]
texts_data =  defaultdict(lambda: [None, None, None])  #[original_text, depurated_tokens,  header_tokens]

for t,lh in zip(topics,headers):
    for u,h in zip(t,lh):
        text = get_text(u, 'Publicado por:', 'Compartir\nAuthor').lower()
        texts_data[u][0] = text
        texts_data[u][1] = token_and_clean(text)
        texts_data[u][2] = token_and_clean(h.lower())
        
#correct for others
u = 'https://actuaria.com.ec/servicio/mercado-asegurador-y-medicina-prepagada/'
text = get_text(u, 'Mercado', 'riesgo.').lower()
texts_data[u][0] = text
texts_data[u][1] = token_and_clean(text)
texts_data[u][2] = token_and_clean(h.lower())

u = 'https://actuaria.com.ec/servicio/data-analytics/'
text = get_text(u, 'Data', 'estadísticos/predictivos.').lower()
texts_data[u][0] = text
texts_data[u][1] = token_and_clean(text)
texts_data[u][2] = token_and_clean(h.lower())

u = 'https://actuaria.com.ec/servicio/asesoria-actuarial-empresarial/'
text = get_text(u, 'Asesoría', 'flujos').lower()
texts_data[u][0] = text
texts_data[u][1] = token_and_clean(text)
texts_data[u][2] = token_and_clean(h.lower())

u = 'https://actuaria.com.ec/servicio/fondos-de-jubilacion-y-cesantia/'
text = get_text(u, 'Fondos', 'parámetros').lower()
texts_data[u][0] = text
texts_data[u][1] = token_and_clean(text)
texts_data[u][2] = token_and_clean(h.lower())

u = 'https://actuaria.com.ec/servicio/modelos-estadisticos-matematicos/'
text = get_text(u, 'Modelos', 'otros)').lower()
texts_data[u][0] = text
texts_data[u][1] = token_and_clean(text)
texts_data[u][2] = token_and_clean(h.lower())

u = 'https://actuaria.com.ec/servicio/gestion-humana/'
text = get_text(u, 'Gestión', 'otros.').lower()
texts_data[u][0] = text
texts_data[u][1] = token_and_clean(text)
texts_data[u][2] = token_and_clean(h.lower())

u = 'https://actuaria.com.ec/servicio/asesoria-financiera/'
text = get_text(u, 'Asesoría', 'otros.').lower()
texts_data[u][0] = text
texts_data[u][1] = token_and_clean(text)
texts_data[u][2] = token_and_clean(h.lower())





#TF - IDF 
#w_ij = tf_ij * log (N/df_i) 
#w_ij : weight for word i in document j
#tf_ij : number of occurrences of word i in document j
#N : total number of corpuses
# df_i : number of corpuses containing word i
        


N = len(topics[0]) + len(topics[1]) + len(topics[2]) + len(topics[3])   

for u in texts_data.keys():
    text = texts_data[u][0]
    tokens = texts_data[u][1]
    weights = {}    
    for i in tokens:
        tf_ij = text.count(i)
        df_i = 0
        for u2 in texts_data.keys():
            if i in texts_data[u2][0]:
                df_i += 1 
                
        if df_i/N < 0.8:
            weights[stem_lemma(i)] = tf_ij * math.log(N/df_i)       
        
    texts_data[u][1] = weights
    texts_data[u][2] = [stem_lemma(x) for x in texts_data[u][2]]
    
#u = 'https://actuaria.com.ec/que-es-la-bonificacion-por-desahucio/'    
#texts_data[u][1]     
    
td = dict(texts_data)
json.dump(td, open("Data\\texts_data.txt",'w'))


In [43]:
q = 'Buenas tardes, como calculo el valor que debo pagar de jubilacion patronal'
suggest_url(q, "Jubilacion Patronal")

27.98308036250376


'https://actuaria.com.ec/aspectos-importantes-en-el-calculo-de-finiquito-de-jubilacion-patronal/'

In [None]:
Mi interés está relacionado con consultas sobre: Desarrollo de Productos de Seguro Diseño y acompañamiento en la aprobación de nuevos productos y planes de seguros