In [None]:
import urllib.request as urllib
from bs4 import BeautifulSoup
import os
from collections import defaultdict
import nltk
import re
import math
import json
from pattern.es import parsetree
from nltk.stem import SnowballStemmer

In [None]:
def get_text(url):
    """Download a page and return the article portion of its visible text.

    The page is fetched, ``<script>`` and ``<style>`` elements are removed,
    and the remaining text is normalised to one non-empty, stripped chunk
    per line. The result is then trimmed to the span that starts at the
    marker 'Publicado por:' and stops just before the marker made of
    'Compartir', a newline and 'Author'.

    Args:
        url: address of the page to download.

    Returns:
        The trimmed text. When the start marker is missing the text is kept
        from its beginning; when the end marker is missing the text is kept
        through its end.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.urlopen(url) as response:
        html = response.read()

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers are installed on the machine running the notebook.
    soup = BeautifulSoup(html, "html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # the beginning and ending have common expressions
    begin = text.find('Publicado por:')
    if begin == -1:
        # str.find signals "not found" with -1; used as a slice start that
        # would keep only the tail of the text, so fall back to the start.
        begin = 0

    # Only look for the end marker after the start of the article.
    end = text.find('Compartir\nAuthor', begin)
    if end == -1:
        # A -1 slice end would silently drop the last character.
        end = len(text)

    return text[begin:end]


def hasNumbers(string):
    """Return True when ``string`` contains at least one digit, else False."""
    digit_match = re.search(r'\d', string)
    return digit_match is not None


def hasBC(string):
    """Return True when ``string`` contains a forward slash ('/'), else False."""
    return '/' in string


def other_check(token):
    """Return True when ``token`` has neither a digit nor a forward slash.

    Combines the ``hasNumbers`` and ``hasBC`` checks: a token passes only
    when both of them report False.
    """
    if hasNumbers(token):
        return False
    return not hasBC(token)


def token_and_clean(texto):
    """Tokenize Spanish text and return its distinct, filtered tokens.

    The text is split with ``nltk.word_tokenize`` using the Spanish model.
    A token is kept when it is longer than two characters, is not in NLTK's
    Spanish stop-word list (exact string match), and passes ``other_check``.

    Args:
        texto: the text to tokenize.

    Returns:
        A list with each kept token once, in order of first appearance.
    """
    # Sets give constant-time membership tests; the original list lookups
    # made de-duplication quadratic in the number of tokens.
    palabras_funcionales = set(nltk.corpus.stopwords.words("spanish"))
    vistos = set()
    tokens_limpios = []
    for token in nltk.word_tokenize(texto, "spanish"):
        # >2 helps in filtering out common short tokens
        if len(token) <= 2 or token in vistos or token in palabras_funcionales:
            continue
        if other_check(token):
            vistos.add(token)
            tokens_limpios.append(token)
    return tokens_limpios


def stem_lemma(word):
    """Lemmatize ``word`` and then stem the lemma with a Spanish stemmer.

    ``parsetree`` is called with lemmata enabled and the first lemma of the
    parse's element 0 is taken (presumably the first sentence -- confirm
    against the pattern.es documentation). That lemma is then passed through
    NLTK's Spanish ``SnowballStemmer`` and the stem is returned.
    """
    first_unit = parsetree(word, lemmata=True)[0]
    lemma = first_unit.lemmata[0]
    return SnowballStemmer('spanish').stem(lemma)

In [None]:
# Working directory for the rest of the notebook. The default is the original
# author's machine-specific path; set the CHATBOT_SERVER_DIR environment
# variable to run the notebook from another location without editing it.
PROJECT_DIR = os.environ.get(
    'CHATBOT_SERVER_DIR',
    'C:\\Users\\rober\\Desktop\\chatbot\\chatbot_server',
)
os.chdir(PROJECT_DIR)

# Kept below the chdir on purpose: presumably ``urls`` is a project module
# that is resolved from the working directory -- confirm before moving it.
from urls import u_IESS, u_RDD, u_JP, u_CONS
from urls import h_IESS, h_RDD, h_JP, h_CONS


# ``topics`` holds one collection of URLs per topic and ``headers`` the
# matching collection of header strings; both are walked in parallel below.
topics = [u_IESS, u_RDD, u_JP, u_CONS]
headers = [h_IESS, h_RDD, h_JP, h_CONS]

# url -> [original_text, cleaned_tokens, header_tokens]
texts_data = defaultdict(lambda: [None, None, None])

for topic_urls, topic_headers in zip(topics, headers):
    for url, header in zip(topic_urls, topic_headers):
        # Everything is lowercased before tokenizing.
        page_text = get_text(url).lower()
        texts_data[url] = [
            page_text,
            token_and_clean(page_text),
            token_and_clean(header.lower()),
        ]


# TF-IDF
# w_ij = tf_ij * log(N / df_i)
# w_ij  : weight of word i in document j
# tf_ij : number of occurrences of word i in document j
# N     : total number of documents
# df_i  : number of documents containing word i

# Count the documents that were actually collected. Summing the lengths of the
# topic lists over-counts when a URL is repeated or when a header list is
# shorter than its URL list (zip stops at the shorter one).
N = len(texts_data)

# Words found in at least this share of the documents are left out.
MAX_DF_RATIO = 0.8

# token -> document frequency, so the corpus is scanned once per distinct token
# instead of once per token per document.
df_cache = {}

for u in texts_data.keys():
    text = texts_data[u][0]
    tokens = texts_data[u][1]
    weights = {}
    for i in tokens:
        # NOTE: str.count and ``in`` match substrings of the raw text, not
        # whole tokens, so a short token is also counted inside longer words.
        tf_ij = text.count(i)

        if i not in df_cache:
            df_cache[i] = sum(1 for doc in texts_data.values() if i in doc[0])
        df_i = df_cache[i]

        if df_i == 0:
            # The token is not a literal substring of any text (possible if
            # the tokenizer rewrote it); skip it rather than divide by zero.
            continue

        if df_i / N < MAX_DF_RATIO:
            # Tokens that share a stem share a key; the last one processed wins.
            weights[stem_lemma(i)] = tf_ij * math.log(N / df_i)

    # Replace the token list with the stem -> weight mapping, and the header
    # tokens with their stems.
    texts_data[u][1] = weights
    texts_data[u][2] = [stem_lemma(x) for x in texts_data[u][2]]


# Convert to a plain dict and persist it; the context manager closes (and
# flushes) the file even if serialisation fails.
td = dict(texts_data)
with open("texts_data.txt", 'w') as out_file:
    json.dump(td, out_file)
