In [1]:
import re
import csv
from urllib import parse as urlparse
from urllib import request as urlrequest
from reppy.cache import RobotsCache
import urllib
import time
from datetime import datetime

class Throttle:
    """Throttle downloading by sleeping between requests to the same domain.

    Keeps, per network location (the ``netloc`` part of a URL), the time of
    the most recent call to :meth:`wait` and sleeps long enough that two
    consecutive calls for the same domain are at least ``delay`` seconds apart.
    """
    def __init__(self, delay):
        """Create a throttle.

        Args:
            delay: minimum number of seconds between two downloads from the
                same domain. A value <= 0 disables the waiting.
        """
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp (datetime) of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of ``url`` was accessed less than ``delay`` seconds ago.

        Args:
            url: the URL that is about to be downloaded.
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() is the full elapsed time as a float.
            # timedelta.seconds would only be the "seconds" component: it
            # drops fractional seconds and ignores the days part.
            elapsed = (datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        # record the access time after any sleeping has finished
        self.domains[domain] = datetime.now()


def download(url, headers, num_retries, data=None):
    """Download ``url`` and return its body as text.

    Args:
        url: address to fetch.
        headers: dict of HTTP headers sent with the request.
        num_retries: how many more times to retry when the server answers
            with a 5XX status code.
        data: optional request body passed to ``urllib.request.Request``.

    Returns:
        The decoded response body, or ``''`` when the download failed.
        The body is decoded with the charset declared in the response
        headers (UTF-8 when none is declared); undecodable bytes are
        replaced instead of raising ``UnicodeDecodeError``.
    """
    print ('Downloading:', url)
    request = urlrequest.Request(url, data, headers)
    try:
        # the context manager closes the connection once the body is read
        with urlrequest.urlopen(request) as response:
            raw = response.read()
            charset = response.headers.get_content_charset() or "utf8"
    except urllib.error.URLError as e:
        print ('Download error:', e.reason)
        # only HTTPError (a URLError subclass) carries a status code
        code = getattr(e, 'code', None)
        if num_retries > 0 and code is not None and 500 <= code < 600:
            # retry 5XX HTTP errors
            return download(url, headers, num_retries-1, data)
        return ''
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # the server declared a charset Python does not know
        return raw.decode("utf8", errors="replace")

def setRobot(url): 
    """Build and return a new ``RobotsCache`` with ``capacity=100``.

    ``url`` is accepted but not used by this function.
    """
    return RobotsCache(capacity=100)

In [2]:
## Download parameters
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
headers = {'User-agent': user_agent}
sitemap = "https://www.fitnessrevolucionario.com/post-sitemap.xml"
rp = setRobot("https://www.fitnessrevolucionario.com/robots.txt")

## Download the sitemap and collect every page of interest in the "pages" list
html = download(sitemap, headers, num_retries=3)
links = re.findall('<loc>(.*?)</loc>', html)
pages = []
for link in links:
    # links containing "episodio" are left out
    if re.search('episodio', link):
        continue
    # keep only links whose path contains a /YYYY/MM/DD/ date
    if re.search('/[0-9][0-9][0-9][0-9]/[0-9][0-9]/[0-9][0-9]/', link):
        pages.append(link)

Downloading: https://www.fitnessrevolucionario.com/post-sitemap.xml


In [3]:
from bs4 import BeautifulSoup as bs

# A "delayer" so the website is not overloaded.
throttle = Throttle(2)

# All references within the same domain are stored, i.e. the links between
# Fitness Revolucionario articles. The idea is to analyse the relations
# between articles.
referencies = []

# Substrings replaced by a space in every comment before it is stored.
bad = ("\n","\r",",",".","\t","?","!",'"',"“","(",")","¿",":",";","…","”","–","/","https","-","http","www","*")

# The goal is also to build a topic cloud and analyse the readers' sentiment
# from the comments. For every page we store the link, the date, the page
# title, the number of comments and the (anonymous) comments of the article.
with open('fitness_revo_full.csv', 'w', newline='',encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    fields = ('url', 'date', 'title', 'numComment', 'fullComments')
    writer.writerow(fields)

    for page in pages:
        # Only fetch the page if robots.txt allows it
        if rp.allowed(page,user_agent):
            # Wait (if needed) and download
            throttle.wait(page)
            try:
                # Download and parse with BeautifulSoup
                # NOTE(review): no parser is named, so bs4 picks whichever is installed.
                page_html = download(page, headers, num_retries=3)
                soup = bs(page_html)

                # Article title: the last 25 characters of <title> are dropped
                # (presumably a site-name suffix; confirm against the live page)
                title = soup.title.getText()[:-25]

                # Number of comments, taken from the heading of the comments wrapper
                comments = soup.find(id="comments_wrapper").h4.getText()
                nC = "0"
                if comments != "No Comments":
                    # drop the last 9 characters of the heading, keeping the count
                    nC = comments[:-9]

                # Comments of the article, lower-cased and with the "bad"
                # substrings blanked out
                com = soup.find_all("div",attrs={"class":"comment-text"})
                fullComments = []
                for c in com:
                    t = c.p.getText().lower()
                    for l in bad:
                        t = t.replace(l, " ")
                    fullComments.append(t)
                # "*" was blanked out above, so it is safe as a separator
                write_comments = "*".join(fullComments)

                # Date: characters 38-47 of the URL, i.e. the YYYY/MM/DD that
                # follows "https://www.fitnessrevolucionario.com/"
                date = page[38:48] 

                # Links from this article to other dated articles of the site
                ref = []
                text = soup.find("div",attrs={"class":"post-entry"})
                for a in text.find_all('a', href=True):
                    if re.search('https://www.fitnessrevolucionario.com/[0-9][0-9][0-9][0-9]/[0-9][0-9]/[0-9][0-9]/', a['href']):
                        if not re.search('episodio', a['href']):
                            ref.append(a['href'])
                # Write the collected data to the .csv file
                writer.writerow((page,date,title,nC,write_comments))
                referencies.append(ref)
            except Exception as err:
                # Best effort: a page that cannot be parsed is skipped, but the
                # reason is reported. Catching Exception (not a bare except)
                # lets KeyboardInterrupt stop the crawl.
                print ('Skipping page:', page, '-', repr(err))

Downloading: https://www.fitnessrevolucionario.com/2015/08/21/las-bestias-del-exito-01-marcos-vazquez-fitness-revolucionario-y-mario-luna/
Downloading: https://www.fitnessrevolucionario.com/2015/09/03/segunda-parte-entrevista-de-marcio-luna-a-marcos-vazquez-sobre-salud/
Downloading: https://www.fitnessrevolucionario.com/2016/04/02/cambia-tu-identidad-para-mejorar-tu-cuerpo/
Downloading: https://www.fitnessrevolucionario.com/2011/08/07/la-conspiracion-de-la-industria-del-fitness/
Downloading: https://www.fitnessrevolucionario.com/2011/09/03/la-conspiracion-de-la-industria-alimenticia-y-farmaceutica/
Downloading: https://www.fitnessrevolucionario.com/2011/08/13/el-secreto-para-bajar-de-peso/
Downloading: https://www.fitnessrevolucionario.com/2011/08/20/como-disenar-un-programa-de-entrenamiento-parte-2/
Downloading: https://www.fitnessrevolucionario.com/2011/09/04/mujeres-revolucionarias/
Downloading: https://www.fitnessrevolucionario.com/2011/09/10/deshazte-de-tus-zapatillas-de-correr/
D

Downloading: https://www.fitnessrevolucionario.com/2013/06/17/que-es-realmente-el-hambre-armas-para-destruir-los-antojos/
Downloading: https://www.fitnessrevolucionario.com/2013/06/09/levantate-y-repara-tu-cuerpo/
Downloading: https://www.fitnessrevolucionario.com/2013/05/19/aumenta-tus-niveles-de-testosterona-de-manera-natural-parte-ii/
Downloading: https://www.fitnessrevolucionario.com/2013/05/12/aumenta-tus-niveles-de-testosterona-de-manera-natural-parte-i/
Downloading: https://www.fitnessrevolucionario.com/2013/04/13/los-consensos-medicos-vs-la-ciencia-y-los-nuevos-medicos/
Downloading: https://www.fitnessrevolucionario.com/2013/03/26/la-clave-para-lograr-resultados/
Downloading: https://www.fitnessrevolucionario.com/2013/03/03/hoy-toca-patatas-y-arroz-o-cuando-debo-aumentar-mis-carbohidratos/
Downloading: https://www.fitnessrevolucionario.com/2013/02/26/soy-manada-reflexiones-para-ser-mas-felices-y-sanos-en-el-mundo-moderno/
Downloading: https://www.fitnessrevolucionario.com/2013/

Downloading: https://www.fitnessrevolucionario.com/2017/04/01/mejorar-equilibrio/
Downloading: https://www.fitnessrevolucionario.com/2017/04/05/resistencia-a-la-insulina-inflamacion-y-medicina-evolutiva-begona-ruiz/
Downloading: https://www.fitnessrevolucionario.com/2012/05/10/salta-cuerda-y-pierde-grasa/
Downloading: https://www.fitnessrevolucionario.com/2011/08/13/que-significa-fitness/
Downloading: https://www.fitnessrevolucionario.com/2017/04/11/sin-azucar-antonio-estrada/
Downloading: https://www.fitnessrevolucionario.com/2016/08/13/beneficios-de-la-sauna-rendimiento-musculo-salud-y-longevidad/
Downloading: https://www.fitnessrevolucionario.com/2015/01/18/claves-para-desarrollar-y-marcar-abdominales-parte-i/
Downloading: https://www.fitnessrevolucionario.com/2016/10/08/hipoxia-altura-oclusion-mascaras-elevacion/
Downloading: https://www.fitnessrevolucionario.com/2012/01/09/el-metodo-tabata-o-como-perder-grasa-en-4-minutos/
Downloading: https://www.fitnessrevolucionario.com/2011/09

Downloading: https://www.fitnessrevolucionario.com/2014/11/17/especial-mujeres-en-busca-de-la-regla-perdida/
Downloading: https://www.fitnessrevolucionario.com/2014/04/26/todo-sobre-la-proteina-evitando-sus-riesgos-parte-ii/
Downloading: https://www.fitnessrevolucionario.com/2016/05/21/alimentos-fermentados/
Downloading: https://www.fitnessrevolucionario.com/2017/08/19/frutas/
Downloading: https://www.fitnessrevolucionario.com/2017/11/08/abel-novoa-riesgos-medicina/
Downloading: https://www.fitnessrevolucionario.com/2015/07/05/pies-libres-mejora-tu-postura-salud-e-inteligencia/
Downloading: https://www.fitnessrevolucionario.com/2013/09/14/quieres-acelerar-tu-metabolismo-y-quemar-mas-grasa-activa-tu-grasa-parda/
Downloading: https://www.fitnessrevolucionario.com/2013/02/17/cetosis-y-desempeno-deportivo-o-impacto-de-las-dietas-bajas-en-carbohidrato-en-tu-rendimiento/
Downloading: https://www.fitnessrevolucionario.com/2017/11/15/paloma-quintana-nutricion/
Downloading: https://www.fitnessr

Downloading: https://www.fitnessrevolucionario.com/2017/03/11/meditacion-y-mindfulness/
Downloading: https://www.fitnessrevolucionario.com/2016/11/05/lesiones-no-apliques-hielo-ni-reposes/
Downloading: https://www.fitnessrevolucionario.com/2017/12/06/ejercicio-embarazo-kaisa-tuominen/
Downloading: https://www.fitnessrevolucionario.com/2018/05/12/tecnicas-hipertrofia-avanzadas/
Downloading: https://www.fitnessrevolucionario.com/2013/04/21/como-desarrollar-capacidad-explosiva/
Downloading: https://www.fitnessrevolucionario.com/2015/04/26/creatina-uno-de-los-mejores-suplementos-beneficios-usos-dosis-riesgos/
Downloading: https://www.fitnessrevolucionario.com/2015/06/04/suplemento-de-bcaa-cuando-son-utiles-y-cuando-una-perdida-de-dinero/
Downloading: https://www.fitnessrevolucionario.com/2013/04/07/lo-que-siempre-quisiste-saber-sobre-la-proteina-de-suero/
Downloading: https://www.fitnessrevolucionario.com/2018/05/26/como-fortalecer-sistema-inmune/
Downloading: https://www.fitnessrevolucion

In [17]:
## Store the references
import pandas as pd
# one row per scraped article; each cell holds that article's list of links
rdata = pd.DataFrame(columns=["ref"])
rdata["ref"] = list(referencies)
# sep is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases
rdata.to_csv("referencies.csv", sep=",")