# Transcripciones

In [4]:
from requests_html import HTMLSession
from datetime import datetime
import locale
import os

In [5]:
# Switch LC_TIME to Spanish so that "%B" in strftime/strptime produces and
# parses Spanish month names (e.g. 'diciembre'). The es_ES.UTF-8 locale must
# be installed on the system, otherwise setlocale raises locale.Error.
locale.setlocale(locale.LC_TIME, "es_ES.UTF-8")

'es_ES.UTF-8'

In [6]:
# Listing page of the transcripts ("transcripciones") that is fetched below.
url = "https://lopezobrador.org.mx/transcripciones"

In [7]:
# Fetch the listing page. The session is closed when the `with` block exits,
# while the response `r` stays available for the following cells.
with HTMLSession() as sess:
    r = sess.get(url)
r

<Response [200]>

In [8]:
# Collect every <article> element of the page and keep the first one as a
# sample to explore; the remaining ones go to `rest_articles`.
articles = r.html.find("article")
article, *rest_articles = articles

Datos a extraer de cada artículo:

* Título
* Fecha
* URL

In [9]:
# Inspect the sample article element.
article

<Element 'article' class=('col-md-4', 'post-125704', 'post', 'type-post', 'status-publish', 'format-standard', 'has-post-thumbnail', 'hentry', 'category-audio', 'category-boletines', 'category-comunicados', 'category-version-estenografica', 'category-videos', 'tag-amlo', 'tag-amlo-conferencia', 'tag-amlo-hoy', 'tag-amlo-presidente', 'tag-andres-manuel', 'tag-andres-manuel-lopez-obrador', 'tag-conferencia', 'tag-conferencia-amlo', 'tag-conferencia-de-amlo', 'tag-conferencia-de-amlo-hoy', 'tag-conferencia-de-prensa', 'tag-conferencia-de-prensa-amlo', 'tag-conferencia-de-prensa-presidente', 'tag-conferencia-matutina', 'tag-conferencia-presidente', 'tag-lopez-obrador', 'tag-obrador') id='post-125704'>

In [10]:
# CSS selector for the article's title heading (<h2 class="entry-title">).
qtitle = "h2[class='entry-title']"

In [11]:
# first=True asks for a single element rather than a list of matches.
title_element = article.find(qtitle, first=True)
title_element

<Element 'h2' class=('entry-title',)>

In [12]:
# Text content of the title heading.
title_element.text

'Versión estenográfica de la conferencia de prensa matutina del presidente Andrés Manuel López Obrador'

In [13]:
# Keep the first absolute link found in the title heading; any others are
# discarded. Unpacking fails if the heading has no link at all.
title_url, *_ = title_element.absolute_links

In [14]:
# CSS selector for the article's date (<span class="entry-date">).
qdate = "span[class='entry-date']"

In [15]:
# Element whose text holds the article's date.
date_element = article.find(qdate, first=True)

In [16]:
# Check how today's date renders with this pattern under the current locale.
datetime.now().strftime("%B %d, %Y")

'diciembre 11, 2019'

In [17]:
# Same pattern stored in a variable so it can be reused for parsing below.
fmt = "%B %d, %Y"
datetime.now().strftime(fmt)

'diciembre 11, 2019'

In [18]:
# Parse the article's date text into a datetime using the same pattern.
datetime.strptime(date_element.text, fmt)

datetime.datetime(2019, 12, 11, 0, 0)

In [19]:
def extract_info(article):
    """
    Return a dictionary with the information of one article.

    The title heading and the date span are looked up inside ``article``
    with CSS selectors. The result has three keys:

    * ``"title"``: text of the title heading.
    * ``"url"``: first item obtained when unpacking the heading's
      ``absolute_links``.
    * ``"date"``: ``datetime`` parsed from the date text with the
      pattern ``"%B %d, %Y"``; the month name is matched according to
      the LC_TIME locale in effect.
    """
    title_selector = "h2[class='entry-title']"
    date_selector = "span[class='entry-date']"
    date_pattern = "%B %d, %Y"

    heading = article.find(title_selector, first=True)
    info = {"title": heading.text}
    # Unpacking fails if the heading carries no link at all.
    info["url"], *_ = heading.absolute_links

    date_text = article.find(date_selector, first=True).text
    info["date"] = datetime.strptime(date_text, date_pattern)
    return info

def get_articles(page, sess):
    """
    Fetch one page of the transcript listing and return its articles.

    Parameters
    ----------
    page : int or str
        Page number of the listing; it is converted with ``str`` and
        placed in the URL path as ``.../page/<page>``.
    sess : session object
        Object exposing ``get(url)`` whose response exposes
        ``html.find`` (the notebook passes an ``HTMLSession``).

    Returns
    -------
    Whatever ``response.html.find("article")`` returns for that page.
    """
    base_url = "https://lopezobrador.org.mx/transcripciones"
    # Join with "/" explicitly: os.path.join uses the OS path separator
    # (a backslash on Windows), which would produce an invalid URL.
    url = "/".join((base_url, "page", str(page)))
    response = sess.get(url)
    return response.html.find("article")

In [20]:
from tqdm.notebook import tqdm

In [21]:
# Number of listing pages to scrape (pages 1..N_PAGES).
N_PAGES = 10

# Accumulates one info dict per article across all pages.
total_info = []
with HTMLSession() as sess:
    for page in tqdm(range(1, N_PAGES + 1)):
        # Distinct loop names so the `articles` / `article` variables
        # explored in the earlier cells are not overwritten.
        for page_article in get_articles(page, sess):
            total_info.append(extract_info(page_article))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [22]:
import pandas as pd
# One row per article dict (keys: title, url, date).
info_df = pd.DataFrame(total_info)

In [23]:
# Persist the DataFrame as a pickle file in the working directory.
info_df.to_pickle("amlo.pkl")

In [3]:
import pickle

# Load the saved DataFrame back with pandas' own reader, the counterpart of
# DataFrame.to_pickle used to write the file.
# NOTE: loading a pickle can execute arbitrary code; only read files that
# you created yourself or otherwise trust.
data = pd.read_pickle("amlo.pkl")

data.head()

Unnamed: 0,title,url,date
0,Versión estenográfica. Ceremonia de entrega de...,https://lopezobrador.org.mx/2019/12/09/version...,2019-12-09
1,Versión estenográfica de la conferencia de pre...,https://lopezobrador.org.mx/2019/12/09/version...,2019-12-09
2,Versión estenográfica de la conferencia de pre...,https://lopezobrador.org.mx/2019/12/06/version...,2019-12-06
3,Versión estenográfica de la conferencia de pre...,https://lopezobrador.org.mx/2019/12/05/version...,2019-12-05
4,Versión estenográfica de la conferencia de pre...,https://lopezobrador.org.mx/2019/12/04/version...,2019-12-04
