### Libraries

In [1]:
# Webscraping tools
from scrapy import Selector
import requests

# Data manipulation tools
import pandas as pd
from datetime import datetime

# Flow control tools
from tqdm import tqdm
import time

### Functions

In [2]:
def date_range(start_date = None, end_date = None):
    '''
    Return a list of consecutive dates as strings with the format yyyy/mm/dd.

    Parameters:
        - start_date : first date of the range (inclusive). A string such as
                       "01-01-2021" (mm-dd-yyyy) or any other value pandas.date_range accepts.
        - end_date   : last date of the range (inclusive). Same accepted values.

    Returns:
        A list of "yyyy/mm/dd" strings, one per day. The list is empty when
        start_date is later than end_date.

    Any bound that is omitted defaults to today's date, so calling the function with
    no input gives only today's date, and giving only start_date yields the range up
    to today.
    '''
    # Resolve "today" at call time. A default written in the signature is evaluated
    # only once, when the cell runs, and would go stale in a long-lived kernel.
    today = datetime.today().strftime( '%Y/%m/%d' )
    if start_date is None:
        start_date = today
    if end_date is None:
        end_date = today

    # Create a date range
    dates = pd.date_range( start = start_date, end = end_date )

    # Mutate to a list of strings with the format yyyy/mm/dd
    return dates.strftime( '%Y/%m/%d' ).tolist()

In [46]:
def editorial_scraper(date = None):
    '''
    Scrape the "editorial" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A dict with the keys 'date', 'url', 'section', 'title', 'author' and
        'raw_text'. 'title' and 'raw_text' are the joined text nodes matched by the
        XPath queries below, so they are empty strings when nothing matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/edito'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Create a dataframe row
    row = {
        'date' : date,
        'url' : url,
        'section' : 'editorial',
        'title' : ''.join(selector.xpath( '//article//div[@class="cabeza"]//text()' ).extract()),
        'author' : 'Editorial',
        'raw_text' :''.join(selector.xpath( '//article//div[@id="article-text"]//div//text()' ).extract())
    }

    return row

In [47]:
def correo_scraper(date = None):
    '''
    Scrape the "correo ilustrado" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A dict with the keys 'date', 'url', 'section', 'title', 'author' and
        'raw_text'. 'title' and 'raw_text' are the joined text nodes matched by the
        XPath queries below, so they are empty strings when nothing matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/correo'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Create a dataframe row
    row = {
        'date' : date,
        'url' : url,
        'section' : 'correo_ilustrado',
        'title' : ''.join(selector.xpath( '//article//div[@class="cabeza"]//text()' ).extract()),
        'author' : 'Correo Ilustrado',
        'raw_text' :''.join(selector.xpath( '//article//div[@id="article-text"]//div//text()' ).extract())
    }

    return row

In [48]:
def get_article(date, article):
    '''
    Download one article from La Jornada and return it with its metadata.

    Parameters:
        - date    : edition date as a "yyyy/mm/dd" string; it becomes part of the url.
        - article : article path appended after the date. The text before its first
                    "/" is stored as the section name.

    Returns:
        A dict with the keys 'date', 'url', 'section', 'title', 'author' and
        'raw_text'. The last three are the joined text nodes matched by the XPath
        queries below, so they are empty strings when nothing matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Make article url
    article_url = 'https://www.jornada.com.mx/' + date + '/' + article

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # The timeout stops one stalled article from blocking a whole section crawl
    response = requests.get( article_url, headers = headers, timeout = 30 )
    selector = Selector( text = response.content )

    # Create a dataframe row
    row = {
        'date' : date,
        'url' : article_url,
        'section' : article.split('/')[0],
        'title' : ''.join(selector.xpath( '//article//div[@class="cabeza"]//text()' ).extract()),
        'author' : ''.join(selector.xpath( '//article//div[@itemprop="author"]//text()' ).extract()),
        'raw_text' :''.join(selector.xpath( '//article//div[@id="article-text"]//div//text()' ).extract())
        }

    return row

In [49]:
def opinion_scraper(date = None):
    '''
    Scrape all articles in the "opinion" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/opinion'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Download every article
    rows = [get_article(date, article) for article in articles]

    return rows

In [50]:
def economia_scraper(date = None):
    '''
    Scrape all articles in the "economia" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/economia'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Download every article
    rows = [get_article(date, article) for article in articles]

    return rows

In [51]:
def politica_scraper(date = None):
    '''
    Scrape all articles in the "politica" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/politica'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Download every article
    rows = [get_article(date, article) for article in articles]

    return rows

In [52]:
def estados_scraper(date = None):
    '''
    Scrape all articles in the "estados" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/estados'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Download every article
    rows = [get_article(date, article) for article in articles]

    return rows

In [53]:
def capital_scraper(date = None):
    '''
    Scrape all articles in the "capital" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/capital'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Download every article
    rows = [get_article(date, article) for article in articles]

    return rows

In [54]:
def cultura_scraper(date = None):
    '''
    Scrape all articles in the "cultura" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/cultura'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Download every article
    rows = [get_article(date, article) for article in articles]

    return rows

In [55]:
def espectaculos_scraper(date = None):
    '''
    Scrape all articles in the "espectaculos" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/espectaculos'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Return a plain list like the other section scrapers do: get_article returns
    # dicts, and pd.concat cannot combine dicts (it raises TypeError)
    rows = [get_article(date, article) for article in articles]

    return rows

In [56]:
def deportes_scraper(date = None):
    '''
    Scrape all articles in the "deportes" section from La Jornada for one edition.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A list with one get_article result per distinct article link found on the
        section page, in the order the links appear. The list is empty when no link
        matches.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # Set section url
    url = 'https://www.jornada.com.mx/' + date + '/deportes'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # Request the section page; the timeout stops a stalled connection from
    # blocking the caller indefinitely
    html = requests.get( url, headers = headers, timeout = 30 ).content
    selector = Selector( text = html )

    # Get articles urls. dict.fromkeys drops repeated links but, unlike set(),
    # keeps them in page order so the output is the same on every run.
    hrefs = selector.xpath('//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href').extract()
    articles = list(dict.fromkeys(hrefs))

    # Download every article
    rows = [get_article(date, article) for article in articles]

    return rows

### La Jornada Crawler

In [15]:
def get_news(date = None):
    '''
    Crawl the news published in La Jornada on the specified date.

    Parameters:
        - date : edition date as a "yyyy/mm/dd" string. When omitted, today's date
                 is used (resolved when the function is called).

    Returns:
        A pandas DataFrame with the columns 'date', 'id', 'section', 'title',
        'author' and 'raw_text' and one row per scraped page. 'id' is the page url
        without the site prefix. The frame is empty (columns only) when no page was
        scraped. Returns None when the request for the edition's front page answers
        with a status code of 400 or above.

    Raises:
        Any requests exception (connection error, timeout) propagates to the caller.
    '''
    # Resolve the default at call time. A default written in the signature is
    # evaluated only once, when the cell runs, and goes stale in a long session.
    if date is None:
        date = datetime.today().strftime( '%Y/%m/%d' )

    # This is the base url
    jornada_url = 'https://www.jornada.com.mx'

    # Headers of the request
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

    # XPath queries shared by every page type
    title_xpath = '//article//div[@class="cabeza"]//text()'
    author_xpath = '//article//div[@itemprop="author"]//text()'
    text_xpath = '//article//div[@id="article-text"]//div//text()'
    links_xpath = '//div[@id="section-cont"]/div[contains(@class,"item")]//a/@href'

    columns = ['date', 'id', 'section', 'title', 'author', 'raw_text']

    def fetch_selector(page_url):
        '''Download page_url and return a Selector over its content'''
        # The timeout stops a single stalled request from blocking the whole crawl
        html = requests.get( page_url, headers = headers, timeout = 30 ).content
        return Selector( text = html )

    def make_row(page_selector, page_url, section_name, author = None):
        '''Build one record from a parsed page; the author is read from the page when not given'''
        if author is None:
            author = ''.join(page_selector.xpath( author_xpath ).extract())
        return {
            'date' : date,
            'id' : page_url.replace(jornada_url,''),
            'section' : section_name,
            'title' : ''.join(page_selector.xpath( title_xpath ).extract()),
            'author' : author,
            'raw_text' : ''.join(page_selector.xpath( text_xpath ).extract())
        }

    # Make a request on the main page and give up when the edition is not available
    front_page = requests.get( jornada_url + '/' + date, headers = headers, timeout = 30 )
    if front_page.status_code >= 400:
        return None

    # Sections to crawl
    sections = ['edito','correo','opinion','politica','economia','estados','capital','cultura','espectaculos','deportes']

    # Sections that consist of a single page, mapped to (stored section name, fixed author)
    single_page_sections = {
        'edito' : ('editorial', 'Editorial'),
        'correo' : ('correo_ilustrado', 'Correo Ilustrado')
    }

    # Collect plain records and build the dataframe once at the end; concatenating
    # a dataframe inside the loop copies every previous row on each iteration
    records = []

    # Iterate over each section
    for section in tqdm(sections):
        url = jornada_url + '/' + date + '/' + section
        selector = fetch_selector(url)

        # Editorial and Correo Ilustrado structure: the section page is the article
        if section in single_page_sections:
            section_name, author = single_page_sections[section]
            records.append(make_row(selector, url, section_name, author))
            continue

        # News articles structure: the section page links to the articles.
        # dict.fromkeys drops repeated links while keeping page order.
        articles = list(dict.fromkeys(selector.xpath( links_xpath ).extract()))
        for article in articles:
            article_url = 'https://www.jornada.com.mx/' + date + '/' + article

            # Links flagged as removed are skipped
            if 'remove_article' in article_url:
                continue

            article_selector = fetch_selector(article_url)
            records.append(make_row(article_selector, article_url, section.split('/')[-1]))

    return pd.DataFrame(records, columns = columns)

### Load to a database

In [None]:
import sqlite3

# Establish SQL connection
sqliteConnection = sqlite3.connect('./data/dtm_master_project.db')
# Not used by the loop below; kept so the names this cell defines stay the same
cursor = sqliteConnection.cursor()

# Headers of the request (each scraping function builds its own copy)
headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0' }

try:
    # Get daily news from 2021-01-01 up to today
    dates = date_range('01-01-2021')
    for date in tqdm(dates):
        try:
            # get_news is the crawler defined in this notebook
            news = get_news(date)

            # get_news returns None when the edition's front page is not available
            if news is None:
                continue

            # Push to the database
            news.to_sql('raw_news',sqliteConnection, if_exists='append',index = False)
        except Exception as error:
            # Keep crawling the remaining dates, but report the failure instead of
            # hiding it. Catching Exception rather than everything lets
            # KeyboardInterrupt stop this long-running loop.
            print('Could not load ' + date + ': ' + repr(error))
finally:
    # Close connection even when the loop is interrupted
    sqliteConnection.close()

  7%|██▍                              | 26/356 [10:59:13<161:25:52, 1761.07s/it]