<div style="display: flex; align-items: center; justify-content: center;">
  <div style="flex: 1; text-align: center;">
    <h2>convert_Date</h2>
    <p>Translates the Italian-language dates found in the search results into the <code>YYYY-MM-DD</code> format that the search URL expects.</p>
  </div>
  <div style="flex: 1; display: flex; justify-content: flex-end;">
    <img src="images/date_italian.png" style="max-width: 80%; max-height: 80%; margin-left: auto; margin-right: auto;" alt="Date Italian">
  </div>
</div>


In [None]:
def convert_Date(italian_date):
    """Convert an Italian date such as ``'08 maggio 2023'`` to ``'2023-05-08'``.

    The input is split on whitespace; the first three tokens are read as
    day, Italian month name and year. Any further tokens are ignored.

    Args:
        italian_date: String of the form ``'<day> <month name> <year>'``.
            The month name is matched case-insensitively.

    Returns:
        The date as ``'<year>-<month>-<day>'`` with a two-digit month and a
        day left-padded with zeros to at least two characters.

    Raises:
        ValueError: If fewer than three tokens are present or the month name
            is not a known Italian month.
    """
    # Mapping Italian month names to month numbers
    month_number_by_name = {
        'gennaio': '01',
        'febbraio': '02',
        'marzo': '03',
        'aprile': '04',
        'maggio': '05',
        'giugno': '06',
        'luglio': '07',
        'agosto': '08',
        'settembre': '09',
        'ottobre': '10',
        'novembre': '11',
        'dicembre': '12'
    }

    # Split the Italian date into parts
    parts = italian_date.split()
    if len(parts) < 3:
        raise ValueError(
            f"expected '<day> <month> <year>', got {italian_date!r}"
        )
    day, month_name, year = parts[:3]

    # Lower-case the month so 'Maggio' / 'MAGGIO' are accepted as well
    try:
        month = month_number_by_name[month_name.lower()]
    except KeyError:
        raise ValueError(
            f"unknown Italian month name {month_name!r} in {italian_date!r}"
        ) from None

    # Pad the day so '8 maggio 2023' becomes '2023-05-08', not '2023-05-8'
    return f"{year}-{month}-{day.zfill(2)}"

<div style="display: flex; align-items: center; justify-content: center;">
  <div style="flex: 1; text-align: center;">
    <h2>create_csv</h2>
    <p>Creates a CSV file named after the search passed to the main function (keyword and date range). The file stores all the links that will be scraped later; if the file already exists, the new links are appended to it.</p>
  </div>
  <div style="flex: 1; display: flex; justify-content: flex-end;">
    <img src="images/csvs.png" style="max-width: 100%; max-height: 100%; margin-left: auto; margin-right: auto;" alt="CSVs">
  </div>
</div>


In [1]:
import csv
import os

def create_csv(keyword, from_date, to_date, the_urls):
    """Write the given URLs to ``<keyword>_<from_date>_<to_date>.csv``.

    Spaces in ``keyword`` are replaced by underscores for the file name.
    The file is created in the current working directory with a single
    ``URL`` column. If it already exists, the rows are appended and no
    header is written again.

    Args:
        keyword: Search keyword, used as the first part of the file name.
        from_date: Start date string, second part of the file name.
        to_date: End date string, third part of the file name.
        the_urls: Iterable of URL strings, one CSV row each. May be empty,
            in which case only the header is written for a new file and
            nothing is added to an existing one.
    """
    # Fixed header: deriving it from the first row would fail on empty input
    fieldnames = ["URL"]

    # Format keyword for file naming
    keyword = keyword.replace(" ", "_")

    # Construct CSV file path/name with dates
    csv_file_path = '_'.join([keyword, from_date, to_date]) + '.csv'

    file_exists = os.path.exists(csv_file_path)

    # Append to an existing file, otherwise create it and write the header
    with open(
        csv_file_path,
        mode='a' if file_exists else 'w',
        newline='',
        encoding='utf-8',
    ) as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        writer.writerows({"URL": url} for url in the_urls)

    if file_exists:
        print(f"Data added to existing file: {csv_file_path}")
    else:
        print(f"New CSV file created: {csv_file_path}")

### Helpers that read the search results page (page count and last result date)

In [None]:
import requests
from bs4 import BeautifulSoup
from lxml import html


def get_the_total_number_of_pages(url, timeout=30):
    """Return the total number of result pages reported by a search page.

    The page is downloaded and the second text node of the paragraph under
    ``#lista-risultati/div`` is read; its second whitespace-separated token
    is taken as the page count.

    Args:
        url: URL of a search results page.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the notebook.

    Returns:
        The page count as an ``int``, or ``0`` when the expected paragraph
        or token is missing (treated by the caller as "no results").

    Raises:
        requests.RequestException: On network errors or timeout.
        ValueError: If the extracted token is not an integer.
    """
    # Issue GET request and parse HTML
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")
    root = html.fromstring(str(soup))

    # XPath to extract total pages text.
    # Only a missing node/token means "no results"; anything else (including
    # KeyboardInterrupt) must propagate instead of being silently swallowed.
    try:
        paragraph_text = root.xpath('//*[@id="lista-risultati"]/div/p/text()[2]')[0]
        total_pages = paragraph_text.split()[1]
    except IndexError:
        return 0

    # Convert to integer and return
    return int(total_pages)


def get_date_of_next_period(url, timeout=30):
    """Return the date text of the last result listed on a search page.

    The page is downloaded and the last ``<a>`` found under
    ``#lista-risultati/article/aside`` is selected; its text content,
    stripped of surrounding whitespace, is returned unchanged.

    Args:
        url: URL of a search results page.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the notebook.

    Returns:
        The stripped text of the last matching link (the caller passes it
        to ``convert_Date``, so it is presumably an Italian date string —
        confirm against the live page).

    Raises:
        requests.RequestException: On network errors or timeout.
        IndexError: If the page contains no matching result links.
    """
    # Request page and parse HTML
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")
    root = html.fromstring(str(soup))

    # XPath to extract the date links of the results set
    date_links = root.xpath('//*[@id="lista-risultati"]/article/aside/a')
    if not date_links:
        # Same exception type as indexing an empty list, but with context
        raise IndexError(f"no dated results found at {url}")

    # Return cleaned date string of the last listed result
    return date_links[-1].text_content().strip()

In [13]:
def urls_generator(keyword, from_date, to_date, modality, number_of_pages):
    """Build the search-results URLs for pages ``1..number_of_pages``.

    Spaces in ``keyword`` are replaced by ``+``; every other argument is
    inserted into the URL as given.

    Args:
        keyword: Search text.
        from_date: Value for the ``fromdate`` query parameter.
        to_date: Value for the ``todate`` query parameter.
        modality: Value for the ``mode`` query parameter.
        number_of_pages: How many consecutive pages, starting at 1, to
            generate URLs for.

    Returns:
        A list of URL strings, one per page, in ascending page order
        (empty when ``number_of_pages`` is less than 1).
    """
    # Search results URL template; placeholders are filled positionally
    template = "https://ricerca.repubblica.it/ricerca/repubblica?query={}&fromdate={}&todate={}&sortby=adate&author=&mode={}&page={}"

    # Format keyword string for URL
    query = keyword.replace(" ", "+")

    # One URL per page number; only the page placeholder varies
    return [
        template.format(query, from_date, to_date, modality, page_number)
        for page_number in range(1, number_of_pages + 1)
    ]

In [22]:
import csv
import os


def scraper(keyword, from_date, to_date, modality):
    """Collect all search-result page URLs for a query and save them to CSV.

    At most 50 page URLs are generated per query (presumably because the
    site serves no more than 50 pages per search — confirm). When a search
    has more pages, the last result date on page 50 is used as the start
    date of a new search, and this repeats until a search fits within
    50 pages. Every batch is written to the same CSV file, named after the
    original ``keyword``, ``from_date`` and ``to_date``.

    Args:
        keyword: Search text.
        from_date: Start date of the search, as expected by the URL.
        to_date: End date of the search, as expected by the URL.
        modality: Search mode passed through to the URL.

    Returns:
        None. Progress and results are reported with ``print``.
    """
    max_pages_per_keyword = 50

    # Generate the first results URL and read the total page count from it
    urls = urls_generator(keyword, from_date, to_date, modality, 1)
    total_number_of_pages = get_the_total_number_of_pages(urls[0])  # Example 140

    # Handle no results case
    if total_number_of_pages == 0:
        print("there is no news with the entry entered")
        return None

    # Everything fits in a single batch: save all pages and finish
    if total_number_of_pages <= max_pages_per_keyword:
        the_urls = urls_generator(
            keyword, from_date, to_date, modality, total_number_of_pages
        )
        create_csv(keyword, from_date, to_date, the_urls)
        return None

    # More pages than one batch can hold: save the first batch
    the_urls = urls_generator(
        keyword, from_date, to_date, modality, max_pages_per_keyword
    )
    create_csv(keyword, from_date, to_date, the_urls)
    current_period_start = from_date

    # Continue in batches, moving the start date forward each time
    while True:

        # The last date on the last page of this batch starts the next period
        date_of_next_period_in_italian = get_date_of_next_period(the_urls[-1])
        date_of_next_period = convert_Date(date_of_next_period_in_italian)

        # If the start date did not move, the next query would be identical
        # to the one just made and the loop would append duplicates forever.
        if date_of_next_period == current_period_start:
            print(
                "stopping: results did not advance past "
                + date_of_next_period
                + "; remaining pages cannot be reached by moving the start date"
            )
            return None
        current_period_start = date_of_next_period
        print("procces in " + date_of_next_period)

        # Generate single URL to get total pages for the new date range
        the_urls = urls_generator(keyword, date_of_next_period, to_date, modality, 1)
        total_number_of_pages = get_the_total_number_of_pages(the_urls[0])

        # Take a full batch, or all remaining pages if fewer are left
        pages_in_batch = min(total_number_of_pages, max_pages_per_keyword)
        the_urls = urls_generator(
            keyword, date_of_next_period, to_date, modality, pages_in_batch
        )

        # A zero page count yields no URLs; skip writing in that case
        if the_urls:
            create_csv(keyword, from_date, to_date, the_urls)

        # Last batch reached: announce completion and stop
        if total_number_of_pages <= max_pages_per_keyword:
            for _ in range(3):
                print(
                    "*********** All existing urls of the keyword were saved END ***********"
                )
            break

In [23]:
scraper("mafia nigeriana", '2023-05-08', "2024-01-01", "any")

Nuevo archivo CSV creado: mafia_nigeriana_2023-05-08_2024-01-01.csv
procces in 2023-06-30
Data added to existing file: mafia_nigeriana_2023-05-08_2024-01-01.csv
procces in 2023-08-23
Data added to existing file: mafia_nigeriana_2023-05-08_2024-01-01.csv
procces in 2023-11-16
Data added to existing file: mafia_nigeriana_2023-05-08_2024-01-01.csv
*********** All existing urls of the keyword were saved END ***********
*********** All existing urls of the keyword were saved END ***********
*********** All existing urls of the keyword were saved END ***********


In [21]:
convert_Date('08 maggio 2023')

'2023-05-08'