In [None]:
# This script uses the following libraries:
# - requests: for making HTTP requests (pip install requests)
# - BeautifulSoup (bs4): for parsing HTML (pip install beautifulsoup4)
# - lxml: for parsing HTML with BeautifulSoup (pip install lxml)
# - csv: for reading and writing CSV files (part of Python's standard library)
# - os: for interacting with the operating system (part of Python's standard library)

# To install the necessary libraries, run the following commands in your terminal or command prompt:
# pip install requests
# pip install beautifulsoup4
# pip install lxml


<div style="display: flex; align-items: center; justify-content: center;">
  <div style="flex: 1; text-align: left;">
    <h2>convert_Date</h2>
    <p>Translates Italian dates into the required URL format.</p>
  </div>
  <div style="flex: 1; display: flex; justify-content: flex-end;">
    <img src="images/date_italian.png" style="max-width: 60%; max-height: 80%; margin-left: auto; margin-right: auto;" alt="Date Italian">
  </div>
</div>

---

### Function Description

Translate Italian dates into the format needed for URLs.

**Parameters:**
- *italian_date* (str): Italian date string to be translated.

**Returns:** Translated date string in the format "YYYY-MM-DD".

The function maps Italian month names to month numbers, extracts day, month, and year from the Italian date, and formats it into the desired format.


In [None]:
def convert_Date(italian_date):
    """Translate an Italian date such as "5 gennaio 2020" into "YYYY-MM-DD".

    Parameters:
        italian_date (str): Whitespace-separated "day month year" string
            whose month is spelled out in Italian. Tokens after the third
            one are ignored.

    Returns:
        str: The date formatted as "YYYY-MM-DD" (e.g. "2020-01-05").

    Raises:
        IndexError: If the string has fewer than three tokens.
        KeyError: If the month token is not an Italian month name.
    """
    # Mapping Italian month names to two-digit month numbers
    month_numbers = {
        'gennaio': '01',
        'febbraio': '02',
        'marzo': '03',
        'aprile': '04',
        'maggio': '05',
        'giugno': '06',
        'luglio': '07',
        'agosto': '08',
        'settembre': '09',
        'ottobre': '10',
        'novembre': '11',
        'dicembre': '12',
    }

    # Split the Italian date into its day / month / year tokens
    parts = italian_date.split()
    day, month_name, year = parts[0], parts[1], parts[2]

    # Month names are matched case-insensitively ("Gennaio" == "gennaio")
    month = month_numbers[month_name.lower()]

    # Single-digit days must be zero-padded to honour the YYYY-MM-DD format
    return f"{year}-{month}-{day.zfill(2)}"

<div style="display: flex; align-items: center; justify-content: center;">
  <div style="flex: 1; text-align: left;">
    <h2>create_csv</h2>
    <p>Creates CSV files named after the search term. It contains all scraped links for later use.</p>
  </div>
  <div style="flex: 1; display: flex; justify-content: flex-end;">
    <img src="images/csvs.png" style="max-width: 100%; max-height: 100%; margin-left: auto; margin-right: auto;" alt="CSVs">
  </div>
</div>

---

### Function Description

Create or add scraped links related to a specific search to a CSV file.

**Parameters:**
- *keyword* (str): Search term for the CSV file.
- *from_date* (str): Start date of the search.
- *to_date* (str): End date of the search.
- *the_urls* (list): List of scraped links.

**Returns:** *csv_file_path* (str): Path of the CSV file for this search.

Generates a CSV file named after the search term and specified period. If the file exists, new links are added. If not, a new one is created. Links are written with a "URL" column.


In [None]:
import csv
import os

# Function to generate a CSV file with the scraped links
def create_csv(keyword, from_date, to_date, the_urls):
    """Create, or extend, the CSV file that stores the links of one search.

    The file lives in the "output" directory and is named
    "<keyword>_<from_date>_<to_date>.csv" (spaces in the keyword become
    underscores). It has a single "URL" column.

    Parameters:
        keyword (str): Search term, used to build the file name.
        from_date (str): Start date of the search, used in the file name.
        to_date (str): End date of the search, used in the file name.
        the_urls (list): Links to store. May be empty.

    Returns:
        str: Path of the CSV file.
    """
    OUTPUT_DIR = 'output'
    FIELDNAMES = ["URL"]

    # Create output dir if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Format keyword for file naming
    keyword = keyword.replace(" ", "_")

    # Construct CSV file path/name with dates
    csv_file_path = os.path.join(OUTPUT_DIR, '_'.join([keyword, from_date, to_date]) + '.csv')

    # Drop repeated links while keeping their first-seen order
    unique_urls = list(dict.fromkeys(the_urls))

    if os.path.exists(csv_file_path):
        # File exists: read it to learn which links are already stored
        with open(csv_file_path, mode='r', newline='') as file:
            existing_urls = {row["URL"] for row in csv.DictReader(file)}

        # Only links that are not in the file yet get appended
        new_rows = [{"URL": url} for url in unique_urls if url not in existing_urls]

        if not new_rows:
            print("The new data already exists in the file.")
        else:
            with open(csv_file_path, mode='a', newline='') as file:
                csv.DictWriter(file, fieldnames=FIELDNAMES).writerows(new_rows)
            print(f"Data added to existing file: {csv_file_path}")
    else:
        # File does not exist: write the header followed by every link.
        # With no links the file still gets its header, so the returned
        # path always points to a readable CSV.
        with open(csv_file_path, mode='w', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=FIELDNAMES)
            writer.writeheader()
            writer.writerows({"URL": url} for url in unique_urls)
        print(f"New CSV file created: {csv_file_path}")

    return csv_file_path

###

<div style="display: flex; align-items: center; justify-content: center;">
  <div style="flex: 1; text-align: left;">
    <h2>Get Total Pages</h2>
    <p>The `get_the_total_number_of_pages` function retrieves the total number of pages from the search results. In the main function, this count of pages determines whether iterations should continue, depending on whether there are more than 50 pages to process.</p>
    <img src="images/totalpages.png" style="max-width: 50%; max-height: 80%; margin-left: auto; margin-right: auto;" alt="Total Pages">
  </div>
  <div style="flex: 1; text-align: left;">
    <h2>Get Next Period Date</h2>
    <p>The `get_date_of_next_period` function retrieves the date of the most recent article in the search results. This date is used to determine the period for the next iteration of the scraper. With each iteration, the scraper gradually approaches the specified end date.</p>
    <img src="images/getdate.png" style="max-width: 50%; max-height: 80%; margin-left: auto; margin-right: auto;" alt="Next Period Date">
  </div>
</div>

---


In [None]:
import requests
from bs4 import BeautifulSoup
from lxml import html
import time

# Scrapes total number of pages from search results
def get_the_total_number_of_pages(url):
    """Read the total number of result pages from a search results page.

    Parameters:
        url (str): URL of a search results page.

    Returns:
        int | None: The page count parsed from the page; 0 when the
        expected text is missing or not numeric (treated as "no results");
        None when the request fails or an unexpected status code is
        returned.

    A 403 response is retried after a 4-minute pause, for as long as the
    server keeps answering 403.
    """
    # Retry in a loop rather than by recursion, so a long run of 403
    # responses cannot grow the call stack.
    while True:
        try:
            # A timeout keeps a stalled connection from hanging the scraper;
            # the resulting Timeout is a RequestException and is reported below.
            page = requests.get(url, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None

        if page.status_code == 200:
            soup = BeautifulSoup(page.text, "html.parser")
            root = html.fromstring(str(soup))

            # The count is the second whitespace-separated token of the
            # second text node of the results paragraph.
            try:
                paragraph_text = root.xpath('//*[@id="lista-risultati"]/div/p/text()[2]')[0]
                return int(paragraph_text.split()[1])
            except (IndexError, ValueError):
                # Paragraph missing, too short, or not a number: no results
                return 0

        if page.status_code != 403:
            print(f"Unexpected status code: {page.status_code}")
            return None

        print("403 Forbidden status code encountered. Waiting for 4 minutes before retrying...")

        for i in range(4):
            print(f"Waiting for {i+1} minute...")
            time.sleep(60)


# Gets date of the most recent article result
def get_date_of_next_period(url):
    """Read the date shown on the last article of a search results page.

    Parameters:
        url (str): URL of a search results page.

    Returns:
        str | None: Stripped text of the last article's date link; None
        when the page lists no articles, the request fails, or an
        unexpected status code is returned.

    A 403 response is retried after a 4-minute pause, for as long as the
    server keeps answering 403.
    """
    # Retry in a loop rather than by recursion, so a long run of 403
    # responses cannot grow the call stack.
    while True:
        try:
            # A timeout keeps a stalled connection from hanging the scraper;
            # the resulting Timeout is a RequestException and is reported below.
            page = requests.get(url, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None

        if page.status_code == 200:
            soup = BeautifulSoup(page.text, "html.parser")
            root = html.fromstring(str(soup))

            # One date link per listed article; the last one is wanted
            date_links = root.xpath('//*[@id="lista-risultati"]/article/aside/a')
            if not date_links:
                # No articles on the page: report it through the same
                # None result used for the other failure cases
                print("No article dates found in the results page.")
                return None

            # Return cleaned date string
            return date_links[-1].text_content().strip()

        if page.status_code != 403:
            print(f"Unexpected status code: {page.status_code}")
            return None

        print("403 Forbidden status code encountered. Waiting for 4 minutes before retrying...")

        for i in range(4):
            print(f"Waiting for {i+1} minute...")
            time.sleep(60)


<div style="display: flex; align-items: center; justify-content: center;">
  <div style="flex: 1; text-align: left;">
    <h2>Generate Search Result Page URLs</h2>
    <p>The `urls_generator` function generates a list of URLs for search result pages based on the specified parameters. These URLs are used to perform the search and retrieve the results for scraping.</p>
    <img src="images/urlsgenerator.png" style="max-width: 80%; max-height: 80%; margin-left: auto; margin-right: auto;" alt="URLs Generator">
  </div>
</div>

---

### Function Description

Generate a list of search result page URLs.

**Parameters:**
- *keyword* (str): The search term to be used in the URL.
- *from_date* (str): The start date of the search period.
- *to_date* (str): The end date of the search period.
- *modality* (str): The mode of search (e.g., "any", "all", "exact").
- *number_of_pages* (int): The total number of pages to generate URLs for.

**Returns:** 
- *urls* (list): A list of URLs for search result pages.

The function iterates through the specified number of pages and constructs the URL for each page based on the provided parameters. It formats the keyword string for the URL and uses placeholders for the date range, modality, and page number. The generated URLs are appended to a list and returned.


In [None]:
# Generates list of search result page URLs
def urls_generator(keyword, from_date, to_date, modality, number_of_pages):
    """Build the URLs of search result pages 1..number_of_pages.

    Parameters:
        keyword (str): Search term; spaces are replaced with "+".
        from_date (str): Value for the "fromdate" query parameter.
        to_date (str): Value for the "todate" query parameter.
        modality (str): Value for the "mode" query parameter.
        number_of_pages (int): How many page URLs to build.

    Returns:
        list: One URL per page, in ascending page order (empty when
        number_of_pages is less than 1).
    """
    #! Note: Please do not modify this construction as it will alter the desired behavior or result in a fatal error.
    template = "https://ricerca.repubblica.it/ricerca/repubblica?query={}&fromdate={}&todate={}&sortby=adate&author=&mode={}&page={}"

    # Spaces in the search term become "+" in the query string
    query = keyword.replace(" ", "+")

    # Only the page number changes from one URL to the next
    return [
        template.format(query, from_date, to_date, modality, page_number)
        for page_number in range(1, number_of_pages + 1)
    ]

<div style="display: flex; align-items: center; justify-content: center;">
  <div style="flex: 1; text-align: left;">
    <h2>Scraper</h2>
    <p>The `scraper` function orchestrates the scraping process based on the provided keyword, date range, and modality.</p>
  </div>
</div>

---

### Function Description

The `scraper` function manages the scraping process by coordinating the following operations:

- **Keyword**: The search term used to scrape relevant content.
- **Date Range**: The period over which the scraping is performed, specified by the `from_date` and `to_date` parameters.
- **Modality**: The mode of search (e.g., "any", "all", "exact"), passed as the `mode` parameter of the search URL.

The function begins by generating the initial set of search result URLs using the `urls_generator` function. It then determines the total number of pages for the search using the `get_the_total_number_of_pages` function.

Depending on the total number of pages:
- If the total number of pages is less than or equal to 50, the function scrapes all the URLs and saves them to a CSV file using the `create_csv` function.
- If the total number of pages is greater than 50, the function scrapes the URLs in batches of 50. It iterates through date pagination, generating URLs for each batch and saving them to the CSV file.

Upon completion, the function provides feedback indicating the successful completion of the scraping process.



In [None]:
import csv
import os


# Main scraping function
def scraper_one(keyword, from_date, to_date, modality):
    """Collect the search result page URLs of a keyword into a CSV file.

    At most 50 result pages are generated per search period. When a search
    has more pages, the date of the last article on page 50 becomes the
    start date of the next search period, and the process repeats until a
    period has 50 pages or fewer.

    Parameters:
        keyword (str): Search term.
        from_date (str): Start date of the search period.
        to_date (str): End date of the search period.
        modality (str): Search mode passed to the search URL.

    Returns:
        str | None: Path of the CSV file holding the URLs, or None when
        the search has no results or the page count cannot be read. If a
        later period fails, the path is still returned with the URLs saved
        so far.
    """
    max_pages_per_keyword = 50

    # Generate the first results URL and read the page count from it
    urls = urls_generator(keyword, from_date, to_date, modality, 1)
    total_number_of_pages = get_the_total_number_of_pages(urls[0])  # Example 140

    # The page count is None when the request failed
    if total_number_of_pages is None:
        print("could not get the total number of pages, nothing was saved")
        return None

    # Handle no results case
    if total_number_of_pages == 0:
        print("there is no news with the entry entered")
        return None

    # Up to 50 pages: a single batch covers the whole search
    if total_number_of_pages <= max_pages_per_keyword:
        the_urls = urls_generator(
            keyword, from_date, to_date, modality, total_number_of_pages
        )
        return create_csv(keyword, from_date, to_date, the_urls)

    # More than 50 pages: save the first batch of 50 (creates the CSV)
    the_urls = urls_generator(
        keyword, from_date, to_date, modality, max_pages_per_keyword
    )
    name_output = create_csv(keyword, from_date, to_date, the_urls)

    # Start date of the period that produced the current batch
    period_start = from_date

    # Keep going in batches of 50 using date pagination
    while True:

        # The last article of the last page gives the next start date
        date_of_next_period_in_italian = get_date_of_next_period(the_urls[-1])
        if date_of_next_period_in_italian is None:
            print("could not get the date of the next period, stopping with the urls saved so far")
            break

        date_of_next_period = convert_Date(date_of_next_period_in_italian)

        # An unchanged start date would regenerate the very same batch
        # forever, so stop instead of looping without progress
        if date_of_next_period == period_start:
            print("the next period does not advance, stopping with the urls saved so far")
            break
        period_start = date_of_next_period

        print("procces in " + date_of_next_period)

        # Generate single URL to get total pages of the new period
        the_urls = urls_generator(keyword, date_of_next_period, to_date, modality, 1)
        total_number_of_pages = get_the_total_number_of_pages(the_urls[0])

        # None (request failed) or 0 (no results): nothing left to save
        if not total_number_of_pages:
            print("could not get the pages of the next period, stopping with the urls saved so far")
            break

        if total_number_of_pages > max_pages_per_keyword:
            # More than 50 pages: save another batch of 50 and continue
            the_urls = urls_generator(keyword, date_of_next_period, to_date, modality, max_pages_per_keyword)
            create_csv(keyword, from_date, to_date, the_urls)
            continue

        # 50 pages or fewer: this is the final batch
        the_urls = urls_generator(keyword, date_of_next_period, to_date, modality, total_number_of_pages)
        create_csv(keyword, from_date, to_date, the_urls)

        for _ in range(3):
            print(
                "*********** All existing urls of the keyword were saved END ***********"
            )
        break

    # returns the file name for the second process
    return name_output


  <body>
    <div style="display: flex; align-items: center; justify-content: center">
      <div style="flex: 1; text-align: left">
        <h2>scraper</h2>
        <p>
          The `scraper` function performs scraping operations based on the
          provided parameters:
        </p>
        <ul>
          <li>
            <strong>keyword</strong>: The search term used to scrape relevant
            content. In this case, the keyword is "pizza".
          </li>
          <li>
            <strong>from_date</strong>: The starting date of the search period.
            In the example, it is set to January 1, 1984.
          </li>
          <li>
            <strong>to_date</strong>: The ending date of the search period.
            Here, it is set to January 1, 2024.
          </li>
          <li>
            <strong>modality</strong>: The mode of scraping. It can be any
            specified modality, such as "any".
          </li>
        </ul>
      </div>
    </div>
    Returns the name of the file where the scraped data is stored.
  </body>

In [None]:
# file_process_file = scraper("sopa", '2015-01-01', "2020-01-01", "any")
# print(file_process_file)