In [1]:
import csv
import os

import requests
from bs4 import BeautifulSoup
from lxml import html
import time

def convert_Date(italian_date):
    """Convert an Italian long-form date into ``YYYY-MM-DD``.

    Args:
        italian_date: String made of three whitespace-separated tokens,
            ``"<day> <Italian month name> <year>"`` (e.g. ``"2 febbraio 2023"``).
            The month name is matched case-insensitively.

    Returns:
        The date as ``"<year>-<month>-<day>"`` with a two-digit month and a
        day left-padded with zeros to two digits.

    Raises:
        IndexError: If the input has fewer than three tokens.
        KeyError: If the second token is not a known Italian month name.
    """
    # Mapping of Italian month names to month numbers
    italianMonths_to_Number = {
        "gennaio": "01",
        "febbraio": "02",
        "marzo": "03",
        "aprile": "04",
        "maggio": "05",
        "giugno": "06",
        "luglio": "07",
        "agosto": "08",
        "settembre": "09",
        "ottobre": "10",
        "novembre": "11",
        "dicembre": "12",
    }

    # Split the Italian date into parts (any run of whitespace separates them)
    date_parts = italian_date.split()

    # Pad the day so single-digit days still yield a valid YYYY-MM-DD string
    day = date_parts[0].zfill(2)
    # Lower-case the month so "Febbraio" and "febbraio" are both accepted
    month = italianMonths_to_Number[date_parts[1].lower()]
    year = date_parts[2]

    return f"{year}-{month}-{day}"

# Function to generate a CSV file with the extracted links
def create_csv(keyword, from_date, to_date, urls_list):
    """Write search-result URLs to a CSV file, creating or extending it.

    The file lives in a ``PreprocessedOutput`` directory located one level
    above the current working directory and is named
    ``<keyword with spaces as underscores>_<from_date>_<to_date>.csv``.
    It has a single column with the header ``URL``.

    If the file already exists, only URLs that are not yet present in it are
    appended; otherwise the file is created with a header row. Duplicates
    inside ``urls_list`` are written once. An empty ``urls_list`` is allowed
    (a new file then contains only the header).

    Args:
        keyword: Search keyword, used to build the file name.
        from_date: Start date string, used to build the file name.
        to_date: End date string, used to build the file name.
        urls_list: Iterable of URL strings to store.

    Returns:
        The path of the CSV file.
    """
    # Define the output directory one level up from the current directory
    OUTPUT_DIR = os.path.join(os.getcwd(), os.pardir, "PreprocessedOutput")

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Format the keyword to name the file
    filename = keyword.replace(" ", "_")

    # Build the CSV file path/name with dates
    csv_file_path = os.path.join(
        OUTPUT_DIR, "_".join([filename, from_date, to_date]) + ".csv"
    )

    # Fixed header: do not derive it from the data, which may be empty
    fieldnames = ["URL"]

    file_exists = os.path.exists(csv_file_path)

    # Collect the URLs already stored so they are not written a second time
    known_urls = set()
    if file_exists:
        with open(csv_file_path, mode="r", newline="") as file:
            known_urls = {row["URL"] for row in csv.DictReader(file)}

    # Keep only unseen URLs, preserving the order in which they were given
    new_rows = []
    for url in urls_list:
        if url not in known_urls:
            known_urls.add(url)
            new_rows.append({"URL": url})

    if file_exists:
        if new_rows:
            # The file exists, append only the rows that are actually new
            with open(csv_file_path, mode="a", newline="") as file:
                csv.DictWriter(file, fieldnames=fieldnames).writerows(new_rows)
    else:
        # The file does not exist, create it with a header row
        with open(csv_file_path, mode="w", newline="") as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(new_rows)
        print(f"New CSV file created: {csv_file_path}")

    return csv_file_path

# Extracts the total number of pages from the search results
def get_the_total_number_of_pages(url, max_retries=None, timeout=30):
    """Fetch a search-results page and read the total page count from it.

    On HTTP 403 the function waits three minutes and tries again. The retry
    is a loop rather than a recursive call, so long waits cannot exhaust the
    recursion limit.

    Args:
        url: URL of a search-results page.
        max_retries: Maximum number of retries after a 403 response.
            ``None`` (the default) retries indefinitely.
        timeout: Seconds to wait for the server before giving up on a
            request. A timeout is reported like any other request error.

    Returns:
        The page count as an ``int``; ``0`` when the expected text node is
        not present in the page (treated as "no results"); ``None`` on a
        request error, an unexpected status code, or when ``max_retries``
        is exhausted.

    Raises:
        ValueError: If the extracted token is not an integer.
    """
    retries_done = 0

    while True:
        try:
            # Without a timeout a stalled connection would block forever
            page = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None

        if page.status_code == 200:
            soup = BeautifulSoup(page.text, "html.parser")
            root = html.fromstring(str(soup))

            # The second whitespace-separated token of the second text node
            # of the results paragraph is taken as the page count.
            try:
                paragraph_text = root.xpath(
                    '//*[@id="lista-risultati"]/div/p/text()[2]'
                )[0]
                total_pages = paragraph_text.split()[1]
            except IndexError:
                # Node or token missing: handled as "no results"
                return 0

            # Convert to integer and return
            return int(total_pages)

        if page.status_code != 403:
            print(f"Unexpected status code: {page.status_code}")
            return None

        if max_retries is not None and retries_done >= max_retries:
            print(f"Giving up after {retries_done} retries on 403 Forbidden")
            return None
        retries_done += 1

        print(
            "403 Forbidden status code encountered. Waiting 3 minutes before retrying..."
        )
        for i in range(3):
            print(f"Waiting {i+1} minute...")
            time.sleep(60)
        # Loop around and request the page again

# Gets the date shown for the last article listed on a results page
def get_date_of_next_period(url, max_retries=None, timeout=30):
    """Fetch a search-results page and return the date text of its last article.

    On HTTP 403 the function waits three minutes and tries again. The retry
    is a loop rather than a recursive call, so long waits cannot exhaust the
    recursion limit.

    Args:
        url: URL of a search-results page.
        max_retries: Maximum number of retries after a 403 response.
            ``None`` (the default) retries indefinitely.
        timeout: Seconds to wait for the server before giving up on a
            request. A timeout is reported like any other request error.

    Returns:
        The stripped text content of the last matching ``article/aside/a``
        element, or ``None`` on a request error, an unexpected status code,
        a page without matching elements, or when ``max_retries`` is
        exhausted.
    """
    retries_done = 0

    while True:
        try:
            # Without a timeout a stalled connection would block forever
            page = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None

        if page.status_code == 200:
            soup = BeautifulSoup(page.text, "html.parser")
            root = html.fromstring(str(soup))

            # One link per listed article; the last one belongs to the final
            # article of this results page.
            date_links = root.xpath('//*[@id="lista-risultati"]/article/aside/a')
            if not date_links:
                # Indexing [-1] on an empty list would raise IndexError
                print("No article dates found on the results page")
                return None

            # Return cleaned date string
            return date_links[-1].text_content().strip()

        if page.status_code != 403:
            print(f"Unexpected status code: {page.status_code}")
            return None

        if max_retries is not None and retries_done >= max_retries:
            print(f"Giving up after {retries_done} retries on 403 Forbidden")
            return None
        retries_done += 1

        print(
            "403 Forbidden status code encountered. Waiting 3 minutes before retrying..."
        )
        for i in range(3):
            print(f"Waiting {i+1} minute...")
            time.sleep(60)
        # Loop around and request the page again

# Generates a list of URLs for search result pages
def urls_generator(keyword, from_date, to_date, modality, number_of_pages):
    """Build one search URL per result page, for pages 1..number_of_pages.

    Args:
        keyword: Search text; each space is replaced with ``+``.
        from_date: Value placed in the ``fromdate`` query parameter.
        to_date: Value placed in the ``todate`` query parameter.
        modality: Value placed in the ``mode`` query parameter.
        number_of_pages: How many page URLs to produce.

    Returns:
        A list of URL strings in ascending page order (empty when
        ``number_of_pages`` is 0).
    """
    #! Note: Please do not modify this construction as it will alter the desired behavior or result in a fatal error.
    url_base = "https://ricerca.repubblica.it/ricerca/repubblica?query={}&fromdate={}&todate={}&sortby=adate&author=&mode={}&page={}"

    # Spaces in the keyword become "+" in the query string
    query = "+".join(keyword.split(" "))

    # Only the page number varies between the generated URLs
    return [
        url_base.format(query, from_date, to_date, modality, page_number)
        for page_number in range(1, number_of_pages + 1)
    ]

# Main scraping function
def scraper_one(keyword, from_date, to_date, modality):
    """Collect all search-result page URLs for a keyword and save them to CSV.

    At most 50 page URLs are generated per date range. When a range has
    more than 50 pages, the date shown for the last article on page 50 is
    used as the start date of the next range, and the process repeats until
    a range has 50 pages or fewer.

    Args:
        keyword: Search text.
        from_date: Start date of the search (the dates produced for later
            ranges are in ``YYYY-MM-DD`` form).
        to_date: End date of the search.
        modality: Search mode passed through to the URL.

    Returns:
        The path of the CSV file, or ``None`` when there are no results or
        the first page count could not be read. If a later request fails or
        the start date stops advancing, the path of the (incomplete) CSV
        file is returned after printing a message.
    """
    max_pages_per_keyword = 50

    # A single URL is enough to read the total number of pages
    first_page_url = urls_generator(keyword, from_date, to_date, modality, 1)[0]
    total_number_of_pages = get_the_total_number_of_pages(first_page_url)

    # The page-count helper returns None on request/status errors
    if total_number_of_pages is None:
        print("Could not determine the number of result pages; nothing was saved")
        return None

    # Handle no results case
    if total_number_of_pages == 0:
        print("No news found with the provided entry")
        return None

    # First batch: every page when there are 50 or fewer, otherwise the first 50
    urls_list = urls_generator(
        keyword,
        from_date,
        to_date,
        modality,
        min(total_number_of_pages, max_pages_per_keyword),
    )

    # Save to CSV (create)
    name_output = create_csv(keyword, from_date, to_date, urls_list)

    if total_number_of_pages <= max_pages_per_keyword:
        return name_output

    # More than 50 pages: continue in batches using date pagination
    current_start_date = from_date
    while True:
        # Get the start date of the next period from the last URL of the batch
        date_of_next_period_in_italian = get_date_of_next_period(urls_list[-1])
        if date_of_next_period_in_italian is None:
            print("Could not read the date of the next period; the CSV file is incomplete")
            return name_output

        date_of_next_period = convert_Date(date_of_next_period_in_italian)

        # Same start date again means the next batch would be identical to
        # this one, so the loop would never terminate.
        if date_of_next_period == current_start_date:
            print(
                "The start date did not advance past "
                + date_of_next_period
                + "; stopping, the CSV file is incomplete"
            )
            return name_output
        current_start_date = date_of_next_period

        print("step01_URLgenerAtor, processing in " + date_of_next_period)

        # Get the total number of pages for the new date range
        probe_url = urls_generator(
            keyword, date_of_next_period, to_date, modality, 1
        )[0]
        total_number_of_pages = get_the_total_number_of_pages(probe_url)
        if total_number_of_pages is None:
            print("Could not determine the number of result pages; the CSV file is incomplete")
            return name_output

        # Generate up to 50 pages for this range
        urls_list = urls_generator(
            keyword,
            date_of_next_period,
            to_date,
            modality,
            min(total_number_of_pages, max_pages_per_keyword),
        )

        # Save to CSV (append)
        create_csv(keyword, from_date, to_date, urls_list)

        if total_number_of_pages <= max_pages_per_keyword:
            # This range fit in one batch, so nothing is left to fetch
            print(
                "All existing URLs for the keyword have been saved"
            )
            break

    print('**********Complete url generator***********')
    # Returns the file name for the next process
    return name_output


In [2]:
# if __name__== "__main__":
#     file_process_file = scraper_one("mafia nigeriana", '2023-01-01', "2024-01-01", "any")
#     print(file_process_file)

New CSV file created: d:\Dropbox\Anzony\repubblica\LaRepubblicaDataHarvest\src\..\PreprocessedOutput\mafia_nigeriana_2023-01-01_2024-01-01.csv
Processing in 2023-02-02
Data added to existing file: d:\Dropbox\Anzony\repubblica\LaRepubblicaDataHarvest\src\..\PreprocessedOutput\mafia_nigeriana_2023-01-01_2024-01-01.csv
Processing in 2023-03-24
Data added to existing file: d:\Dropbox\Anzony\repubblica\LaRepubblicaDataHarvest\src\..\PreprocessedOutput\mafia_nigeriana_2023-01-01_2024-01-01.csv
Processing in 2023-05-18
Data added to existing file: d:\Dropbox\Anzony\repubblica\LaRepubblicaDataHarvest\src\..\PreprocessedOutput\mafia_nigeriana_2023-01-01_2024-01-01.csv
Processing in 2023-07-09
Data added to existing file: d:\Dropbox\Anzony\repubblica\LaRepubblicaDataHarvest\src\..\PreprocessedOutput\mafia_nigeriana_2023-01-01_2024-01-01.csv
Processing in 2023-09-04
Data added to existing file: d:\Dropbox\Anzony\repubblica\LaRepubblicaDataHarvest\src\..\PreprocessedOutput\mafia_nigeriana_2023-01-