In [28]:
from bs4 import BeautifulSoup  # For HTML parsing
import requests  # For making HTTP requests
import csv  # For reading and writing CSV files
import os  # For file system operations like checking paths
import time  # For pauses between requests
import pandas as pd
import re


In [7]:

# Rewrite the URL prefix of each link so it points to the page used to access the news,
# and save the rewritten links. The generated file name ends in: _approved_pattern.csv

def separate_by_pattern(file_path):
    """Keep the archive links of a CSV file and rewrite them to the news URL base.

    Reads ``file_path`` (a CSV with a ``link`` column), keeps only the rows
    whose link contains the ``quotidiano.repubblica.it/edicola/searchdetail``
    archive prefix, replaces that prefix with the ``ricerca.repubblica.it``
    base and saves the rewritten links, under a single ``link`` column, to
    ``<file_path without extension>_approved_pattern.csv``.

    A short text report with the row counts is written to
    ``output/details_<output file name>.txt`` (relative to the current working
    directory); the ``output`` folder is created when missing.

    When the output CSV already exists nothing is recomputed.

    Args:
        file_path: Path of the source CSV file. It must contain a ``link``
            column.

    Returns:
        The path of the ``_approved_pattern.csv`` file.
    """
    # Literal prefix carried by the links worth keeping. It is matched as plain
    # text (regex=False) so that "?" and "." cannot act as regex operators.
    old_url_base = (
        "https://quotidiano.repubblica.it/edicola/searchdetail?id="
        "http://archivio.repubblica.extra.kataweb.it/archivio/repubblica/"
    )
    # Prefix that replaces the old one in every kept link.
    new_url_base = "https://ricerca.repubblica.it/repubblica/archivio/repubblica/"

    # Folder (relative to the working directory) that receives the report.
    outputfolder = "output"

    output_file = os.path.splitext(file_path)[0] + "_approved_pattern.csv"

    if os.path.exists(output_file):
        print(f"The {os.path.basename(output_file)} file already exists.")
        print(f"CHeck in : {output_file}")
        return output_file

    df_src = pd.read_csv(file_path)

    # Rows whose link contains the archive prefix; missing links never match.
    has_pattern = df_src["link"].str.contains(old_url_base, regex=False, na=False)
    rows_with_pattern = df_src[has_pattern]

    # Swap the old prefix for the new base in one vectorised pass.
    notice_links = rows_with_pattern["link"].str.replace(
        old_url_base, new_url_base, regex=False
    )

    # Building the frame from a named column keeps the "link" header even when
    # no row matched, so the resulting CSV can always be read back.
    df_final = pd.DataFrame({"link": notice_links})
    df_final.to_csv(output_file, index=False)

    # The report folder may not exist yet on a fresh checkout.
    os.makedirs(outputfolder, exist_ok=True)
    details_file = f"details_{os.path.basename(output_file)}.txt"
    with open(os.path.abspath(os.path.join(outputfolder, details_file)), "w") as f:
        f.write(f"Total files: {len(rows_with_pattern)}\n")
        f.write(f"Rows in original file: {len(df_src)}\n")
        f.write(f"Rows with pattern: {len(rows_with_pattern)}\n")
        f.write(f"Rows without pattern: {len(df_src) - len(rows_with_pattern)}\n")

    print(f'The {os.path.basename(output_file)} file was created successfully')
    print(f"CHeck in : {output_file}")

    return output_file



In [1]:
def _find_nested(root, *names):
    """Follow a chain of nested ``find`` calls, returning None at the first missing tag."""
    node = root
    for name in names:
        if node is None:
            return None
        node = node.find(name)
    return node


def _tag_text(tag, default, **get_text_kwargs):
    """Return the text of ``tag``, or ``default`` when the tag is missing."""
    if tag is None:
        return default
    return tag.get_text(**get_text_kwargs)


def parse(response, src_filename):
    """Extract one article from an HTTP response and append it to the output CSV.

    The HTML in ``response.content`` is parsed and the following fields are
    read from inside the first ``<article>`` element:

    * ``title``: text of the first ``<h1>`` (``"No title"`` when absent/empty)
    * ``content``: text of the first ``<p>`` (``""`` when absent)
    * ``date``: text of ``<aside> > <a> > <time>`` (``""`` when absent)

    Missing elements fall back to those defaults instead of raising, so one
    malformed page does not abort a whole scraping run.

    The row is appended to ``<prefix>_output_notice.csv``, located in the same
    directory as ``src_filename``, where ``<prefix>`` is the part of the file
    name before its first underscore. The header is written when the file is
    new or empty.

    Args:
        response: Object exposing ``content`` (the HTML) and ``url``; the
            scraper passes the result of ``requests.get``.
        src_filename: Path of the CSV the link came from; only used to derive
            the output file name.

    Returns:
        None. The article is written to disk.
    """
    # *** *** Parse the HTML content *** ***
    soup = BeautifulSoup(response.content, "html.parser")

    article = soup.find("article")

    title = _tag_text(_find_nested(article, "h1"), "", separator=" ", strip=True)
    title = title or "No title"

    content = _tag_text(_find_nested(article, "p"), "")

    date = _tag_text(
        _find_nested(article, "aside", "a", "time"), "", separator=" ", strip=True
    )
    # *** *** End parse *** ***

    article_data = [
        {"title": title, "content": content, "date": date, "src_url": response.url}
    ]
    fieldnames = ["title", "content", "date", "src_url"]

    # Derive the prefix from the file name only: splitting the whole path would
    # cut it at the first underscore found in any directory name.
    src_dir, src_name = os.path.split(src_filename)
    name_output = os.path.join(src_dir, src_name.split("_")[0] + "_output_notice.csv")

    # A header is needed when the file is about to be created or holds no data yet.
    write_header = (
        not os.path.exists(name_output) or os.path.getsize(name_output) == 0
    )
    with open(name_output, "a", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames, extrasaction="ignore")
        if write_header:
            writer.writeheader()
        writer.writerows(article_data)


In [6]:
def scraper(filename, max_retries=3, wait_minutes=4, timeout=30):
    """Download and store every approved link of ``filename``, resuming from a backup.

    Steps:

    1. ``separate_by_pattern(filename)`` produces (or reuses) the CSV holding
       the links to visit, under its ``link`` column.
    2. ``<prefix>_backup_links.csv`` (same directory as ``filename``;
       ``<prefix>`` is the file name up to its first underscore) lists, under
       ``notice_link``, the links already processed. It is created when
       missing and its links are skipped.
    3. Each remaining link is requested. On HTTP 200 the response is handed to
       ``parse`` and the link is appended to the backup. On HTTP 403 the
       function waits ``wait_minutes`` minutes and requests the *same* link
       again, at most ``max_retries`` times. Any other status, an exhausted
       retry budget or a network error is reported and the link is left out
       of the backup so a later run picks it up again.

    Args:
        filename: Path of the source CSV passed to ``separate_by_pattern``.
        max_retries: Maximum number of waits/retries per link after a 403.
        wait_minutes: Minutes to wait after a 403 before retrying.
        timeout: Seconds allowed for each HTTP request before giving up.

    Returns:
        None.
    """
    # files filtered by pattern
    approved_url_route = separate_by_pattern(filename)

    # Derive the prefix from the file name only: splitting the whole path would
    # cut it at the first underscore found in any directory name.
    src_dir, src_name = os.path.split(filename)
    backup_filepath = os.path.join(
        src_dir, src_name.split("_")[0] + "_backup_links.csv"
    )

    if not os.path.exists(backup_filepath):
        with open(backup_filepath, 'w', newline='', encoding="utf-8") as file:
            csv.DictWriter(file, fieldnames=['notice_link']).writeheader()

    df_backup = pd.read_csv(backup_filepath)

    try:
        df = pd.read_csv(approved_url_route)
    except pd.errors.EmptyDataError:
        # A header-less (completely empty) approved file means nothing to do.
        df = pd.DataFrame(columns=["link"])

    urls_to_skip = set(df_backup["notice_link"])

    for iteration, row in df.iterrows():
        link = row["link"]

        if pd.isna(link):
            print(f"row # {iteration} has no link, jumping...")
            continue

        if link in urls_to_skip:
            print(f"row # {iteration} Link already processed, jumping...")
            continue

        print(f'iteraton in row # {iteration}')

        forbidden_retries = 0
        while True:
            try:
                # Without a timeout a stalled connection blocks the run forever.
                response = requests.get(link, timeout=timeout)
            except requests.RequestException as exc:
                print(f"row # {iteration} request failed: {exc}")
                break

            if response.status_code == 200:
                # Extract data from the website and store it
                parse(response, approved_url_route)

                # Record the link so that later runs skip it
                with open(backup_filepath, 'a', newline="", encoding="utf-8") as file:
                    writer = csv.DictWriter(
                        file, fieldnames=['notice_link'], extrasaction="ignore"
                    )
                    writer.writerow({"notice_link": link})

                # Also skip repeated occurrences of the link within this run
                urls_to_skip.add(link)
                break

            if response.status_code == 403 and forbidden_retries < max_retries:
                forbidden_retries += 1

                print('status is 403\n' * 3)
                print(f'start the {wait_minutes} minute wait')

                for i in range(1, wait_minutes + 1):
                    time.sleep(60)
                    print(f'{i} minutes have passed')

                # Request the same link again instead of silently dropping it
                print(f'retrying row # {iteration}')
                continue

            print(f"row # {iteration} skipped, status {response.status_code}")
            break

    print('Complete scraping')