In [None]:
from bs4 import BeautifulSoup  # For HTML analysis
import requests  # To make HTTP requests
import csv  # To read and write CSV files
import os  # For file system operations like checking paths
import time  # For pauses between requests
import pandas as pd  # For data manipulation and analysis
import re  # For regular expression operations


# Rewrite the archive-viewer URLs so they point directly at the news articles,
# then save the new links. The output file name ends in: _Matched_URLs.csv

def separate_by_pattern(file_path):
    """Split the links of a CSV file by whether they contain the viewer URL prefix.

    Reads ``file_path`` (a CSV with a ``link`` column). Every link containing
    the ``quotidiano.repubblica.it/edicola/searchdetail?id=...`` prefix has
    that prefix replaced by the ``ricerca.repubblica.it`` base and is written
    to ``<file_path without extension>_Matched_URLs.csv``. Rows whose link
    does not contain the prefix are written unchanged to
    ``<file_path without extension>_Unmatched_URLs.csv``.

    If the matched file already exists, nothing is read or rewritten.

    Args:
        file_path: Path of the source CSV file.

    Returns:
        A tuple ``(matched_urls_csv, unmatched_urls_csv)``. The second item is
        ``None`` when no unmatched file is available (no row lacked the
        prefix, or the matched file already existed and no unmatched file is
        on disk).
    """
    # The prefix is a literal string, not a regular expression: matching it
    # literally avoids "." acting as a wildcard and needs no "\?" escape.
    old_url_prefix = (
        "https://quotidiano.repubblica.it/edicola/searchdetail?id="
        "http://archivio.repubblica.extra.kataweb.it/archivio/repubblica/"
    )

    # Base that replaces the prefix
    new_url_base = "https://ricerca.repubblica.it/repubblica/archivio/repubblica/"

    stem = os.path.splitext(file_path)[0]
    # File that receives the rewritten urls that contained the prefix
    matched_urls_csv = stem + "_Matched_URLs.csv"
    # File that receives the rows whose url did not contain the prefix
    unmatched_urls_csv = stem + "_Unmatched_URLs.csv"

    if os.path.exists(matched_urls_csv):
        print(f"The file {os.path.basename(matched_urls_csv)} already exists.")
        print(f"Check in: {matched_urls_csv}")
        # Only report an unmatched file that is really on disk
        if not os.path.exists(unmatched_urls_csv):
            unmatched_urls_csv = None
        return matched_urls_csv, unmatched_urls_csv

    # Read the CSV file and flag, once, the rows whose link has the prefix
    df_src = pd.read_csv(file_path)
    has_prefix = df_src["link"].str.contains(old_url_prefix, na=False, regex=False)

    rows_without_pattern = df_src[~has_prefix]
    if len(rows_without_pattern) > 0:
        rows_without_pattern.to_csv(unmatched_urls_csv, index=False)
        print(f'The file {os.path.basename(unmatched_urls_csv)} was created successfully')
        print(f"Check in: {unmatched_urls_csv}")
    else:
        # Nothing was written, so there is no unmatched file to hand back
        unmatched_urls_csv = None

    # Replace the prefix with the new base in every matching link
    matched_links = [
        link.replace(old_url_prefix, new_url_base)
        for link in df_src.loc[has_prefix, "link"]
    ]

    # Building from a dict keeps the 'link' header even when nothing matched,
    # so the file can always be read back by column name.
    df_final = pd.DataFrame({"link": matched_links})
    df_final.to_csv(matched_urls_csv, index=False)

    print(f'The file {os.path.basename(matched_urls_csv)} was created successfully')
    print(f"Check in: {matched_urls_csv}")

    return matched_urls_csv, unmatched_urls_csv


def parse(response, src_filename):
    """Extract title, first paragraph and date from a page and append them to a CSV.

    The page is searched for its first ``<article>`` element. Inside it, the
    title is the text of the first ``<h1>``, the content is the text of the
    first ``<p>``, and the date is the text of the ``<time>`` found through
    ``<aside>`` -> ``<a>`` -> ``<time>``. Any piece that cannot be found
    falls back to a default (``"No title"`` for the title, ``""`` otherwise)
    instead of raising.

    The row is appended to ``<cwd>/../Output/<prefix>_output_notice.csv``,
    where ``<prefix>`` is the part of ``src_filename``'s base name before its
    first underscore. The header is written when the file is new or empty.

    Args:
        response: Response object; ``response.content`` is parsed as HTML and
            ``response.url`` is stored in the ``src_url`` column.
        src_filename: Path used only to derive the output file name.
    """
    # *** Analyze HTML content ***
    soup = BeautifulSoup(response.content, "html.parser")
    article = soup.find("article")

    # Defaults used whenever an element is missing from the page
    title = "No title"
    content = ""
    date = ""

    if article is not None:
        title_h1 = article.find("h1")
        if title_h1 is not None:
            title = title_h1.get_text(separator=" ", strip=True) or "No title"

        content_tag = article.find("p")
        if content_tag is not None:
            content = content_tag.get_text(separator=" ", strip=True)

        # Walk aside -> a -> time, stopping at the first missing level
        time_tag = article
        for tag_name in ("aside", "a", "time"):
            time_tag = time_tag.find(tag_name)
            if time_tag is None:
                break
        if time_tag is not None:
            date = time_tag.get_text(separator=" ", strip=True)
    # *** End of analysis ***

    # Target data
    fieldnames = ["title", "content", "date", "src_url"]
    article_data = [
        {"title": title, "content": content, "date": date, "src_url": response.url}
    ]

    # Create output directory if it does not exist (no exists/makedirs race)
    output_dir = os.path.join(os.getcwd(), os.pardir, "Output")
    os.makedirs(output_dir, exist_ok=True)

    # File path for the scraped articles
    basename = os.path.basename(src_filename).split("_")[0] + "_output_notice.csv"
    output = os.path.join(output_dir, basename)

    # Append mode creates the file when needed; the header is only required
    # when nothing has been written to it yet.
    needs_header = not os.path.exists(output) or os.path.getsize(output) == 0

    with open(output, "a", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames, extrasaction="ignore")

        if needs_header:
            writer.writeheader()

        writer.writerows(article_data)

def _fetch_with_403_backoff(link, max_attempts=3, wait_minutes=4, timeout=30):
    """Request ``link``, waiting and retrying while the server answers 403.

    Returns the response of the last attempt, or None when the request itself
    failed (connection error, timeout, invalid URL).
    """
    response = None
    for _ in range(max_attempts):
        try:
            # A timeout keeps one unresponsive server from hanging the run
            response = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            print(f'Request failed for {link}: {exc}')
            return None

        if response.status_code != 403:
            break

        print('The status is 403\n' * 3)
        print(f'Starting the {wait_minutes}-minute wait')

        # Wait before retrying, reporting progress every minute
        for i in range(1, wait_minutes + 1):
            time.sleep(60)
            print(f'{i} minutes have passed')
        print('Restarting the scraper based on the backup')
    else:
        print(f'Still getting 403 after {max_attempts} attempts, skipping: {link}')

    return response


def scraper(filename):
    """Download and parse every matched link of ``filename`` not yet processed.

    The links are first split with ``separate_by_pattern``. Each link of the
    matched file is requested; on a 200 response the page is handed to
    ``parse`` and the link is appended to
    ``<prefix>_scraper_activity_record.csv`` (next to ``filename``, where
    ``<prefix>`` is the part of its base name before the first underscore).
    Links already present in that record file are skipped, so an interrupted
    run can be resumed by calling the function again.

    A 403 response triggers a wait followed by a retry of the same link.
    Links that fail (network error, other status codes, repeated 403) are not
    recorded and will be attempted again on the next run.

    Args:
        filename: Path of the source CSV file containing a ``link`` column.

    Returns:
        The unmatched-URLs value returned by ``separate_by_pattern``.
    """
    # Files filtered by pattern
    matchedURL_csv, unmatchedURL_csv = separate_by_pattern(filename)

    # File path for logging visited links
    dirname = os.path.dirname(filename)
    basename = os.path.basename(filename).split("_")[0] + "_scraper_activity_record.csv"
    records_filepath = os.path.join(dirname, basename)

    # Make sure the record file exists with its header before it is read
    if not os.path.exists(records_filepath):
        with open(records_filepath, 'w', newline='', encoding="utf-8") as file:
            csv.DictWriter(file, fieldnames=['notice_link']).writeheader()

    # Links processed by previous runs
    urls_to_skip = set(pd.read_csv(records_filepath)["notice_link"])

    # Read the CSV file containing the links that matched the pattern
    df = pd.read_csv(matchedURL_csv)

    for iteration, link in df["link"].items():
        if link in urls_to_skip:
            print(f"step03_GETnotice Row # {iteration} Link already processed, skipping...")
            continue

        print(f'step03_GETnotice, iteration on row # {iteration}')

        response = _fetch_with_403_backoff(link)
        if response is None:
            continue

        if response.status_code != 200:
            if response.status_code != 403:
                print(f'Unexpected status {response.status_code} for {link}, skipping')
            continue

        # Extract data from the website and store it
        parse(response, matchedURL_csv)

        # The record file already has its header, so always append
        with open(records_filepath, 'a', newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=['notice_link'], extrasaction="ignore")
            writer.writerow({"notice_link": link})

        # Avoid fetching the same link again if it is duplicated in the file
        urls_to_skip.add(link)

    print('Scraping complete')
    # Only announce the next step when the unmatched file is really there
    if unmatchedURL_csv is not None and os.path.exists(unmatchedURL_csv):
        print(f'Urls that did not match the pattern are in {unmatchedURL_csv}')
        print('Starting step04_GETnotice_unmatchedURLs')
    return unmatchedURL_csv
