In [1]:
from bs4 import BeautifulSoup  # For HTML parsing
import requests  # For making HTTP requests
import csv  # For reading and writing CSV files
import os  # For file system operations like checking paths
import time  # For pauses between requests
import pandas as pd  # For data manipulation and analysis
from multiprocessing import Pool  # For parallelizing tasks
from functools import partial

def parse_unmadchedURLs(response, title_link, date_link, src_url):
    """Extract the text of the known article containers from a fetched page.

    The page is parsed with BeautifulSoup's "html.parser" and three ``<div>``
    elements are looked up by id: "article-body", "story__summary" and
    "detail_summary". For each one the visible text is collected with single
    spaces between fragments; when the element is absent the placeholder
    "<id> not found" is stored instead.

    Args:
        response: Fetched page; its ``content`` is parsed and its ``url`` is
            stored as the article link.
        title_link: Value stored under "title".
        date_link: Value stored under "date".
        src_url: Value stored under "src_url".

    Returns:
        A one-element list holding the article dict, ready to be passed to
        ``csv.DictWriter.writerows``.
    """
    soup = BeautifulSoup(response.content, "html.parser")

    def _div_text(div_id):
        # ``find`` returns None when no matching <div> exists; test for that
        # explicitly rather than catching every exception with a bare except.
        container = soup.find("div", id=div_id)
        if container is None:
            return f"{div_id} not found"
        return container.get_text(separator=" ", strip=True)

    return [
        {
            "title": title_link,
            "article_body": _div_text("article-body"),
            "story__summary": _div_text("story__summary"),
            "detail_summary": _div_text("detail_summary"),
            "date": date_link,
            "link": response.url,
            "src_url": src_url,
        }
    ]

def check_mode(file_name):
    """Choose the ``open()`` mode for a file that may not exist yet.

    Returns "a" (append) when ``file_name`` already exists, otherwise "w"
    (create).
    """
    if os.path.exists(file_name):
        return "a"
    return "w"

def write_csv(file_name, mode, fieldnames, article_data=None):
    """Write a header and/or rows to a UTF-8 CSV file.

    Args:
        file_name: Path of the CSV file to open.
        mode: Mode passed to ``open()``. The header row is written only when
            this is exactly "w".
        fieldnames: Column names; keys of a row that are not listed here are
            ignored.
        article_data: Optional iterable of row dicts. Nothing is written when
            it is None or empty.
    """
    needs_header = mode == "w"

    with open(file_name, mode, newline="", encoding="utf-8") as handle:
        csv_writer = csv.DictWriter(
            handle, fieldnames=fieldnames, extrasaction="ignore"
        )
        if needs_header:
            csv_writer.writeheader()
        if not article_data:
            return
        csv_writer.writerows(article_data)

def scraper_unmatchedURLS(row, output_filename, timeout=30):
    """Fetch one article URL, store its parsed content and log the link.

    Two CSV files are derived from the first "_"-separated token of the base
    name of ``output_filename``:

    * ``<token>_activity_rec_unmatchedURLs.csv``, in the same directory as
      ``output_filename``: the log of links already scraped ("link" column).
    * ``<token>_output_notice_unmatchedURLs.csv``, inside the ``Output``
      directory that sits next to the current working directory: the scraped
      articles.

    A link is appended to the log only after a 200 response has been parsed
    and written, so links that fail (403, 307, other statuses, exceptions) are
    attempted again the next time this function sees them.

    Args:
        row: Mapping with the keys "link", "title", "date" and "page_url".
        output_filename: Path used to derive the log and output file names.
        timeout: Seconds to wait for the server before giving up on the
            request. Without a timeout a stalled connection would block the
            worker indefinitely.
    """
    name_prefix = os.path.basename(output_filename).split("_")[0]
    activity_records_path = os.path.join(
        os.path.dirname(output_filename),
        name_prefix + "_activity_rec_unmatchedURLs.csv",
    )

    # Make sure the log exists (header only on first creation) so that it can
    # be read back below.
    write_csv(activity_records_path, check_mode(activity_records_path), ["link"])

    # The log is re-read on every call so that links recorded since the
    # previous call are taken into account.
    urls_to_skip = set(pd.read_csv(activity_records_path).link)

    link = row["link"]
    title_link = row["title"]
    date_link = row["date"]
    src_url = row["page_url"]

    if link in urls_to_skip:
        print(f"Row already processed, skipping...{activity_records_path}")
    else:
        try:
            response = requests.get(link, timeout=timeout)
            status = response.status_code

            if status == 200:
                article_data = parse_unmadchedURLs(
                    response, title_link, date_link, src_url
                )

                fieldnames = [
                    "title",
                    "article_body",
                    "story__summary",
                    "detail_summary",
                    "date",
                    "link",
                    "src_url",
                ]

                # Create the output directory if it does not exist.
                # ``exist_ok=True`` avoids the check-then-create race between
                # parallel workers, whose FileExistsError would otherwise be
                # swallowed below and cost us this article.
                OUTPUT_DIR = os.path.abspath(
                    os.path.join(os.getcwd(), os.pardir, "Output")
                )
                os.makedirs(OUTPUT_DIR, exist_ok=True)

                # Name of the output file.
                name_output = os.path.join(
                    OUTPUT_DIR, name_prefix + "_output_notice_unmatchedURLs.csv"
                )

                # Append when the file exists, otherwise create it with header.
                write_csv(
                    name_output, check_mode(name_output), fieldnames, article_data
                )

                # Record the link only after the article has been stored.
                write_csv(
                    activity_records_path,
                    check_mode(activity_records_path),
                    ["link"],
                    [{"link": link}],
                )

            elif status == 403:
                print("Status is 403")
                print("Starting 3-minute wait")

                # Back off for 3 minutes. The request is not retried here;
                # the link stays out of the log, so a later run picks it up.
                for i in range(1, 4):
                    time.sleep(60)
                    print(f"{i} minutes have passed")
                print("Restarting the scraper based on backup")

            elif status == 307:
                print(link)
                print("Too many redirects")

            else:
                # Previously any other status was dropped without a trace.
                print(f"Unexpected status {status} for: {link}")

        except Exception as e:
            # Best effort per row: report the failure and let the caller move
            # on to the next link.
            print(f'Check: \n{link}')
            print(f"Error: {e}")

    print("step04_GETnotice_unmatchedURLs, Crawling complete")

def main_unmatched_URLs(input_csv):
    """Scrape every row of ``input_csv`` using a pool of worker processes.

    Each row of the CSV is converted to a dict and handed to
    ``scraper_unmatchedURLS`` together with ``input_csv`` as its
    ``output_filename``. When the file does not exist a message is printed
    and nothing else happens.
    """
    if not os.path.exists(input_csv):
        print("Input CSV file not found.")
        return

    records = pd.read_csv(input_csv).to_dict("records")

    # Bind the CSV path so that ``Pool.map`` only has to supply the row.
    worker = partial(scraper_unmatchedURLS, output_filename=input_csv)

    with Pool() as pool:
        pool.map(worker, records)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd  # For data manipulation and analysis


In [None]:

# if __name__ == "__main__":
#     main_unmatched_URLs('d:\\Dropbox\\Anzony\\repubblica\\LaRepubblicaDataHarvest_Es\\PreprocessedOutput\\mafia_urls_processed_Unmatched_URLs.csv')

In [None]:

def parse_unmadchedURLs(response, title_link, date_link, src_url):
    """Extract the text of the known article containers from a fetched page.

    The page is parsed with BeautifulSoup's "html.parser" and three ``<div>``
    elements are looked up by id: "article-body", "story__summary" and
    "detail_summary". For each one the visible text is collected with single
    spaces between fragments; when the element is absent the placeholder
    "<id> not found" is stored instead.

    Args:
        response: Fetched page; its ``content`` is parsed and its ``url`` is
            stored as the article link.
        title_link: Value stored under "title".
        date_link: Value stored under "date".
        src_url: Value stored under "src_url".

    Returns:
        A one-element list holding the article dict, ready to be passed to
        ``csv.DictWriter.writerows``.
    """
    soup = BeautifulSoup(response.content, "html.parser")

    def _div_text(div_id):
        # ``find`` returns None when no matching <div> exists; test for that
        # explicitly rather than catching every exception with a bare except.
        container = soup.find("div", id=div_id)
        if container is None:
            return f"{div_id} not found"
        return container.get_text(separator=" ", strip=True)

    return [
        {
            "title": title_link,
            "article_body": _div_text("article-body"),
            "story__summary": _div_text("story__summary"),
            "detail_summary": _div_text("detail_summary"),
            "date": date_link,
            "link": response.url,
            "src_url": src_url,
        }
    ]

def check_mode(file_name):
    """Choose the ``open()`` mode for a file that may not exist yet.

    Returns "a" (append) when ``file_name`` already exists, otherwise "w"
    (create).
    """
    if os.path.exists(file_name):
        return "a"
    return "w"

def write_csv(file_name, mode, fieldnames, article_data=None):
    """Write a header and/or rows to a UTF-8 CSV file.

    Args:
        file_name: Path of the CSV file to open.
        mode: Mode passed to ``open()``. The header row is written only when
            this is exactly "w".
        fieldnames: Column names; keys of a row that are not listed here are
            ignored.
        article_data: Optional iterable of row dicts. Nothing is written when
            it is None or empty.
    """
    needs_header = mode == "w"

    with open(file_name, mode, newline="", encoding="utf-8") as handle:
        csv_writer = csv.DictWriter(
            handle, fieldnames=fieldnames, extrasaction="ignore"
        )
        if needs_header:
            csv_writer.writeheader()
        if not article_data:
            return
        csv_writer.writerows(article_data)

def scraper_unmatchedURLS(row, output_filename, timeout=30):
    """Fetch one article URL, store its parsed content and log the link.

    Two CSV files are derived from the first "_"-separated token of the base
    name of ``output_filename``:

    * ``<token>_activity_rec_unmatchedURLs.csv``, in the same directory as
      ``output_filename``: the log of links already scraped ("link" column).
    * ``<token>_output_notice_unmatchedURLs.csv``, inside the ``Output``
      directory that sits next to the current working directory: the scraped
      articles.

    A link is appended to the log only after a 200 response has been parsed
    and written, so links that fail (403, 307, other statuses, exceptions) are
    attempted again the next time this function sees them.

    Args:
        row: Mapping with the keys "link", "title", "date" and "page_url".
        output_filename: Path used to derive the log and output file names.
        timeout: Seconds to wait for the server before giving up on the
            request. Without a timeout a stalled connection would block the
            caller indefinitely.
    """
    name_prefix = os.path.basename(output_filename).split("_")[0]
    activity_records_path = os.path.join(
        os.path.dirname(output_filename),
        name_prefix + "_activity_rec_unmatchedURLs.csv",
    )

    # Make sure the log exists (header only on first creation) so that it can
    # be read back below.
    write_csv(activity_records_path, check_mode(activity_records_path), ["link"])

    # The log is re-read on every call so that links recorded since the
    # previous call are taken into account.
    urls_to_skip = set(pd.read_csv(activity_records_path).link)

    link = row["link"]
    title_link = row["title"]
    date_link = row["date"]
    src_url = row["page_url"]

    if link in urls_to_skip:
        print(f"Row already processed, skipping...{activity_records_path}")
    else:
        try:
            response = requests.get(link, timeout=timeout)
            status = response.status_code

            if status == 200:
                article_data = parse_unmadchedURLs(
                    response, title_link, date_link, src_url
                )

                fieldnames = [
                    "title",
                    "article_body",
                    "story__summary",
                    "detail_summary",
                    "date",
                    "link",
                    "src_url",
                ]

                # Create the output directory if it does not exist.
                # ``exist_ok=True`` avoids a check-then-create race whose
                # FileExistsError would otherwise be swallowed below and cost
                # us this article.
                OUTPUT_DIR = os.path.abspath(
                    os.path.join(os.getcwd(), os.pardir, "Output")
                )
                os.makedirs(OUTPUT_DIR, exist_ok=True)

                # Name of the output file.
                name_output = os.path.join(
                    OUTPUT_DIR, name_prefix + "_output_notice_unmatchedURLs.csv"
                )

                # Append when the file exists, otherwise create it with header.
                write_csv(
                    name_output, check_mode(name_output), fieldnames, article_data
                )

                # Record the link only after the article has been stored.
                write_csv(
                    activity_records_path,
                    check_mode(activity_records_path),
                    ["link"],
                    [{"link": link}],
                )

            elif status == 403:
                print("Status is 403")
                print("Starting 3-minute wait")

                # Back off for 3 minutes. The request is not retried here;
                # the link stays out of the log, so a later run picks it up.
                for i in range(1, 4):
                    time.sleep(60)
                    print(f"{i} minutes have passed")
                print("Restarting the scraper based on backup")

            elif status == 307:
                print(link)
                print("Too many redirects")

            else:
                # Previously any other status was dropped without a trace.
                print(f"Unexpected status {status} for: {link}")

        except Exception as e:
            # Best effort per row: report the failure and let the caller move
            # on to the next link.
            print(f'Check: \n{link}')
            print(f"Error: {e}")

    print("step04_GETnotice_unmatchedURLs, Crawling complete")

def main_unmatched_URLs_loop(input_csv):
    """Scrape every row of ``input_csv`` one after another.

    Each row of the CSV is passed to ``scraper_unmatchedURLS`` together with
    ``input_csv`` as its ``output_filename``. When the file does not exist a
    message is printed and nothing else happens.
    """
    if not os.path.exists(input_csv):
        print("Input CSV file not found.")
        return

    frame = pd.read_csv(input_csv)
    for _, record in frame.iterrows():
        scraper_unmatchedURLS(record, input_csv)
