In [3]:
import csv  # For reading and writing CSV files
import os  # For file system operations like checking paths
import requests  # For making HTTP requests
from bs4 import BeautifulSoup  # For HTML parsing
import time  # For pauses between requests


# Spider class to handle the crawling of news articles
class MySpider:
    """Crawl the listing pages named in a CSV and collect their article metadata.

    For every row of ``start_urls_csv`` the ``URL`` column is fetched, each
    ``<article>`` element on the page is reduced to title / link / date, and
    the rows are appended to an output CSV.  Successfully processed URLs are
    recorded in a separate log CSV so that an interrupted run can be resumed
    without fetching the same page twice.

    Args:
        start_urls_csv: Path to a CSV file with a ``URL`` column.
        request_timeout: Seconds to wait for a response before giving up on
            a request (passed to ``requests.get``).
        retry_wait_minutes: Whole minutes to pause after an HTTP 403.
        forbidden_retries: How many times a URL is re-requested after a 403
            before it is skipped for this run.
    """

    def __init__(
        self,
        start_urls_csv,
        *,
        request_timeout=30,
        retry_wait_minutes=3,
        forbidden_retries=1,
    ):
        self.start_urls_csv = start_urls_csv
        self.request_timeout = request_timeout
        self.retry_wait_minutes = retry_wait_minutes
        self.forbidden_retries = forbidden_retries
        self.visited_urls_file = self._build_visited_urls_file()

    def _build_visited_urls_file(self, use_output_suffix=False):
        """Return the absolute path of the resume log or of the output CSV.

        Both files live in ``../PreprocessedOutput`` (relative to the current
        working directory) and share a prefix derived from the start CSV name.

        Args:
            use_output_suffix: When true, return the path of the article
                output file (``*_urls_processed.csv``); otherwise the path of
                the visited-URL log (``*_urls_processed_log.csv``).

        Returns:
            The absolute file path as a string.
        """
        # Base filename of the start URLs CSV, without directory or extension
        base_filename = os.path.splitext(os.path.basename(self.start_urls_csv))[0]

        # Keep only the text before the first underscore.
        # NOTE(review): this drops more than the date for multi-word names,
        # e.g. "mafia_nigeriana_2023-01-01_2024-01-01" becomes "mafia".
        # Left unchanged because other steps may rely on these filenames.
        base_filename_without_date = base_filename.split("_")[0]

        if use_output_suffix:
            filename_suffix = "_urls_processed.csv"
        else:
            filename_suffix = "_urls_processed_log.csv"

        visited_urls_filename = f"{base_filename_without_date}{filename_suffix}"
        return os.path.abspath(
            os.path.join('..', 'PreprocessedOutput', visited_urls_filename)
        )

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _load_visited_urls(self):
        """Return the set of URLs recorded in the resume log (empty on a first run)."""
        try:
            # The log is written as UTF-8 by parse(), so read it the same way.
            with open(self.visited_urls_file, newline="", encoding="utf-8") as f:
                return {row[0] for row in csv.reader(f) if row}
        except FileNotFoundError:
            return set()

    def _wait_after_forbidden(self):
        """Pause for ``retry_wait_minutes``, reporting progress once per minute."""
        print(f'start the {self.retry_wait_minutes} minute wait')
        for minute in range(1, self.retry_wait_minutes + 1):
            time.sleep(60)
            print(f'{minute} minutes have passed')

    def _fetch(self, url):
        """Request ``url`` and return the response, or ``None`` if it must be skipped.

        A 403 triggers a pause and then a retry, up to ``forbidden_retries``
        extra attempts.  Network errors and other non-200 statuses are
        reported and skipped; nothing is logged as visited for them, so a
        later run will try the URL again.
        """
        for _ in range(self.forbidden_retries + 1):
            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(url, timeout=self.request_timeout)
            except requests.RequestException as exc:
                print(f'request failed for {url}: {exc}')
                return None

            if response.status_code == 200:
                return response
            if response.status_code != 403:
                print(f'status is {response.status_code}, skipping {url}')
                return None

            print('status is 403')
            self._wait_after_forbidden()

        print(f'still forbidden after retrying, skipping {url}')
        return None

    # Method to start the requests
    def start_requests(self):
        """Fetch and parse every not-yet-visited URL listed in the start CSV.

        Raises:
            KeyError: If the start CSV has no ``URL`` column.
        """
        # An existing log means this is a continuation of an earlier run.
        visited_urls = self._load_visited_urls()

        with open(self.start_urls_csv, "r", newline="") as file:
            for iteration, row in enumerate(csv.DictReader(file), start=1):
                print(f" step02_URLprocessor, iteration number {iteration}")

                url = (row["URL"] or "").strip()
                if not url:
                    # A blank cell cannot be requested; skip it instead of crashing.
                    print('empty URL, skipping row')
                    continue
                if url in visited_urls:
                    continue

                response = self._fetch(url)
                if response is None:
                    continue

                self.parse(response, requested_url=url)
                # Only mark as visited once the page was actually processed.
                visited_urls.add(url)

        print(f'Check the result: {self._build_visited_urls_file(use_output_suffix=True)}')
        print('************** End **************')

    # Method to handle the parsing of the response
    def parse(self, response, requested_url=None):
        """Extract article rows from ``response`` and append them to the output CSV.

        Args:
            response: A ``requests`` response whose body is an HTML page.
            requested_url: The URL exactly as it appeared in the start CSV.
                It is what gets written to the resume log, because
                ``response.url`` can differ from the CSV value (for example
                after a redirect) and would then never match on resume.
                Falls back to ``response.url`` when omitted.
        """
        soup = BeautifulSoup(response.content, "html.parser")

        article_data = []
        for article in soup.find_all("article"):
            heading = article.find("h1")
            anchor = heading.find("a") if heading is not None else None
            if anchor is None:
                # Not every <article> need be a result entry; without a
                # title link there is nothing to record.
                continue

            # The publication date is the text of the last link in <aside>.
            aside = article.find("aside")
            aside_links = aside.find_all("a") if aside is not None else []
            date = (
                aside_links[-1].get_text(separator=" ", strip=True)
                if aside_links
                else ""
            )

            article_data.append(
                {
                    "title": anchor.get_text(separator=" ", strip=True),
                    "link": anchor.get("href", ""),
                    "date": date,
                    "page_url": response.url,
                }
            )

        output_path = self._build_visited_urls_file(use_output_suffix=True)
        self._ensure_parent_dir(output_path)
        # Decide about the header before opening, while the size is still meaningful.
        write_header = (
            not os.path.exists(output_path) or os.path.getsize(output_path) == 0
        )
        with open(output_path, "a", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(
                file, fieldnames=["title", "link", "date", "page_url"]
            )
            if write_header:
                writer.writeheader()
            writer.writerows(article_data)

        # Record the page as done only after its rows are safely written.
        self._ensure_parent_dir(self.visited_urls_file)
        with open(self.visited_urls_file, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow([requested_url or response.url])


In [4]:
# if __name__ == "__main__":
#     # Create an instance of MySpider and start the scraping process
#     spider = MySpider(start_urls_csv=r"d:\Dropbox\Anzony\repubblica\LaRepubblicaDataHarvest_Es\PreprocessedOutput\mafia_nigeriana_2023-01-01_2024-01-01.csv")
#     spider.start_requests()


 step02_URLprocessor, iteration number 1
 step02_URLprocessor, iteration number 2
 step02_URLprocessor, iteration number 3
 step02_URLprocessor, iteration number 4
 step02_URLprocessor, iteration number 5
 step02_URLprocessor, iteration number 6
 step02_URLprocessor, iteration number 7
 step02_URLprocessor, iteration number 8
 step02_URLprocessor, iteration number 9
 step02_URLprocessor, iteration number 10
 step02_URLprocessor, iteration number 11
 step02_URLprocessor, iteration number 12
 step02_URLprocessor, iteration number 13
 step02_URLprocessor, iteration number 14
 step02_URLprocessor, iteration number 15
 step02_URLprocessor, iteration number 16
 step02_URLprocessor, iteration number 17
 step02_URLprocessor, iteration number 18
 step02_URLprocessor, iteration number 19
 step02_URLprocessor, iteration number 20
 step02_URLprocessor, iteration number 21
 step02_URLprocessor, iteration number 22
 step02_URLprocessor, iteration number 23
 step02_URLprocessor, iteration number 24
 