<a href="https://colab.research.google.com/github/shantanu2383/Data-Science-Tools/blob/main/Article_URL_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#WAYBACK SCRAPER
#Function looks up the article in the Internet Archive's Wayback Machine and scrapes the article text from the closest archived snapshot, if one is available


import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def scrape_article_from_wayback_machine(url, timeout=15):
    """Return the article text from the closest Wayback Machine snapshot of *url*.

    Asks the Wayback Machine availability API for the closest archived
    snapshot of ``url``, downloads that snapshot and extracts the text of its
    first ``<article>`` element.

    Args:
        url: Address of the original article page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The text of the ``<article>`` element on success. On failure, one of
        these messages is returned instead of raising:

        * ``"Failed to query Wayback Machine API"``
        * ``"Snapshot not found in Wayback Machine"``
        * ``"Failed to retrieve snapshot from Wayback Machine"``
        * ``"Article element not found in the snapshot"``
    """
    retries = Retry(total=5, backoff_factor=0.3, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)

    # The context manager closes the session's connection pool on every path.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        try:
            # Passing the URL through ``params`` percent-encodes it, so query
            # strings or fragments inside ``url`` cannot corrupt the API call.
            response = session.get(
                "http://archive.org/wayback/available",
                params={"url": url},
                timeout=timeout,
            )
        except requests.exceptions.RequestException:
            # Covers timeouts, connection errors and exhausted retries, which
            # would otherwise escape as exceptions instead of a message.
            return "Failed to query Wayback Machine API"

        if response.status_code != 200:
            return "Failed to query Wayback Machine API"

        try:
            json_response = response.json()
        except ValueError:
            # A body that is not valid JSON counts as a failed API query.
            return "Failed to query Wayback Machine API"

        snapshot_url = json_response.get("archived_snapshots", {}).get("closest", {}).get("url")
        if not snapshot_url:
            return "Snapshot not found in Wayback Machine"

        try:
            snapshot_response = session.get(snapshot_url, timeout=timeout)
        except requests.exceptions.RequestException:
            return "Failed to retrieve snapshot from Wayback Machine"

        if snapshot_response.status_code != 200:
            return "Failed to retrieve snapshot from Wayback Machine"

        soup = BeautifulSoup(snapshot_response.content, "html.parser")
        article_element = soup.find("article")
        if not article_element:
            return "Article element not found in the snapshot"

        return article_element.get_text()

In [None]:
#URL SCRAPER
#Function fetches the article's live webpage directly and scrapes the text of its <article> element; returns an empty string if the page cannot be fetched or has no <article> element

import requests
from bs4 import BeautifulSoup

def scrape_article_text(url, timeout=15):
    """Fetch *url* directly and return the text of its ``<article>`` element.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The text of the first ``<article>`` element, or an empty string when
        the request fails (the failure is printed) or the page has no
        ``<article>`` element.
    """
    try:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Timeout is a RequestException subclass; only the printed label differs.
        label = "Timeout" if isinstance(e, requests.exceptions.Timeout) else "Error"
        print(f"{label} scraping {url}: {e}")
        return ""

    article_node = BeautifulSoup(page.content, "html.parser").find("article")
    return article_node.get_text() if article_node is not None else ""