In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# change the working directory to the Drive root
%cd /content/drive/My\ Drive/Colab\ Notebooks/nlp

/content/drive/My Drive/Colab Notebooks/nlp


# Parsing PDF and text extraction

In [None]:
# Install libraries
!pip install pandas PyPDF2 python-dotenv requests PyMuPDF langchain langchain-community pdf2image pillow pytesseract
!apt-get install tesseract-ocr

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-c

In [None]:
import os
import PyPDF2
import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# Ensure Tesseract is in your system's PATH or specify the executable path
# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'

def crop_image(element, pageObj):
    """
    Crop an image from a PDF page based on the element's coordinates.
    """
    # Get the coordinates to crop the image from the PDF
    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]

    # Crop the page using coordinates (left, bottom, right, top)
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)

    # Save the cropped page to a new PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    cropped_pdf_path = 'cropped_image.pdf'

    with open(cropped_pdf_path, 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

    return cropped_pdf_path

def convert_to_images(input_file):
    """
    Convert a PDF file to images.
    """
    images = convert_from_path(input_file, dpi=500)
    return images

def image_to_text(image_path):
    """
    Extract text from an image using OCR.
    """
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img)
    return text

def process_pdf(file_path, output_dir):
    """
    Process a PDF file to extract text and images.
    """
    try:
        with open(file_path, 'rb') as pdfFileObj:
            pdfReaded = PyPDF2.PdfReader(pdfFileObj)
            doc = fitz.open(file_path)
            n = doc.page_count
            doc_content = ""

            for i in range(n):
                page_n = doc.load_page(i)
                page_content = page_n.get_text("blocks")

                for element in page_content:
                    if element[6] == 0:  # If the element is text
                        doc_content += element[4]
                    else:  # If the element is an image
                        pageObj = pdfReaded.pages[i]
                        cropped_pdf_path = crop_image(element, pageObj)
                        images = convert_to_images(cropped_pdf_path)
                        if images:
                            images[0].save('temp_image.png', "PNG")
                            image_text = image_to_text('temp_image.png')
                            doc_content += image_text

                doc_content += "\n"

            # Save the extracted content to a text file
            txt_file = os.path.join(output_dir, os.path.basename(file_path).replace('.pdf', '.txt'))
            with open(txt_file, 'w', encoding='utf-8') as file:
                file.write(doc_content)
            print(f"Saved file: {txt_file}")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

def main():
    data_path = "./data/raw/pdf_data"  # Path to the folder where PDF files are available
    output_dir = "./data/scraped/scraped_pdf_text_data"

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Process each PDF file in the directory
    for file in os.listdir(data_path):
        if file.endswith(".pdf"):
            file_path = os.path.join(data_path, file)
            print(f"Processing file: {file_path}")
            process_pdf(file_path, output_dir)

if __name__ == "__main__":
    main()


Processing file: ./data/raw/pdf_data/18398_Annual_Comprehensive_Financial_Report_December_31,_2021.pdf
Saved file: ./data/scraped/scraped_pdf_text_data/18398_Annual_Comprehensive_Financial_Report_December_31,_2021.txt
Processing file: ./data/raw/pdf_data/14147_Pittsburgh_CAR_PDF_Copy.pdf
Saved file: ./data/scraped/scraped_pdf_text_data/14147_Pittsburgh_CAR_PDF_Copy.txt
Processing file: ./data/raw/pdf_data/24770_Pittsburgh,_City_of_ACFR_FINAL_2023.pdf
Saved file: ./data/scraped/scraped_pdf_text_data/24770_Pittsburgh,_City_of_ACFR_FINAL_2023.txt
Processing file: ./data/raw/pdf_data/9623_ISP_Tax_Regulations.pdf
Saved file: ./data/scraped/scraped_pdf_text_data/9623_ISP_Tax_Regulations.txt
Processing file: ./data/raw/pdf_data/23255_2024_Operating_Budget.pdf
Saved file: ./data/scraped/scraped_pdf_text_data/23255_2024_Operating_Budget.txt
Processing file: ./data/raw/pdf_data/9624_Local_Services_Tax_Regulations.pdf
Saved file: ./data/scraped/scraped_pdf_text_data/9624_Local_Services_Tax_Regula

# Scraping Webpage and Check Duplicate Sublink

Parsing static webpages from the collected sublink file. It is a part of the web crawling process.
Based on the URLs of the collected sublink URLs, this script crawls the webpages and extracts ALL the text data (if opening the sublink URL does not timeout).

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import warnings
from urllib.parse import urljoin, urlparse, urlunparse
from urllib3.exceptions import InsecureRequestWarning
from tqdm import tqdm

# Suppress warnings from unverified HTTPS requests
warnings.filterwarnings('ignore', category=InsecureRequestWarning)

# Suppress XMLParsedAsHTMLWarning if parsing XML with HTML parser
from bs4 import XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# User agents for simulating browser requests
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
]

# Initialize a session for persistent connection
session = requests.Session()

# Function to fetch the page text with a retry mechanism
def fetch_page_text(url, retries=3, timeout=5):
    """Fetch page text and return the parsed BeautifulSoup object."""
    for attempt in range(retries):
        try:
            headers = {'User-Agent': random.choice(user_agents)}
            response = session.get(url, timeout=timeout, headers=headers, verify=False)
            response.raise_for_status()  # Raise an error for bad responses

            soup = BeautifulSoup(response.content, 'lxml')
            return soup.get_text(separator='\n', strip=True), soup

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            time.sleep(1)  # Wait before retrying

    return None, None  # Return None if all attempts fail

# Function to extract all sublinks from the soup object
def extract_sublinks(soup, base_url):
    """Extract and return all sublinks from the HTML soup."""
    sublinks = []
    for link in soup.find_all('a', href=True):
        full_url = urljoin(base_url, link['href'])
        sublinks.append(full_url)
    return sublinks

# Function to save crawled text data to .txt files
def save_crawled_text(url, text, index, folder='./data/scraped/scraped_web_text_data/'):
    """Save crawled text data to individual .txt files."""
    cleaned_text = text.replace('\n', ' ')  # Clean up the text by removing newlines
    output_file = f"{folder}{index}.txt"

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

# Function to save sublinks to a CSV file
def save_sublinks_to_csv(sublinks_data, output_csv):
    """Save sublink data to a CSV file."""
    df = pd.DataFrame(sublinks_data, columns=['Parent URL', 'Sublink'])
    df.to_csv(output_csv, index=False)

# Function to normalize a URL by removing the fragment
def normalize_url(url):
    """Remove the fragment from a URL."""
    parsed_url = urlparse(url)
    return urlunparse(parsed_url._replace(fragment=''))

# Function to check and remove duplicate URLs based on the base URL
def remove_duplicate_urls(file_path, column_name):
    """Remove duplicate URLs from the specified column in the CSV file."""
    data = pd.read_csv(file_path)
    data['normalized_url'] = data[column_name].apply(normalize_url)
    data = data.dropna(subset=['normalized_url'])  # Remove rows with NaN URLs
    data_cleaned = data.drop_duplicates(subset='normalized_url', keep='first')  # Remove duplicates
    data_cleaned = data_cleaned.reset_index(drop=False)  # Keep original index
    data_cleaned['new_index'] = range(len(data_cleaned))  # New index column
    data_cleaned = data_cleaned.drop(columns=['normalized_url'])  # Drop temporary column

    output_file = file_path.replace('.csv', '_filtered.csv')
    data_cleaned.to_csv(output_file, index=False)
    print(f"Duplicates removed. Cleaned data saved to {output_file}")

# Function to crawl through the provided URLs and fetch data
def crawl_urls(url_list):
    """Crawl the list of URLs and return crawled data and sublinks."""
    results = {}
    sublinks_data = []  # List to store (parent_url, sublink) pairs

    for index, url in tqdm(enumerate(url_list), total=len(url_list)):
        print(f"Fetching: {url}")
        text, soup = fetch_page_text(url)
        if text and soup:
            results[url] = text  # Store the crawled text
            sublinks = extract_sublinks(soup, url)  # Extract sublinks
            sublinks_data.extend((url, sublink) for sublink in sublinks)  # Collect sublink pairs
            save_crawled_text(url, text, index)  # Save the crawled text immediately
        else:
            print(f"Failed to parse URL at index: {index}, URL: {url}")

    return results, sublinks_data

# Function to crawl sublinks
def crawl_and_save_sublinks(urls, indexes):
    """Crawl sublinks and save results."""
    for index, url in tqdm(enumerate(urls), total=len(urls)):
        if url.startswith('http'):
            text, _ = fetch_page_text(url)
            if text:
                save_crawled_text(url, text, indexes[index], folder='./data/scraped/scraped_sublink_text_data/')
            else:
                print(f"Failed to parse sublink at index {index}, URL: {url}")

if __name__ == "__main__":
    # Read the CSV file with main URLs
    main_file_path = './data/raw/csv_data/data_source.csv'
    data = pd.read_csv(main_file_path)

    # Extract non-empty URLs from the 'Source URL' column
    main_urls = data[data['Select'] == 'Webpage']['Source URL'].dropna().unique()

    # Start crawling the main URLs
    crawled_data, sublinks_data = crawl_urls(main_urls)

    # Save sublinks to CSV
    output_csv = './data/scraped/parentlink_file_name_url_mapping.csv'
    save_sublinks_to_csv(sublinks_data, output_csv)

    # Remove duplicates from the sublink CSV
    remove_duplicate_urls(output_csv, 'Sublink')

    # Read the CSV file with sublinks for further crawling
    sublink_file_path = './data/scraped/sublink_file_name_url_mapping_filtered.csv'
    sublink_data = pd.read_csv(sublink_file_path)

    # Extract non-empty URLs from the 'Sublink' column
    sublinks = sublink_data['Value'].dropna().unique()
    indexes = sublink_data['new_index']

    # Start crawling the sublinks
    crawl_and_save_sublinks(sublinks, indexes)

    print("Crawling complete!")


  0%|          | 0/156 [00:00<?, ?it/s]

Fetching: https://en.wikipedia.org/wiki/Pittsburgh


  1%|          | 1/156 [00:03<08:52,  3.43s/it]

Fetching: https://en.wikipedia.org/wiki/History_of_Pittsburgh


  1%|▏         | 2/156 [00:04<04:37,  1.80s/it]

Fetching: https://pittsburghpa.gov/pittsburgh/pgh-about
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/pgh-about
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/pgh-about
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/pgh-about


  2%|▏         | 3/156 [00:08<07:23,  2.90s/it]

Failed to parse URL at index: 2, URL: https://pittsburghpa.gov/pittsburgh/pgh-about
Fetching: https://www.britannica.com/place/Pittsburgh


  3%|▎         | 4/156 [00:09<05:23,  2.13s/it]

Fetching: https://pittsburghpa.gov/events/index.html
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/events/index.html
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/events/index.html
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/events/index.html


  3%|▎         | 5/156 [00:13<06:50,  2.72s/it]

Failed to parse URL at index: 4, URL: https://pittsburghpa.gov/events/index.html
Fetching: https://www.visitpittsburgh.com/blog/move-in-day-pittsburgh-college-guide/


  4%|▍         | 6/156 [00:14<05:45,  2.30s/it]

Fetching: https://pittsburghpa.gov/mayor/pghmayors
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/mayor/pghmayors
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/mayor/pghmayors
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/mayor/pghmayors


  4%|▍         | 7/156 [00:17<06:40,  2.69s/it]

Failed to parse URL at index: 6, URL: https://pittsburghpa.gov/mayor/pghmayors
Fetching: https://en.wikipedia.org/wiki/Economy_of_Pittsburgh


  5%|▌         | 8/156 [00:18<05:01,  2.04s/it]

Fetching: https://en.wikipedia.org/wiki/Greater_Pittsburgh#


  6%|▌         | 9/156 [00:19<04:10,  1.70s/it]

Fetching: https://en.wikipedia.org/wiki/Government_of_Pittsburgh


  6%|▋         | 10/156 [00:20<03:20,  1.38s/it]

Fetching: https://en.wikipedia.org/wiki/Transportation_in_Pittsburgh


  7%|▋         | 11/156 [00:20<02:40,  1.11s/it]

Fetching: https://en.wikipedia.org/wiki/List_of_colleges_and_universities_in_Pittsburgh


  8%|▊         | 12/156 [00:21<02:07,  1.13it/s]

Fetching: https://en.wikipedia.org/wiki/Timeline_of_Pittsburgh


  8%|▊         | 13/156 [00:21<01:51,  1.28it/s]

Fetching: https://en.wikipedia.org/wiki/List_of_people_from_Pittsburgh


  9%|▉         | 14/156 [00:22<02:04,  1.14it/s]

Fetching: https://en.wikipedia.org/wiki/List_of_corporations_in_Pittsburgh


 10%|▉         | 15/156 [00:23<01:51,  1.26it/s]

Fetching: https://pittsburghpa.gov/pittsburgh/pgh-sports
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/pgh-sports
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/pgh-sports
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/pgh-sports


 10%|█         | 16/156 [00:26<03:45,  1.61s/it]

Failed to parse URL at index: 15, URL: https://pittsburghpa.gov/pittsburgh/pgh-sports
Fetching: https://pittsburghpa.gov/pittsburgh/cultural-activities
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/cultural-activities
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/cultural-activities
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/pittsburgh/cultural-activities


 11%|█         | 17/156 [00:30<05:02,  2.17s/it]

Failed to parse URL at index: 16, URL: https://pittsburghpa.gov/pittsburgh/cultural-activities
Fetching: https://www.cmu.edu/about/


 12%|█▏        | 18/156 [00:31<04:07,  1.79s/it]

Fetching: https://www.cmu.edu/global/


 12%|█▏        | 19/156 [00:32<03:27,  1.51s/it]

Fetching: https://en.wikipedia.org/wiki/Carnegie_Mellon_University


 13%|█▎        | 20/156 [00:32<02:46,  1.23s/it]

Fetching: https://www.cmu.edu/student-admission/index.html


 13%|█▎        | 21/156 [00:33<02:13,  1.01it/s]

Fetching: https://www.cmu.edu/admission/


 14%|█▍        | 22/156 [00:33<02:04,  1.07it/s]

Fetching: https://www.cmu.edu/graduate/prospective/index.html


 15%|█▍        | 23/156 [00:34<01:42,  1.30it/s]

Fetching: https://www.cmu.edu/leadership/


 15%|█▌        | 24/156 [00:34<01:30,  1.46it/s]

Fetching: https://www.cmu.edu/strategic-plan/


 16%|█▌        | 25/156 [00:35<01:20,  1.63it/s]

Fetching: https://www.cmu.edu/strategic-plan/about/vision-mission.html


 17%|█▋        | 26/156 [00:35<01:16,  1.71it/s]

Fetching: https://www.cmu.edu/strategic-plan/about/values.html


 17%|█▋        | 27/156 [00:36<01:10,  1.82it/s]

Fetching: https://www.cmu.edu/diversity/


 18%|█▊        | 28/156 [00:36<01:13,  1.74it/s]

Fetching: https://en.wikipedia.org/wiki/Carnegie_Mellon_University_traditions


 19%|█▊        | 29/156 [00:37<01:25,  1.49it/s]

Fetching: https://www.cmu.edu/about/rankings.html


 19%|█▉        | 30/156 [00:38<01:16,  1.65it/s]

Fetching: https://www.cmu.edu/about/awards.html


 20%|█▉        | 31/156 [00:38<01:15,  1.66it/s]

Fetching: https://www.cmu.edu/cfa/music/concerts-events/opera-events.html


 21%|██        | 32/156 [00:39<01:13,  1.69it/s]

Fetching: https://www.cmu.edu/cfa/music/concerts-events/index.html


 21%|██        | 33/156 [00:39<01:10,  1.75it/s]

Fetching: https://events.cmu.edu/all


 22%|██▏       | 34/156 [00:40<01:14,  1.63it/s]

Fetching: https://pittsburghmusicals.com/season/


 22%|██▏       | 35/156 [00:42<01:43,  1.17it/s]

Fetching: https://www.chambermusicpittsburgh.org/2024-2025-mainstage-live/


 23%|██▎       | 36/156 [00:43<02:18,  1.16s/it]

Fetching: https://makemusicpittsburgh.org/


 24%|██▎       | 37/156 [00:44<02:00,  1.01s/it]

Fetching: https://en.wikipedia.org/wiki/Pittsburgh_Symphony_Orchestra


 24%|██▍       | 38/156 [00:45<01:45,  1.12it/s]

Fetching: https://www.pittsburghsymphony.org/feed?format=rss


 25%|██▌       | 39/156 [00:49<03:37,  1.86s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/web/about-landing/melia-p-tourangeau-president


 26%|██▌       | 40/156 [00:50<03:18,  1.71s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/biographies/pso-conductors/honeck-manfred


 26%|██▋       | 41/156 [00:51<02:46,  1.45s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/web/musicians


 27%|██▋       | 42/156 [00:52<02:35,  1.36s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/web/community-landing/learning-programs


 28%|██▊       | 43/156 [00:54<02:35,  1.38s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/web/subscriptions/why-subscribe-24-25/24-25-fiddlesticks-musical-exploration


 28%|██▊       | 44/156 [00:55<02:26,  1.31s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/web/community-landing/learning-programs/schooltime-concerts


 29%|██▉       | 45/156 [00:56<02:27,  1.33s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/web/community-landing/learning-programs/schooltime-concerts/digital-schooltime


 29%|██▉       | 46/156 [00:57<02:14,  1.23s/it]

Fetching: https://www.pittsburghsymphony.org/pso_home/web/give-landing/corporate-partnerships/dining-partners


 30%|███       | 47/156 [00:59<02:20,  1.29s/it]

Fetching: https://www.bandsintown.com/c/pittsburgh-pa?came_from=253&utm_medium=web&utm_source=city_page&utm_campaign=top_event&sort_by_filter=Number+of+RSVPs&concerts=true


 31%|███       | 48/156 [01:00<02:11,  1.22s/it]

Fetching: https://www.visitpittsburgh.com/blog/pittsburgh-music-venues/


 31%|███▏      | 49/156 [01:01<02:25,  1.36s/it]

Fetching: https://en.wikipedia.org/wiki/Pittsburgh_Opera


 32%|███▏      | 50/156 [01:02<02:00,  1.14s/it]

Fetching: https://pittsburghopera.org/about


 33%|███▎      | 51/156 [01:03<01:47,  1.03s/it]

Fetching: https://pittsburghopera.org/about/mission-history


 33%|███▎      | 52/156 [01:03<01:33,  1.12it/s]

Fetching: https://pittsburghopera.org/resident-artists/2024-25resident-artists


 34%|███▍      | 53/156 [01:04<01:24,  1.21it/s]

Fetching: https://pittsburghopera.org/resident-artists/faculty-administration/


 35%|███▍      | 54/156 [01:05<01:20,  1.26it/s]

Fetching: https://pittsburghopera.org/resident-artists/history-alumni/


 35%|███▌      | 55/156 [01:05<01:15,  1.35it/s]

Fetching: https://pittsburghopera.org/our-team/board-of-directors


 36%|███▌      | 56/156 [01:06<01:10,  1.42it/s]

Fetching: https://pittsburghopera.org/our-team/staff


 37%|███▋      | 57/156 [01:06<01:08,  1.45it/s]

Fetching: https://pittsburghopera.org/our-team/orchestra


 37%|███▋      | 58/156 [01:07<01:08,  1.42it/s]

Fetching: https://pittsburghopera.org/our-team/chorus


 38%|███▊      | 59/156 [01:08<01:04,  1.51it/s]

Fetching: https://pittsburghopera.org/facilities/pittsburgh-opera-headquarters/


 38%|███▊      | 60/156 [01:09<01:09,  1.38it/s]

Fetching: https://pittsburghopera.org/support/foundation-support


 39%|███▉      | 61/156 [01:09<01:07,  1.40it/s]

Fetching: https://pittsburghopera.org/support/government-support


 40%|███▉      | 62/156 [01:10<01:03,  1.48it/s]

Fetching: https://pittsburghopera.org/about/inclusion-diversity-equity-accessibility-idea/


 40%|████      | 63/156 [01:11<01:08,  1.35it/s]

Fetching: https://pittsburghopera.org/education/mobile-app


 41%|████      | 64/156 [01:11<01:03,  1.44it/s]

Fetching: https://pittsburghopera.org/education/bravo-academy


 42%|████▏     | 65/156 [01:12<01:03,  1.44it/s]

Fetching: https://pittsburghopera.org/season


 42%|████▏     | 66/156 [01:13<01:00,  1.48it/s]

Fetching: https://pittsburghopera.org/season/tosca


 43%|████▎     | 67/156 [01:14<01:05,  1.36it/s]

Fetching: https://pittsburghopera.org/season/cavalleria-rusticana-pagliacci


 44%|████▎     | 68/156 [01:14<01:01,  1.43it/s]

Fetching: https://pittsburghopera.org/season/armida


 44%|████▍     | 69/156 [01:15<01:01,  1.41it/s]

Fetching: https://pittsburghopera.org/season/madama-butterfly


 45%|████▍     | 70/156 [01:16<00:59,  1.45it/s]

Fetching: https://pittsburghopera.org/season/woman-with-eyes-closed


 46%|████▌     | 71/156 [01:16<00:59,  1.42it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1727755200000&end=1730415540000&hsLang=en


 46%|████▌     | 72/156 [01:17<01:00,  1.40it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1730433600000&end=1733011140000&hsLang=en


 47%|████▋     | 73/156 [01:18<00:53,  1.55it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1733029200000&end=1735689540000&hsLang=en


 47%|████▋     | 74/156 [01:18<00:48,  1.69it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1735707600000&end=1738367940000&hsLang=en


 48%|████▊     | 75/156 [01:19<00:50,  1.60it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1738386000000&end=1740787140000&hsLang=en


 49%|████▊     | 76/156 [01:19<00:46,  1.70it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1740805200000&end=1743461940000&hsLang=en


 49%|████▉     | 77/156 [01:20<00:44,  1.76it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1743480000000&end=1746053940000&hsLang=en


 50%|█████     | 78/156 [01:20<00:43,  1.77it/s]

Fetching: https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1746072000000&end=1748732340000&hsLang=en


 51%|█████     | 79/156 [01:21<00:43,  1.78it/s]

Fetching: https://pittsburghopera.org/season/special-events


 51%|█████▏    | 80/156 [01:22<00:45,  1.65it/s]

Fetching: https://pittsburghopera.org/education/educators


 52%|█████▏    | 81/156 [01:22<00:41,  1.82it/s]

Fetching: https://pittsburghopera.org/education/students


 53%|█████▎    | 82/156 [01:23<00:47,  1.56it/s]

Fetching: https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d


 53%|█████▎    | 83/156 [01:32<03:46,  3.11s/it]

Fetching: https://downtownpittsburgh.com/events/?n=1&y=2025&cat=0


 54%|█████▍    | 84/156 [01:38<04:43,  3.94s/it]

Fetching: https://downtownpittsburgh.com/events/?n=12&y=2024&cat=0
Attempt 1 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)
Attempt 2 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)
Attempt 3 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)


 54%|█████▍    | 85/156 [01:56<09:49,  8.30s/it]

Failed to parse URL at index: 84, URL: https://downtownpittsburgh.com/events/?n=12&y=2024&cat=0
Fetching: https://downtownpittsburgh.com/events/?n=11&y=2024&cat=0
Attempt 1 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)
Attempt 2 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)
Attempt 3 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)


 55%|█████▌    | 86/156 [02:15<13:20, 11.43s/it]

Failed to parse URL at index: 85, URL: https://downtownpittsburgh.com/events/?n=11&y=2024&cat=0
Fetching: https://downtownpittsburgh.com/events/?n=10&y=2024&cat=0
Attempt 1 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)
Attempt 2 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)
Attempt 3 failed: HTTPSConnectionPool(host='downtownpittsburgh.com', port=443): Read timed out. (read timeout=5)


 56%|█████▌    | 87/156 [02:33<15:38, 13.60s/it]

Failed to parse URL at index: 86, URL: https://downtownpittsburgh.com/events/?n=10&y=2024&cat=0
Fetching: https://pittsburgh.events/october/


 56%|█████▋    | 88/156 [02:36<11:34, 10.21s/it]

Fetching: https://pittsburgh.events/november/


 57%|█████▋    | 89/156 [02:38<08:39,  7.75s/it]

Fetching: https://pittsburgh.events/december/


 58%|█████▊    | 90/156 [02:40<06:34,  5.97s/it]

Fetching: https://pittsburgh.events/january/


 58%|█████▊    | 91/156 [02:41<05:05,  4.70s/it]

Fetching: https://pittsburgh.events/february/


 59%|█████▉    | 92/156 [02:43<04:03,  3.80s/it]

Fetching: https://pittsburgh.events/march/


 60%|█████▉    | 93/156 [02:45<03:22,  3.21s/it]

Fetching: https://pittsburgh.events/april/


 60%|██████    | 94/156 [02:47<02:53,  2.80s/it]

Fetching: https://pittsburgh.events/may/


 61%|██████    | 95/156 [02:49<02:33,  2.52s/it]

Fetching: https://pittsburgh.events/june/


 62%|██████▏   | 96/156 [02:50<02:20,  2.34s/it]

Fetching: https://pittsburgh.events/july/


 62%|██████▏   | 97/156 [02:52<02:05,  2.13s/it]

Fetching: https://pittsburgh.events/august/


 63%|██████▎   | 98/156 [02:54<02:05,  2.17s/it]

Fetching: https://pittsburgh.events/september/


 63%|██████▎   | 99/156 [02:56<02:00,  2.11s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/


 64%|██████▍   | 100/156 [02:58<01:54,  2.04s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-steelers/


 65%|██████▍   | 101/156 [03:00<01:40,  1.83s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-pirates/


 65%|██████▌   | 102/156 [03:01<01:28,  1.64s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-penguins/


 66%|██████▌   | 103/156 [03:02<01:21,  1.53s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/college-sports/


 67%|██████▋   | 104/156 [03:03<01:14,  1.43s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/other-sports-teams/


 67%|██████▋   | 105/156 [03:04<01:08,  1.34s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/terrible-towel/


 68%|██████▊   | 106/156 [03:06<01:05,  1.30s/it]

Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-riverhounds/


 69%|██████▊   | 107/156 [03:07<01:01,  1.26s/it]

Fetching: https://www.visitpittsburgh.com/blog/steelers-hall-of-honor-museum/


 69%|██████▉   | 108/156 [03:08<00:55,  1.16s/it]

Fetching: https://www.visitpittsburgh.com/blog/guide-to-acrisure-stadium-for-steelers-fans/


 70%|██████▉   | 109/156 [03:09<00:51,  1.10s/it]

Fetching: https://www.visitpittsburgh.com/blog/what-to-eat-at-pnc-park/


 71%|███████   | 110/156 [03:10<00:54,  1.18s/it]

Fetching: https://www.visitpittsburgh.com/blog/ppg-paints-arena-penguins-guide/


 71%|███████   | 111/156 [03:11<00:51,  1.14s/it]

Fetching: https://www.steelers.com/


 72%|███████▏  | 112/156 [03:12<00:44,  1.01s/it]

Fetching: https://www.steelers.com/schedule/


 72%|███████▏  | 113/156 [03:12<00:38,  1.12it/s]

Fetching: https://en.wikipedia.org/wiki/Myron_Cope


 73%|███████▎  | 114/156 [03:14<00:40,  1.04it/s]

Fetching: https://www.dkpittsburghsports.com/


 74%|███████▎  | 115/156 [03:14<00:37,  1.10it/s]

Fetching: https://en.wikipedia.org/wiki/Sports_in_Pittsburgh


 74%|███████▍  | 116/156 [03:15<00:34,  1.16it/s]

Fetching: https://www.nytimes.com/athletic/location/pittsburgh/


 75%|███████▌  | 117/156 [03:17<00:41,  1.07s/it]

Fetching: https://athletics.cmu.edu/landing/index


 76%|███████▌  | 118/156 [03:18<00:48,  1.29s/it]

Fetching: https://athletics.cmu.edu/sports/mbkb/index


 76%|███████▋  | 119/156 [03:20<00:47,  1.29s/it]

Fetching: https://athletics.cmu.edu/sports/mbkb/2024-25/schedule


 77%|███████▋  | 120/156 [03:21<00:44,  1.23s/it]

Fetching: https://x.com/tartanathletics?lang=en
Attempt 1 failed: 400 Client Error: Bad Request for url: https://x.com/tartanathletics?lang=en
Attempt 2 failed: 400 Client Error: Bad Request for url: https://x.com/tartanathletics?lang=en
Attempt 3 failed: 400 Client Error: Bad Request for url: https://x.com/tartanathletics?lang=en


 78%|███████▊  | 121/156 [03:24<01:03,  1.83s/it]

Failed to parse URL at index: 120, URL: https://x.com/tartanathletics?lang=en
Fetching: https://trustarts.org/


 78%|███████▊  | 122/156 [03:25<00:55,  1.63s/it]

Fetching: https://trustarts.org/pct_home/blog


 79%|███████▉  | 123/156 [03:28<01:00,  1.84s/it]

Fetching: https://trustarts.org/pct_home/about


 79%|███████▉  | 124/156 [03:29<00:51,  1.60s/it]

Fetching: https://trustarts.org/pct_home/about/history


 80%|████████  | 125/156 [03:30<00:45,  1.47s/it]

Fetching: https://trustarts.org/pct_home/about/cultural-district


 81%|████████  | 126/156 [03:30<00:36,  1.23s/it]

Fetching: https://trustarts.org/pct_home/visual-arts


 81%|████████▏ | 127/156 [03:32<00:37,  1.28s/it]

Fetching: https://en.wikipedia.org/wiki/Pittsburgh_Cultural_Trust


 82%|████████▏ | 128/156 [03:32<00:29,  1.05s/it]

Fetching: https://carnegiemuseums.org/


 83%|████████▎ | 129/156 [03:33<00:27,  1.01s/it]

Fetching: https://carnegiemuseums.org/events/?start_date=2024-10-27&end_date=2025-11-29


 83%|████████▎ | 130/156 [03:34<00:26,  1.01s/it]

Fetching: https://en.wikipedia.org/wiki/Carnegie_Museums_of_Pittsburgh


 84%|████████▍ | 131/156 [03:35<00:23,  1.07it/s]

Fetching: https://en.wikipedia.org/wiki/Carnegie_Museum_of_Art


 85%|████████▍ | 132/156 [03:36<00:19,  1.21it/s]

Fetching: https://carnegiemuseums.org/about-us/our-history/


 85%|████████▌ | 133/156 [03:36<00:17,  1.30it/s]

Fetching: https://carnegiemnh.org/


 86%|████████▌ | 134/156 [03:38<00:21,  1.04it/s]

Fetching: https://carnegieart.org/


 87%|████████▋ | 135/156 [03:39<00:21,  1.04s/it]

Fetching: https://en.wikipedia.org/wiki/Carnegie_Museum_of_Natural_History


 87%|████████▋ | 136/156 [03:40<00:19,  1.00it/s]

Fetching: https://en.wikipedia.org/wiki/Heinz_History_Center


 88%|████████▊ | 137/156 [03:41<00:17,  1.06it/s]

Fetching: https://www.heinzhistorycenter.org/


 88%|████████▊ | 138/156 [03:41<00:15,  1.18it/s]

Fetching: https://www.thefrickpittsburgh.org/


 89%|████████▉ | 139/156 [03:42<00:16,  1.01it/s]

Fetching: https://en.wikipedia.org/wiki/The_Frick_Pittsburgh#:~:text=The%20Frick%20Pittsburgh%20is%20a,residence%20known%20as%20%22Clayton%22.


 90%|████████▉ | 140/156 [03:44<00:16,  1.04s/it]

Fetching: https://www.visitpittsburgh.com/events-festivals/food-festivals/


 90%|█████████ | 141/156 [03:45<00:18,  1.23s/it]

Fetching: https://www.picklesburgh.com/


 91%|█████████ | 142/156 [03:46<00:14,  1.01s/it]

Fetching: https://www.picklesburgh.com/vendors/


 92%|█████████▏| 143/156 [03:47<00:14,  1.09s/it]

Fetching: https://www.pghtacofest.com/about


 92%|█████████▏| 144/156 [03:48<00:11,  1.04it/s]

Fetching: https://pittsburghrestaurantweek.com/


 93%|█████████▎| 145/156 [03:48<00:09,  1.14it/s]

Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2024-restaurants/


 94%|█████████▎| 146/156 [03:50<00:10,  1.03s/it]

Fetching: https://littleitalydays.com/


 94%|█████████▍| 147/156 [03:51<00:09,  1.01s/it]

Fetching: https://littleitalydays.com/about-us/


 95%|█████████▍| 148/156 [03:51<00:06,  1.17it/s]

Fetching: https://bananasplitfest.com/


 96%|█████████▌| 149/156 [03:52<00:05,  1.31it/s]

Fetching: https://bananasplitfest.com/events/


 96%|█████████▌| 150/156 [03:53<00:05,  1.09it/s]

Fetching: https://en.wikipedia.org/wiki/Picklesburgh


 97%|█████████▋| 151/156 [03:53<00:03,  1.34it/s]

Fetching: https://pittsburghpa.gov/mayor/city-staff
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/mayor/city-staff
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/mayor/city-staff
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/mayor/city-staff


 97%|█████████▋| 152/156 [03:58<00:07,  1.78s/it]

Failed to parse URL at index: 151, URL: https://pittsburghpa.gov/mayor/city-staff
Fetching: https://pittsburghpa.gov/finance/tax-descriptions
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/finance/tax-descriptions
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/finance/tax-descriptions
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.pittsburghpa.gov/finance/tax-descriptions


 98%|█████████▊| 153/156 [04:02<00:07,  2.42s/it]

Failed to parse URL at index: 152, URL: https://pittsburghpa.gov/finance/tax-descriptions
Fetching: https://www.visitpittsburgh.com/events-festivals/


 99%|█████████▊| 154/156 [04:03<00:04,  2.14s/it]

Fetching: https://www.mlb.com/pirates/team/front-office


 99%|█████████▉| 155/156 [04:04<00:01,  1.91s/it]

Fetching: https://www.steelers.com/team/front-office-roster/


100%|██████████| 156/156 [04:05<00:00,  1.58s/it]


Duplicates removed. Cleaned data saved to ./data/scraped/parentlink_file_name_url_mapping_filtered.csv


  1%|          | 145/14621 [02:28<7:19:59,  1.82s/it]

Attempt 1 failed: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=5)


  9%|▊         | 1244/14621 [22:48<3:26:45,  1.08it/s]

Attempt 1 failed: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Pennsylvania_American_Water
Attempt 2 failed: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Pennsylvania_American_Water
Attempt 3 failed: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Pennsylvania_American_Water


  9%|▊         | 1245/14621 [22:53<8:02:52,  2.17s/it]

Failed to parse sublink at index 1244, URL: https://en.wikipedia.org/w/index.php?title=Pennsylvania_American_Water&action=edit&redlink=1


 10%|█         | 1464/14621 [26:05<3:45:15,  1.03s/it]

Attempt 1 failed: 403 Client Error: Unknown Error for url: https://www.usatoday.com/money/economy/2009-09-21-us-steel-pittsburgh_N.htm
Attempt 2 failed: 403 Client Error: Unknown Error for url: https://www.usatoday.com/money/economy/2009-09-21-us-steel-pittsburgh_N.htm
Attempt 3 failed: 403 Client Error: Unknown Error for url: https://www.usatoday.com/money/economy/2009-09-21-us-steel-pittsburgh_N.htm


 10%|█         | 1465/14621 [26:08<6:02:30,  1.65s/it]

Failed to parse sublink at index 1464, URL: https://www.usatoday.com/money/economy/2009-09-21-us-steel-pittsburgh_N.htm
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.wtae.com/Just-How-Many-Bridges-Are-There-In-Pittsburgh/-/9681798/7685514/-/jaknsc/-/index.html
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.wtae.com/Just-How-Many-Bridges-Are-There-In-Pittsburgh/-/9681798/7685514/-/jaknsc/-/index.html
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.wtae.com/Just-How-Many-Bridges-Are-There-In-Pittsburgh/-/9681798/7685514/-/jaknsc/-/index.html


 10%|█         | 1466/14621 [26:12<7:58:27,  2.18s/it]

Failed to parse sublink at index 1465, URL: http://www.wtae.com/Just-How-Many-Bridges-Are-There-In-Pittsburgh/-/9681798/7685514/-/jaknsc/-/index.html
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.chicagotribune.com/ct-xpm-1987-10-18-8703180822-story.html
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.chicagotribune.com/ct-xpm-1987-10-18-8703180822-story.html
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.chicagotribune.com/ct-xpm-1987-10-18-8703180822-story.html


 10%|█         | 1467/14621 [26:15<9:34:52,  2.62s/it]

Failed to parse sublink at index 1466, URL: http://articles.chicagotribune.com/1987-10-18/travel/8703180822_1_steel-truss-bridge-twin-bridges-arches


 10%|█         | 1469/14621 [26:17<5:51:43,  1.60s/it]

Attempt 1 failed: HTTPConnectionPool(host='www.bivouacbooks.com', port=80): Max retries exceeded with url: /bbv4i4s4.htm (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7c1348763750>: Failed to resolve 'www.bivouacbooks.com' ([Errno -2] Name or service not known)"))
Attempt 2 failed: HTTPConnectionPool(host='www.bivouacbooks.com', port=80): Max retries exceeded with url: /bbv4i4s4.htm (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7c1348de74d0>: Failed to resolve 'www.bivouacbooks.com' ([Errno -2] Name or service not known)"))
Attempt 3 failed: HTTPConnectionPool(host='www.bivouacbooks.com', port=80): Max retries exceeded with url: /bbv4i4s4.htm (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7c134332bb10>: Failed to resolve 'www.bivouacbooks.com' ([Errno -2] Name or service not known)"))


 10%|█         | 1470/14621 [26:26<14:07:38,  3.87s/it]

Failed to parse sublink at index 1469, URL: http://www.bivouacbooks.com/bbv4i4s4.htm
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/local/community-eyewitness/eyewitness-1949-tv-makes-pittsburgh-a-new-promise-247120
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/local/community-eyewitness/eyewitness-1949-tv-makes-pittsburgh-a-new-promise-247120
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/local/community-eyewitness/eyewitness-1949-tv-makes-pittsburgh-a-new-promise-247120


 10%|█         | 1472/14621 [26:31<10:42:35,  2.93s/it]

Failed to parse sublink at index 1470, URL: http://www.post-gazette.com/stories/local/community-eyewitness/eyewitness-1949-tv-makes-pittsburgh-a-new-promise-247120/


 10%|█         | 1474/14621 [26:31<5:43:44,  1.57s/it]

Attempt 1 failed: 404 Client Error: Not Found for url: https://www.cmu.edu/epp/graduate/faq_contacts_pittsburgh.html
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.cmu.edu/epp/graduate/faq_contacts_pittsburgh.html
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.cmu.edu/epp/graduate/faq_contacts_pittsburgh.html


 10%|█         | 1476/14621 [26:36<6:36:20,  1.81s/it]

Failed to parse sublink at index 1474, URL: http://www.epp.cmu.edu/graduate/faq_contacts_pittsburgh.html


 10%|█         | 1482/14621 [26:40<3:23:33,  1.08it/s]

Attempt 1 failed: 404 Client Error: Not Found for url: https://www.uschamber.com/blog/innovate-or-die-pittsburgh-chose-innovate
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.uschamber.com/blog/innovate-or-die-pittsburgh-chose-innovate
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.uschamber.com/blog/innovate-or-die-pittsburgh-chose-innovate


 10%|█         | 1483/14621 [26:44<6:36:42,  1.81s/it]

Failed to parse sublink at index 1482, URL: https://www.uschamber.com/blog/innovate-or-die-pittsburgh-chose-innovate


 10%|█         | 1485/14621 [26:44<3:54:54,  1.07s/it]

Attempt 1 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/business/news/in-desperate-1983-there-was-nowhere-for-pittsburghs-economy-to-go-but-up-667537
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/business/news/in-desperate-1983-there-was-nowhere-for-pittsburghs-economy-to-go-but-up-667537
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/business/news/in-desperate-1983-there-was-nowhere-for-pittsburghs-economy-to-go-but-up-667537


 10%|█         | 1486/14621 [26:49<7:23:13,  2.02s/it]

Failed to parse sublink at index 1485, URL: http://www.post-gazette.com/stories/business/news/in-desperate-1983-there-was-nowhere-for-pittsburghs-economy-to-go-but-up-667537/


 10%|█         | 1489/14621 [26:49<3:12:10,  1.14it/s]

Attempt 1 failed: 403 Client Error: Forbidden for url: https://www.jstor.org/stable/j.ctt9qh7tx
Attempt 2 failed: 403 Client Error: Forbidden for url: https://www.jstor.org/stable/j.ctt9qh7tx
Attempt 3 failed: 403 Client Error: Forbidden for url: https://www.jstor.org/stable/j.ctt9qh7tx


 10%|█         | 1490/14621 [26:52<5:22:00,  1.47s/it]

Failed to parse sublink at index 1489, URL: https://www.jstor.org/stable/j.ctt9qh7tx


 10%|█         | 1491/14621 [26:53<4:23:04,  1.20s/it]

Attempt 1 failed: 403 Client Error: Forbidden for url: https://www.jstor.org/stable/10.2307/j.ctt9qh7tx
Attempt 2 failed: 403 Client Error: Forbidden for url: https://www.jstor.org/stable/10.2307/j.ctt9qh7tx
Attempt 3 failed: 403 Client Error: Forbidden for url: https://www.jstor.org/stable/10.2307/j.ctt9qh7tx


 10%|█         | 1492/14621 [26:56<6:45:24,  1.85s/it]

Failed to parse sublink at index 1491, URL: https://doi.org/10.2307%2Fj.ctt9qh7tx


 10%|█         | 1499/14621 [27:06<4:20:18,  1.19s/it]

Failed to parse sublink at index 1499, URL: http://www.bizjournals.com/pittsburgh/blog/morning-edition/2014/12/eaton-wins-contract-for-solar-installations.html
Attempt 1 failed: HTTPConnectionPool(host='174.143.38.57', port=80): Max retries exceeded with url: /wp-content/uploads/2010/06/S013_ROBOT-RxSellSheet.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7c1343a86cd0>, 'Connection to 174.143.38.57 timed out. (connect timeout=5)'))
Attempt 2 failed: HTTPConnectionPool(host='174.143.38.57', port=80): Max retries exceeded with url: /wp-content/uploads/2010/06/S013_ROBOT-RxSellSheet.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7c1343f718d0>, 'Connection to 174.143.38.57 timed out. (connect timeout=5)'))
Attempt 3 failed: HTTPConnectionPool(host='174.143.38.57', port=80): Max retries exceeded with url: /wp-content/uploads/2010/06/S013_ROBOT-RxSellSheet.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnecti

 10%|█         | 1501/14621 [27:24<17:30:02,  4.80s/it]

Failed to parse sublink at index 1500, URL: http://174.143.38.57/wp-content/uploads/2010/06/S013_ROBOT-RxSellSheet.pdf


 10%|█         | 1505/14621 [27:27<7:00:45,  1.92s/it] 

Attempt 1 failed: 404 Client Error: Not Found for url: https://www.ncfta.net/contact-ncfta.aspx
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.ncfta.net/contact-ncfta.aspx
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.ncfta.net/contact-ncfta.aspx


 10%|█         | 1506/14621 [27:30<8:16:47,  2.27s/it]

Failed to parse sublink at index 1505, URL: http://www.ncfta.net/contact-ncfta.aspx


 10%|█         | 1507/14621 [27:31<7:30:03,  2.06s/it]

Attempt 1 failed: 404 Client Error: Not Found for url: https://www.rec.ri.cmu.edu/about/history/
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.rec.ri.cmu.edu/about/history/
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.rec.ri.cmu.edu/about/history/


 10%|█         | 1508/14621 [27:36<9:43:11,  2.67s/it]

Failed to parse sublink at index 1507, URL: http://www.rec.ri.cmu.edu/about/history/


 10%|█         | 1509/14621 [27:36<7:33:44,  2.08s/it]

Attempt 1 failed: 404 Client Error: Not Found for url: http://www.pittsburghlive.com/x/pittsburghtrib/business/s_580675.html
Attempt 2 failed: 404 Client Error: Not Found for url: http://www.pittsburghlive.com/x/pittsburghtrib/business/s_580675.html
Attempt 3 failed: 404 Client Error: Not Found for url: http://www.pittsburghlive.com/x/pittsburghtrib/business/s_580675.html


 10%|█         | 1510/14621 [27:40<8:46:51,  2.41s/it]

Failed to parse sublink at index 1509, URL: http://www.pittsburghlive.com/x/pittsburghtrib/business/s_580675.html
Attempt 1 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/local/region/pittsburgh-region-sees-11th-consecutive-month-of-home-sales-increases-655305
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/local/region/pittsburgh-region-sees-11th-consecutive-month-of-home-sales-increases-655305
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.post-gazette.com/stories/local/region/pittsburgh-region-sees-11th-consecutive-month-of-home-sales-increases-655305


 10%|█         | 1511/14621 [27:44<10:30:40,  2.89s/it]

Failed to parse sublink at index 1510, URL: http://www.post-gazette.com/stories/local/region/pittsburgh-region-sees-11th-consecutive-month-of-home-sales-increases-655305/


 10%|█         | 1512/14621 [27:48<11:55:03,  3.27s/it]



 10%|█         | 1515/14621 [27:50<5:24:03,  1.48s/it]

Attempt 1 failed: 403 Client Error: Forbidden for url: https://www.nytimes.com/2009/01/08/business/economy/08collapse.html
Attempt 2 failed: 403 Client Error: Forbidden for url: https://www.nytimes.com/2009/01/08/business/economy/08collapse.html
Attempt 3 failed: 403 Client Error: Forbidden for url: https://www.nytimes.com/2009/01/08/business/economy/08collapse.html


 10%|█         | 1516/14621 [27:53<7:16:19,  2.00s/it]

Failed to parse sublink at index 1515, URL: https://www.nytimes.com/2009/01/08/business/economy/08collapse.html


 10%|█         | 1518/14621 [27:54<4:58:51,  1.37s/it]

Attempt 1 failed: 404 Client Error: Not Found for url: http://www.zillowstatic.com/vstatic/419b583f682a74b83f007039dd9c49f8/static/pages/visuals/neg-equity-map/v3/map.html?embed=1&loc=4/38.41056/-97.95410
Attempt 2 failed: 404 Client Error: Not Found for url: http://www.zillowstatic.com/vstatic/419b583f682a74b83f007039dd9c49f8/static/pages/visuals/neg-equity-map/v3/map.html?embed=1&loc=4/38.41056/-97.95410
Attempt 3 failed: 404 Client Error: Not Found for url: http://www.zillowstatic.com/vstatic/419b583f682a74b83f007039dd9c49f8/static/pages/visuals/neg-equity-map/v3/map.html?embed=1&loc=4/38.41056/-97.95410


 10%|█         | 1519/14621 [27:57<6:51:31,  1.88s/it]

Failed to parse sublink at index 1518, URL: http://www.zillowstatic.com/vstatic/419b583f682a74b83f007039dd9c49f8/static/pages/visuals/neg-equity-map/v3/map.html?embed=1&loc=4/38.41056/-97.95410


 10%|█         | 1522/14621 [28:01<5:00:59,  1.38s/it]

Attempt 1 failed: 404 Client Error: Not Found for url: https://www.carnegielibrary.org/exhibit/hname.html
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.carnegielibrary.org/exhibit/hname.html
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.carnegielibrary.org/exhibit/hname.html


 10%|█         | 1523/14621 [28:06<9:48:57,  2.70s/it]

Failed to parse sublink at index 1522, URL: http://www.carnegielibrary.org/exhibit/hname.html


 10%|█         | 1526/14621 [28:08<5:05:48,  1.40s/it]

Attempt 1 failed: 405 Client Error: Method Not Allowed for url: https://old.post-gazette.com/pg/03001/700027-209.stm
Attempt 2 failed: 405 Client Error: Method Not Allowed for url: https://old.post-gazette.com/pg/03001/700027-209.stm
Attempt 3 failed: 405 Client Error: Method Not Allowed for url: https://old.post-gazette.com/pg/03001/700027-209.stm


 10%|█         | 1527/14621 [28:12<7:51:37,  2.16s/it]

Failed to parse sublink at index 1526, URL: http://old.post-gazette.com/pg/03001/700027-209.stm


 10%|█         | 1528/14621 [28:13<6:12:39,  1.71s/it]

Attempt 1 failed: 404 Client Error: Not Found for url: https://www.carnegielibrary.org/exhibit/hname2.html
Attempt 2 failed: 404 Client Error: Not Found for url: https://www.carnegielibrary.org/exhibit/hname2.html
Attempt 3 failed: 404 Client Error: Not Found for url: https://www.carnegielibrary.org/exhibit/hname2.html


 10%|█         | 1529/14621 [28:19<10:38:53,  2.93s/it]

Failed to parse sublink at index 1528, URL: http://www.carnegielibrary.org/exhibit/hname2.html


 10%|█         | 1530/14621 [28:19<8:03:54,  2.22s/it] 

# Dynamic crawling for the events data

In [None]:
!pip install webdriver_manager
# Update package list and install necessary dependencies
!apt-get update
!apt-get install -y wget unzip libvulkan1

# Download and install Google Chrome
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get install -f -y

# Install xvfb for virtual framebuffer support
!apt-get install -y xvfb

# Install selenium and chromedriver-autoinstaller
!pip install selenium chromedriver-autoinstaller


Hit:1 http://deb.debian.org/debian buster InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://deb.debian.org/debian buster-updates InRelease
Get:4 https://dl.google.com/linux/chrome/deb stable InRelease [1,825 B]
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://deb.debian.org/debian-security buster/updates InRelease
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:13 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:14 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:15 https:/

In [None]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
import time
# Automatically install the correct version of ChromeDriver
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()

# Function to fetch the page content using Selenium
def fetch_page_text_selenium(url, driver):
  try:
      driver.get(url)

      # Wait for the page to load dynamically (up to 10 seconds)
      WebDriverWait(driver, 10).until(
          EC.presence_of_element_located((By.TAG_NAME, "body"))
      )

      # Get page source
      page_source = driver.page_source

      # Parse the page content using BeautifulSoup
      soup = BeautifulSoup(page_source, 'html.parser')

      # Extract and clean the text from the page
      page_text = soup.get_text(separator='\n', strip=True)
      return page_text

  except Exception as e:
      print(f"Error fetching {url}: {e}")
      return None

# Function to read URLs from CSV and crawl each one
def crawl_urls_from_csv(csv_file_path, url_column_name, driver):
  with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
      reader = csv.DictReader(csvfile)
      urls = [row[url_column_name] for row in reader]  # Extract all URLs

      # Use tqdm to show progress
      for index, url in enumerate(tqdm(urls, desc="Crawling URLs")):
          text = fetch_page_text_selenium(url, driver)
          if text:
              # Save the crawled text to a file with the index as the filename
              output_file = f"./data/scraped/events_test/{index}.txt"
              with open(output_file, 'w', encoding='utf-8') as f:
                  f.write(text)
              print(f"Saved content to {output_file}")

if __name__ == "__main__":
    csv_file_path = './data/raw/csv_data/events_after_10_27.csv'
    url_column_name = 'Source URL'

    # Initialize the Chrome WebDriver
    service = Service(executable_path=r'/usr/bin/chromedriver')
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.headless = True
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Start crawling the URLs from the CSV
        crawl_urls_from_csv(csv_file_path, url_column_name, driver)
    except Exception as e:
        print(f"An error occurred during crawling: {e}")
    finally:
        # Ensure the WebDriver is properly closed
        driver.quit()

Crawling URLs:   2%|▎         | 1/40 [00:02<01:30,  2.31s/it]

Saved content to ./data/scraped/events_test/0.txt


Crawling URLs:   5%|▌         | 2/40 [00:04<01:33,  2.47s/it]

Saved content to ./data/scraped/events_test/1.txt


Crawling URLs:   8%|▊         | 3/40 [00:06<01:24,  2.28s/it]

Saved content to ./data/scraped/events_test/2.txt


Crawling URLs:  10%|█         | 4/40 [00:08<01:08,  1.90s/it]

Saved content to ./data/scraped/events_test/3.txt


Crawling URLs:  12%|█▎        | 5/40 [00:11<01:17,  2.22s/it]

Saved content to ./data/scraped/events_test/4.txt


Crawling URLs:  15%|█▌        | 6/40 [00:14<01:25,  2.52s/it]

Saved content to ./data/scraped/events_test/5.txt


Crawling URLs:  18%|█▊        | 7/40 [00:14<01:00,  1.83s/it]

Saved content to ./data/scraped/events_test/6.txt


Crawling URLs:  20%|██        | 8/40 [00:16<00:58,  1.84s/it]

Saved content to ./data/scraped/events_test/7.txt


Crawling URLs:  22%|██▎       | 9/40 [00:18<00:58,  1.89s/it]

Saved content to ./data/scraped/events_test/8.txt


Crawling URLs:  25%|██▌       | 10/40 [00:21<01:06,  2.21s/it]

Saved content to ./data/scraped/events_test/9.txt


Crawling URLs:  28%|██▊       | 11/40 [00:23<01:00,  2.08s/it]

Saved content to ./data/scraped/events_test/10.txt


Crawling URLs:  30%|███       | 12/40 [00:24<00:53,  1.91s/it]

Saved content to ./data/scraped/events_test/11.txt


Crawling URLs:  32%|███▎      | 13/40 [00:26<00:52,  1.94s/it]

Saved content to ./data/scraped/events_test/12.txt


Crawling URLs:  35%|███▌      | 14/40 [00:28<00:47,  1.84s/it]

Saved content to ./data/scraped/events_test/13.txt


Crawling URLs:  38%|███▊      | 15/40 [00:30<00:45,  1.82s/it]

Saved content to ./data/scraped/events_test/14.txt


Crawling URLs:  40%|████      | 16/40 [00:31<00:42,  1.76s/it]

Saved content to ./data/scraped/events_test/15.txt


Crawling URLs:  42%|████▎     | 17/40 [00:33<00:39,  1.73s/it]

Saved content to ./data/scraped/events_test/16.txt


Crawling URLs:  45%|████▌     | 18/40 [00:34<00:33,  1.53s/it]

Saved content to ./data/scraped/events_test/17.txt


Crawling URLs:  48%|████▊     | 19/40 [00:35<00:29,  1.43s/it]

Saved content to ./data/scraped/events_test/18.txt


Crawling URLs:  50%|█████     | 20/40 [00:36<00:25,  1.27s/it]

Saved content to ./data/scraped/events_test/19.txt


Crawling URLs:  52%|█████▎    | 21/40 [00:40<00:38,  2.05s/it]

Saved content to ./data/scraped/events_test/20.txt


Crawling URLs:  55%|█████▌    | 22/40 [00:47<01:06,  3.72s/it]

Saved content to ./data/scraped/events_test/21.txt


Crawling URLs:  57%|█████▊    | 23/40 [00:55<01:21,  4.77s/it]

Saved content to ./data/scraped/events_test/22.txt


Crawling URLs:  60%|██████    | 24/40 [01:03<01:33,  5.83s/it]

Saved content to ./data/scraped/events_test/23.txt


Crawling URLs:  62%|██████▎   | 25/40 [01:11<01:39,  6.61s/it]

Saved content to ./data/scraped/events_test/24.txt


Crawling URLs:  95%|█████████▌| 38/40 [01:23<00:05,  2.88s/it]

Saved content to ./data/scraped/events_test/37.txt


Crawling URLs:  98%|█████████▊| 39/40 [01:26<00:03,  3.08s/it]

Saved content to ./data/scraped/events_test/38.txt


Crawling URLs: 100%|██████████| 40/40 [01:28<00:00,  2.22s/it]

Saved content to ./data/scraped/events_test/39.txt





# data sharding

In [None]:
import os
import re
from transformers import BartTokenizer
from tqdm import tqdm

# Load the tokenizer for the BART model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Clean the text
def clean_text(text):
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

# Read the content of a file
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Split text into chunks of 1024 tokens (or fewer)
def split_into_token_chunks(text, max_tokens=1024):
    try:
        # Tokenize the text
        tokens = tokenizer.encode(text, truncation=False, return_tensors="pt")[0]

        # Split tokens into chunks of max_tokens
        chunks = []
        for i in range(0, len(tokens), 1000):
            chunk_tokens = tokens[i:i + 1000]

            # Decode tokens back to text
            chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)

            # Verify token count for the chunk
            chunk_token_count = len(chunk_tokens)
            if chunk_token_count > max_tokens:
                raise ValueError(f"Chunk exceeds {max_tokens} tokens: {chunk_token_count}")

            if chunk_text.strip():  # Skip empty chunks
                chunks.append(chunk_text)

        return chunks
    except Exception as e:
        print(f"Error tokenizing or splitting text: {e}")
        return []

# Save the processed text to a file
def save_processed_text(text, output_dir, file_name, chunk_index=None):
    if chunk_index is not None:
        file_name = f"{os.path.splitext(file_name)[0]}_chunk_{chunk_index}.txt"
    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(text)

# Process each text file in the directory
def process_directory(input_dir, output_dir):
    # Ensure output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Iterate over all text files in the input directory
    for file_name in tqdm(os.listdir(input_dir)):
        if file_name.endswith(".txt"):  # Only process .txt files
            file_path = os.path.join(input_dir, file_name)
            file_base_name = os.path.splitext(file_name)[0]  # Remove file extension

            # Read the file content
            text = read_file(file_path)

            # Skip empty files
            if not text.strip():
                print(f"Skipping empty file: {file_name}")
                continue

            # Clean the text
            cleaned_text = clean_text(text)

            # Skip if cleaned text is empty
            if not cleaned_text.strip():
                print(f"Skipping file with no valid text: {file_name}")
                continue

            # Split the text into chunks of 1024 tokens (or fewer)
            chunks = split_into_token_chunks(cleaned_text, max_tokens=1024)

            # Skip if no valid chunks are produced
            if not chunks:
                print(f"Skipping file with no valid chunks: {file_name}")
                continue

            # Save each chunk as a separate file
            if len(chunks) == 1:
                # If there's only one chunk, save it as the original file name
                save_processed_text(chunks[0], output_dir, file_name)
            else:
                # If there are multiple chunks, save each chunk with a suffix
                for index, chunk in enumerate(chunks):
                    save_processed_text(chunk, output_dir, file_name, chunk_index=index)

    print("All files processed.")

# Set your input and output directories
input_directory = './data/scraped/scraped_web_text_data'  # The directory containing your text files
output_directory = './data/scraped/cleaned_and_truncated_text_data'  # Directory to save processed files

# Create the directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Process the directory
process_directory(input_directory, output_directory)

  2%|▏         | 3/159 [00:00<00:28,  5.49it/s]

Skipping empty file: 6.txt
Skipping empty file: 2.txt
Skipping empty file: 4.txt
Skipping empty file: 16.txt
Skipping empty file: 15.txt
Skipping empty file: 85.txt
Skipping empty file: 86.txt
Skipping empty file: 87.txt
Skipping empty file: 88.txt
Skipping empty file: 89.txt
Skipping empty file: 91.txt
Skipping empty file: 90.txt
Skipping empty file: 92.txt
Skipping empty file: 94.txt
Skipping empty file: 93.txt
Skipping empty file: 95.txt
Skipping empty file: 96.txt
Skipping empty file: 97.txt
Skipping empty file: 98.txt
Skipping empty file: 120.txt
Skipping empty file: 151.txt
Skipping empty file: 152.txt


100%|██████████| 159/159 [00:16<00:00,  9.81it/s]

All files processed.





In [None]:
import os
from transformers import pipeline, BartTokenizer
from tqdm import tqdm

# Load the pre-trained summarization model and tokenizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0)  # Use GPU (device=0) or CPU (device=-1)

# Read the content of a file
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Summarize a text chunk
def summarize_text(text, max_length=1024):
    try:
        # Skip empty or invalid text
        if not text.strip():
            raise ValueError("Empty text provided")

        # Generate the summary
        summary = summarizer(
            text,
            max_length=max_length,
            min_length=61,
            do_sample=False,
            truncation=True  # Ensure truncation is applied (though text is already truncated)
        )
        return summary[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return None

# Process each shard file in the directory and create summaries
def process_directory(input_dir, output_dir, summary_output_dir, max_summary_length=1024):
    # Ensure the output directories exist
    if not os.path.exists(summary_output_dir):
        os.makedirs(summary_output_dir)

    # Iterate over all text files (shards) in the input directory
    for file_name in tqdm(os.listdir(input_dir)):
        if file_name.endswith(".txt"):  # Only process .txt files
            file_path = os.path.join(input_dir, file_name)
            text = read_file(file_path)

            # Skip empty files
            if not text.strip():
                print(f"Skipping empty file: {file_name}")
                continue

            # Generate the summary
            summary = summarize_text(text, max_length=max_summary_length)
            if not summary:
                print(f"Failed to summarize {file_name}")
                continue

            # Save the summary
            summary_file_path = os.path.join(summary_output_dir, f"{file_name}_summary.txt")
            with open(summary_file_path, 'w', encoding='utf-8') as summary_file:
                summary_file.write(summary)

# Example usage
input_directory = './data/scraped/cleaned_and_truncated_text_data'
summary_output_directory = './data/scraped/output_summary'

process_directory(
    input_directory,
    summary_output_directory,
    summary_output_directory,
    max_summary_length=1024  # Ensure this does not exceed the model's limit
)

Device set to use cuda:0
  0%|          | 0/473 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 1024, but your input_length is only 1000. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=500)
  0%|          | 1/473 [00:03<31:05,  3.95s/it]Your max_length is set to 1024, but your input_length is only 1003. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=501)
  0%|          | 2/473 [00:04<17:19,  2.21s/it]Your max_length is set to 1024, but your input_length is only 1003. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing ma