In [1]:
import trafilatura
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pickle
# Read the whitespace-separated URLs from the file.
# The `with` block closes the handle deterministically instead of leaving
# it to the garbage collector.
with open('sources/urls_web.txt') as url_file:
    urls = url_file.read().split()

# Function to process a single URL
def process_url(url, min_words=3):
    """Download one page and return its extracted text as a list of paragraphs.

    The page is fetched with ``trafilatura.fetch_url`` and its content is
    extracted as a JSON string (comments excluded). The ``"text"`` field of
    that JSON is split on single newlines, and every line with fewer than
    ``min_words`` whitespace-separated words is dropped.

    Args:
        url: Address of the page to download.
        min_words: Minimum number of whitespace-separated words a line must
            contain to be kept. Defaults to 3.

    Returns:
        A dict ``{"url": url, "Paragraphs": [...]}`` where ``"Paragraphs"`` is:
          * the kept lines on success,
          * an empty list when nothing was fetched or extracted,
          * a single ``"Error: <message>"`` string when an exception was raised.
    """
    try:
        content = trafilatura.fetch_url(url)
        if content:
            extracted = trafilatura.extract(content, output_format='json', include_comments=False)
            if extracted:
                extracted_json = json.loads(extracted)  # Parse the JSON string

                # `or ""` also covers a JSON null "text"; the default passed to
                # .get() only applies when the key is missing altogether.
                text = extracted_json.get("text") or ""

                # Split the text into paragraphs, one per newline-separated line
                paragraphs = text.split('\n')

                # Drop lines that are too short to be a real paragraph
                filtered_paragraphs = [p for p in paragraphs if len(p.split()) >= min_words]

                return {"url": url, "Paragraphs": filtered_paragraphs}
    except Exception as e:
        # Deliberate best-effort: a failure on one URL is recorded in the
        # result instead of being raised to the caller.
        return {"url": url, "Paragraphs": [f"Error: {str(e)}"]}

    return {"url": url, "Paragraphs": []}  # Default if nothing was fetched or extracted

# Run process_url for every URL on a thread pool and pickle the results.
# The output path is kept in one variable so the file that is written and
# the confirmation message cannot drift apart.
output_path = 'blog_posts.pickle'

results = []
with ThreadPoolExecutor() as executor:
    # Submit all URLs to the executor
    futures = {executor.submit(process_url, url): url for url in urls}

    # as_completed yields futures as they finish, so `results` ends up in
    # completion order rather than in the order of `urls`.
    # tqdm displays progress over the completed futures.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing URLs"):
        results.append(future.result())

# Save the results to a pickle file
with open(output_path, 'wb') as pickle_file:
    pickle.dump(results, pickle_file)

print(f"Data has been saved to {output_path}.")


Processing URLs: 100%|██████████| 151806/151806 [7:23:20<00:00,  5.71it/s]   


Data has been saved to blog_posts.json.


In [2]:
import pickle
# Reload the pickled results written by the scraping cell.
with open('blog_posts.pickle', mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

# Report how many entries were loaded.
print(len(data))

151806


In [15]:
# Number of paragraphs stored for the second entry in `data`
print(len(data[1]["Paragraphs"]))

3


In [16]:
print(data[1]["Paragraphs"][:3])  # Print up to the first 3 paragraphs of the second entry in `data`

['R.I.P. root9B? We Hardly Knew Ya!', 'root9B, a company that many in the security industry considered little more than a big-name startup aimed at cashing in on the stock market’s insatiable appetite for cybersecurity firms, surprised no one this week when it announced it was ceasing operations at the end of the year.', 'Founded in 2011, Colorado Springs, Colo. based root9B Technologies touted itself as an IT security training firm staffed by an impressive list of ex-military leaders with many years of cybersecurity experience at the Department of Defense and National Security Agency (NSA). As it began to attract more attention from investors, root9B’s focus shifted to helping organizations hunt for cyber intruders within their networks.']
