In [1]:
import requests
from bs4 import BeautifulSoup
import dask.bag as db
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import time

# --- Step 1: Safe Setup ---
# We use a try-except block to check if stopwords exist first.
# This prevents the PermissionError by avoiding unnecessary downloads.
try:
    nltk.data.find('corpora/stopwords')
    print("Stopwords are already downloaded.")
except LookupError:
    print("Downloading stopwords...")
    nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# ---------------------------------------------------------
# Step 2: Define Functions
# ---------------------------------------------------------

def fetch_page(url):
    """Fetches HTML content from a URL using headers to avoid being blocked."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        print(f"Fetching: {url}...") 
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to fetch {url}: Status Code {response.status_code}")
            return ""
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return ""

def parse_words(html):
    """Extracts text from HTML, cleans it, and returns a list of words."""
    if not html:
        return []
    
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    
    # Keep only alphabetic characters and convert to lowercase
    words = [word.lower() for word in text.split() if word.isalpha()]
    
    # Remove stopwords and very short words (length < 3)
    clean_words = [w for w in words if w not in stop_words and len(w) > 2]
    return clean_words

# ---------------------------------------------------------
# Step 3: Main Execution Block
# ---------------------------------------------------------

if __name__ == '__main__': 
    
    # List of URLs to scrape
    urls = [
       'https://en.wikipedia.org/wiki/Artificial_intelligence', # AI Topic
       'https://www.python.org/doc/essays/blurb/',              # Python Info
       'https://www.w3.org/standards/webdesign/htmlcss'         # Web Standards
    ]

    print("Starting Parallel Crawling with Dask...")
    start_time = time.time()

    # 1. Create a Dask Bag
    bag = db.from_sequence(urls)

    # 2. Fetch pages
    html_pages = bag.map(fetch_page)

    # 3. Parse words
    words_bag = html_pages.map(parse_words).flatten()

    # 4. Compute frequencies
    try:
        word_counts = words_bag.frequencies().compute()
    except Exception as e:
        print("An error occurred during Dask computation:", e)
        word_counts = []

    if not word_counts:
        print("No words found! Please check your internet connection or the URLs.")
    else:
        word_counts_dict = dict(word_counts)

        end_time = time.time()
        print(f"Done! Total time taken: {end_time - start_time:.2f} seconds")

        # ---------------------------------------------------------
        # Step 5: Data Analysis & Visualization
        # ---------------------------------------------------------

        df = pd.DataFrame(list(word_counts_dict.items()), columns=['Word', 'Count'])
        df = df.sort_values('Count', ascending=False).head(20)

        print("\nTop 10 Words Found:")
        print(df.head(10))
        
        # --- Plotting ---
        plt.figure(figsize=(14, 6))

        # Plot 1: Bar Chart
        plt.subplot(1, 2, 1)
        plt.bar(df['Word'], df['Count'], color='teal')
        plt.xlabel('Words')
        plt.ylabel('Frequency')
        plt.title('Top 20 Words Frequency')
        plt.xticks(rotation=45)

        # Plot 2: Word Cloud
        plt.subplot(1, 2, 2)
        wc = WordCloud(width=400, height=300, background_color='white').generate_from_frequencies(word_counts_dict)
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud Visualization')

        plt.tight_layout()
        plt.show()

BadZipFile: File is not a zip file