In [8]:
import html
import os
from collections import Counter
from pathlib import Path
from urllib.parse import urlparse, urljoin

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Download necessary NLTK data
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from "punkt_tab" instead of
# "punkt"; fetching both keeps word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Base URL for OpenWeatherMap
url = "https://openweathermap.org/api"
# Credentials must not live in the notebook: read the key from the environment.
# An empty string is used when the variable is unset so this cell still runs.
# NOTE(review): the key that used to be hard-coded here was shared with the
# notebook and should be revoked/rotated.
api_key = os.environ.get("OPENWEATHER_API_KEY", "")

# Stop words for keyword filtering
STOP_WORDS = set(stopwords.words("english"))

# Function to scrape the webpage
def scrape_webpage(url, timeout=10):
    """Download ``url`` and summarise its metadata, links and text content.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the notebook indefinitely.

    Returns:
        dict: For an HTTP 200 response, a report with the keys "Title",
        "Meta Description", "Headings", "API Categories",
        "Internal Links Count", "External Links Count", "Internal Links",
        "External Links", "Total Word Count", "Top Keywords", "Top Themes",
        "Section Word Counts", "Average Word Count per Section",
        "Section Sentiments" and "Content Flow".
        For any other status code, ``{"Error": "<message>"}``.

    Raises:
        requests.RequestException: Connection failures and timeouts are not
            caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return {"Error": f"Failed to fetch webpage. Status code: {response.status_code}"}

    soup = BeautifulSoup(response.text, "html.parser")

    # 1. Data Extraction
    # get_text() is used instead of .string because .string is None for an
    # empty <title> or one that contains more than one child node.
    title = (soup.title.get_text(strip=True) if soup.title else "") or "No title found"

    meta_desc = (
        soup.find("meta", attrs={"name": "description"})
        or soup.find("meta", attrs={"property": "og:description"})
    )
    # .get() avoids a KeyError when the tag exists but has no content attribute.
    meta_description = (meta_desc.get("content") if meta_desc else None) or "No description found"

    headings = {
        f"h{i}": [h.get_text(strip=True) for h in soup.find_all(f"h{i}")]
        for i in range(1, 7)
    }
    # Every <li> on the page is collected, not only those in an API menu.
    api_categories = [li.get_text(strip=True) for li in soup.find_all("li")]

    # Extract and count internal and external links
    domain = urlparse(url).netloc.lower()
    internal_links = set()
    external_links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"].strip()
        try:
            parsed_href = urlparse(href)
            # mailto:, tel:, javascript: etc. are not page links; without this
            # check their empty netloc would classify them as internal.
            if parsed_href.scheme and parsed_href.scheme not in ("http", "https"):
                continue
            if not parsed_href.netloc or parsed_href.netloc.lower() == domain:
                internal_links.add(urljoin(url, href))
            else:
                external_links.add(href)
        except ValueError:
            # A malformed href (e.g. an unbalanced IPv6 bracket) should not
            # abort the whole scrape.
            continue

    # 2. Content Analysis
    main_content = soup.get_text()
    word_tokens = word_tokenize(main_content)
    filtered_tokens = [
        word.lower()
        for word in word_tokens
        if word.isalpha() and word.lower() not in STOP_WORDS
    ]
    keyword_counts = Counter(filtered_tokens)
    # Counts alphabetic, non-stop-word tokens only.
    total_word_count = len(filtered_tokens)
    top_keywords = keyword_counts.most_common(10)

    # Top 3 most discussed topics or themes (based on keywords)
    top_themes = top_keywords[:3]

    # Word counts are computed over the heading texts themselves, grouped by
    # heading level; the average is words per heading at that level.
    section_word_counts = {}
    average_word_count = {}
    for heading, text_list in headings.items():
        words_at_level = sum(len(h.split()) for h in text_list)
        section_word_counts[heading] = words_at_level
        average_word_count[heading] = words_at_level / len(text_list) if text_list else 0

    # 3. Section Sentiment Analysis
    section_sentiments = {
        heading: TextBlob(" ".join(text_list)).sentiment.polarity
        for heading, text_list in headings.items()
    }

    # 4. Content Flow Analysis
    # Heuristic only: the page counts as "structured" when it has at least one <h1>.
    if headings["h1"]:
        content_flow = "The sections appear logically structured and flow well from introduction to conclusion."
    else:
        content_flow = "Content appears scattered or lacks a clear structure."

    # Return all extracted data (link lists are sorted so reruns give stable output)
    return {
        "Title": title,
        "Meta Description": meta_description,
        "Headings": headings,
        "API Categories": api_categories,
        "Internal Links Count": len(internal_links),
        "External Links Count": len(external_links),
        "Internal Links": sorted(internal_links),
        "External Links": sorted(external_links),
        "Total Word Count": total_word_count,
        "Top Keywords": top_keywords,
        "Top Themes": top_themes,
        "Section Word Counts": section_word_counts,
        "Average Word Count per Section": average_word_count,
        "Section Sentiments": section_sentiments,
        "Content Flow": content_flow,
    }

# Run the scraper against the configured URL
result = scrape_webpage(url)

# Show every field of the report, one labelled block per key
for key in result:
    value = result[key]
    print(f"{key}:\n{value}\n")
  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Title:
 Weather API - OpenWeatherMap

Meta Description:
Explore OpenWeather's vast range of weather APIs including the versatile One Call API 3.0. Ideal for both
beginners and professionals, our APIs offer current weather, minute-by-minute forecasts, historical data archives, and
future predictions. Access weather data starting from 01-01-1979, global weather maps, solar irradiance predictions, air
pollution data, and more. Our APIs support various formats like JSON, XML, and are ideal for middle-sized projects to
enterprise-level solutions.

Headings:
{'h1': ['Weather API'], 'h2': ['Ulla', 'One Call API 3.0', 'Professional collections', 'Current & Forecast weather data collection', 'Solar Irradiance & Energy Prediction service', 'Historical weather data collection', 'Maps collection', "Other weather API's collection", 'API documentation'], 'h3': ['OpenWeather AI assistant', 'Pay as you call', '1,000API calls per day for\n                                freeper API call over the daily 

In [9]:
import os
import webbrowser

# Function to save results to an HTML file and open it in a browser
def save_to_webpage(data, filename="output.html", open_in_browser=True):
    """Write ``data`` to ``filename`` as a simple HTML report and optionally open it.

    Every key and value is HTML-escaped before being embedded, because the
    report is normally built from scraped (untrusted) page content.

    Args:
        data: Mapping of section title to value. Dict values are rendered as
            one "key: value" line per entry, list values as one line per item
            (2-tuples as "first: second"), anything else via ``str()``.
        filename: Path of the HTML file to create or overwrite.
        open_in_browser: When True (the default), open the written file in the
            default web browser; pass False for headless runs.
    """
    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Web Scraping Results</title>
        <style>
            body { font-family: Arial, sans-serif; line-height: 1.6; margin: 20px; }
            h1 { color: #333; }
            h2 { color: #555; margin-top: 20px; }
            pre { background: #f4f4f4; padding: 10px; border-radius: 5px; overflow: auto; }
        </style>
    </head>
    <body>
        <h1>Web Scraping Results</h1>
    """

    sections = []
    for key, value in data.items():
        if isinstance(value, dict):
            lines = [f"{k}: {v}" for k, v in value.items()]
        elif isinstance(value, list):
            lines = [
                # Only genuine pairs get the "a: b" form; other tuples are
                # shown whole instead of being truncated or raising IndexError.
                f"{item[0]}: {item[1]}" if isinstance(item, tuple) and len(item) == 2 else str(item)
                for item in value
            ]
        else:
            lines = [str(value)]
        # Escape after formatting so no scraped markup reaches the page verbatim.
        body = html.escape("\n".join(lines))
        sections.append(f"<h2>{html.escape(str(key))}</h2><pre>{body}</pre>")
    html_content += "".join(sections)

    html_content += """
    </body>
    </html>
    """

    # Save the HTML file
    with open(filename, "w", encoding="utf-8") as file:
        file.write(html_content)

    if open_in_browser:
        # as_uri() yields a well-formed file:// URL on every platform
        # (naive "file://" + path concatenation is malformed on Windows).
        webbrowser.open(Path(filename).resolve().as_uri())
        print(f"Results saved and opened in browser: {filename}")
    else:
        print(f"Results saved: {filename}")

# Fetch a fresh report for the configured URL
result = scrape_webpage(url)

# Render the report to an HTML file and show it in the browser
save_to_webpage(data=result, filename="web_scraping_results.html")


Results saved and opened in browser: web_scraping_results.html
