In [9]:
!pip install readability-lxml
# !pip install --upgrade transformers

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\onyxs\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import requests
from lxml import html
import pandas as pd
import time
import threading
from urllib.parse import urljoin, urlparse
from collections import deque
from urllib.robotparser import RobotFileParser
from transformers import pipeline

# Request headers sent with every page download; the User-Agent imitates a
# desktop Chrome browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Pause between consecutive page requests, in seconds
REQUEST_DELAY = 1

# Zero-shot classification pipeline (facebook/bart-large-mnli checkpoint).
# Built once at module level and reused by classify_topic() for every page.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Candidate labels handed to the zero-shot classifier in classify_topic().
# The winning label is stored verbatim in the 'topic' column of the results,
# so the spelling here is what ends up in the CSV.
TOPIC_CATEGORIES = [
    "Academic Programs & Courses",
    "Admissions & Application Process",
    "Scholarships & Financial Aid",
    "Research & Innovation at ASU",
    "Student Life & Campus Activities",
    "ASU's Global & Online Education",
    "ASU's Commitment to Sustainability",
    "International Student Support",
    "ASU's AI & Tech Initiatives",
    "Sun Devil Athletics & Sports",
    "ASU's History & Rankings",
    "ASU Library & Research Resources",
    "Career Services & Job Support",
    "Housing & Campus Life",
    "Health, Wellness & Counseling Services",
    "ASU Wellbeing and Security"  # was misspelled "Welbeing"
]

##################################################################
from readability import Document

def extract_main_content(html_content):
    """Return the plain text of a page's main content, as picked out by readability-lxml.

    Args:
        html_content (str): HTML source of the page.

    Returns:
        str: Tag-free text of the readability summary with surrounding
        whitespace removed, or "" when extraction fails for any reason
        (the error is printed, not raised).
    """
    try:
        # readability hands back an HTML fragment holding the main content
        article_html = Document(html_content).summary()
        # Re-parse the fragment so lxml can flatten it to text without tags
        article_node = html.fromstring(article_html)
        return article_node.text_content().strip()
    except Exception as e:
        print(f"Error extracting content: {e}")
    return ""
##################################################################
def classify_topic(text):
    """Classifies webpage topic using Zero-Shot Learning.

    Args:
        text (str): Page text to classify.

    Returns:
        str: The first (top-ranked) label the classifier returns out of
        TOPIC_CATEGORIES, or "Unknown" when the text is missing, not a
        string, blank, or the classifier raises.
    """
    # Guard before touching the text: the original called text.strip()
    # unconditionally, so None (or any non-string) raised here, outside the
    # try block, instead of yielding "Unknown".
    if not isinstance(text, str) or not text.strip():
        return "Unknown"

    try:
        result = classifier(text, TOPIC_CATEGORIES)
        return result["labels"][0]  # Top predicted category
    except Exception as e:
        # Best effort: a classification failure must not stop the caller
        print(f"Error in topic classification: {e}")
        return "Unknown"
##################################################################
def get_url_tree(start_url, max_depth=3, results=None):
    """
    Crawls a website breadth-first up to a given depth, staying on the start
    URL's host, checking discovered links with can_fetch() and pausing
    REQUEST_DELAY seconds after every request.

    A page that cannot be downloaded or parsed is reported and skipped; it
    does not abort the crawl.

    Args:
        start_url (str): The initial URL to start crawling (depth 0).
        max_depth (int): Maximum depth of crawling. Pages deeper than this
            are neither fetched nor queued.
        results (list, optional): List to append page records to. Passing a
            list lets the caller watch progress (e.g. from another thread)
            while the crawl is still running. A new list is created when
            omitted.

    Returns:
        list: The list of page records (the ``results`` list itself when one
        was given). Each record is a dict with the keys 'url', 'depth',
        'title', 'topic', 'word_count', 'char_count' and 'page_text'.
    """
    if results is None:
        results = []

    site_netloc = urlparse(start_url).netloc  # computed once, not per link
    seen = {start_url}  # every URL ever queued or rejected, to avoid repeats
    url_queue = deque([(start_url, 0)])  # Queue for BFS crawling

    while url_queue:
        url, depth = url_queue.popleft()

        # Stop if max depth is reached
        if depth > max_depth:
            continue

        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # Raise error for bad responses
        except requests.exceptions.RequestException as e:
            print(f"Error accessing {url}: {e}")
            continue
        finally:
            # Rate-limit after every request, including the failed ones
            time.sleep(REQUEST_DELAY)

        # Skip responses that declare a non-HTML type (PDFs, images, ...);
        # a missing header is given the benefit of the doubt.
        content_type = response.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            continue

        try:
            tree = _parse_html(response)
        except Exception as e:
            # lxml raises its own error types (e.g. for an empty document);
            # one unparsable page must not discard the whole crawl.
            print(f"Error parsing {url}: {e}")
            continue

        # Extract meaningful content
        page_text = extract_main_content(response.text)  # pass the text, not the tree.

        # Extract title
        title_element = tree.xpath('//title/text()')
        title = title_element[0].strip() if title_element else "Untitled"

        # Store results
        results.append({
            'url': url,
            'depth': depth,
            'title': title,
            'topic': classify_topic(page_text),
            'word_count': len(page_text.split()),
            'char_count': len(page_text),
            'page_text': page_text
        })

        # Links found on a page at max_depth could never be fetched, so do
        # not spend robots.txt checks on them.
        if depth >= max_depth:
            continue

        for link in _same_site_links(tree, url, site_netloc):
            if link in seen:
                continue
            seen.add(link)  # remembered even when disallowed, so it is checked once
            if can_fetch(link):
                url_queue.append((link, depth + 1))

    return results


def _parse_html(response):
    """Parse a response body with lxml, falling back to the raw bytes."""
    try:
        return html.fromstring(response.text)
    except ValueError:
        # lxml rejects str input that carries an XML encoding declaration;
        # the undecoded bytes are accepted.
        return html.fromstring(response.content)


def _same_site_links(tree, base_url, site_netloc):
    """Yield absolute, fragment-free http(s) links in ``tree`` hosted on ``site_netloc``."""
    for href in tree.xpath('//a/@href'):
        try:
            parsed_link = urlparse(urljoin(base_url, href))  # Convert to absolute URL
        except ValueError:
            # Malformed href (e.g. an unbalanced "[" in the host part)
            continue
        # Only follow links within the same domain
        if parsed_link.scheme in ("http", "https") and parsed_link.netloc == site_netloc:
            # "page#section" is the same document as "page": strip the
            # fragment instead of discarding the link.
            yield parsed_link._replace(fragment="").geturl()
##################################################################
def ticker(results, interval):
    """Print a single-line progress report whenever ``len(results)`` changes.

    Loops forever and never returns; intended to be run in a background
    thread.

    Args:
        results (list): Collection whose length is polled.
        interval (float): Seconds to sleep between polls.
    """
    started_at = time.time()
    reported_count = 0
    while True:
        current_count = len(results)
        if current_count == reported_count:
            # Nothing new since the last report; poll again later
            time.sleep(interval)
            continue
        elapsed_time = time.time() - started_at
        # '\r' keeps the report on one console line, overwritten each time
        print(f"Processed {current_count} URLs. Elapsed time: {elapsed_time:.2f} seconds", end='\r')
        reported_count = current_count
        time.sleep(interval)

##################################################################
# Parsed robots.txt per site, keyed by robots.txt URL. None marks a site whose
# robots.txt could not be retrieved (treated as "everything allowed").
_ROBOTS_CACHE = {}

def can_fetch(url, user_agent="*"):
    """Checks if a URL can be fetched according to robots.txt.

    robots.txt is downloaded at most once per site and the parsed rules are
    reused for later calls; previously it was re-downloaded for every URL
    checked.

    Args:
        url (str): Absolute URL to check.
        user_agent (str): User agent the rules are evaluated for.

    Returns:
        bool: The parser's verdict for ``url``, or True when robots.txt
        could not be retrieved.
    """
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

    if robots_url not in _ROBOTS_CACHE:
        rp = RobotFileParser()
        rp.set_url(robots_url)
        try:
            rp.read()
        except Exception as e:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt.
            # The failure is cached too, so an unreachable robots.txt is not
            # retried for every link.
            print(f"Could not read {robots_url}: {e}")
            rp = None
        _ROBOTS_CACHE[robots_url] = rp

    rp = _ROBOTS_CACHE[robots_url]
    if rp is None:
        return True  # if robots.txt is unavailable, assume we can crawl.
    return rp.can_fetch(user_agent, url)
##################################################################
def main():
    """Crawl the ASU site, print a summary table and save the results as CSV.

    Starts a background ticker thread, crawls from the start URL with
    get_url_tree(), prints every column except 'page_text' and writes all
    columns to "webpage_analysis_v2.csv" in the current working directory.
    """
    start_url = "https://www.asu.edu/"
    max_depth = 9 # Limit the depth of crawling
    results = []

    # Start the ticker thread
    # NOTE(review): the crawl call below does not share `results` with the
    # crawler, so this list stays empty and the ticker never has progress to
    # report. The thread also has no stop condition: being a daemon it only
    # ends when the interpreter (or notebook kernel) exits.
    ticker_thread = threading.Thread(target=ticker, args=(results, 2))
    ticker_thread.daemon = True
    ticker_thread.start()

    # Start crawling
    url_data = get_url_tree(start_url, max_depth=max_depth)

    # timer
    time.sleep(1)

    # Convert results to a DataFrame. Naming the columns explicitly keeps the
    # frame well-formed when the crawl returned nothing (e.g. the start URL
    # was unreachable); without it the column selection below raised KeyError
    # and no CSV was written.
    columns = ['url', 'depth', 'title', 'topic', 'word_count', 'char_count', 'page_text']
    df = pd.DataFrame(url_data, columns=columns)

    # Save results to CSV
    print("\n\nFinal Results:")
    print(df[['url', 'depth', 'title', 'topic', 'word_count', 'char_count']])
    df.to_csv("webpage_analysis_v2.csv", index=False)

# Entry point: start the crawl only when this code is executed directly,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()