## Collecting the website URLs

In [None]:
import requests
from urllib.parse import urljoin, urlparse, urlunparse
from bs4 import BeautifulSoup
from collections import deque
import time
import csv

def is_valid_url(url):
    """Return True when *url* carries both a scheme and a network location.

    Relative paths, scheme-relative URLs (``//host/path``) and schemes that
    have no host part (``mailto:``, ``javascript:``) all yield False.
    """
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)

def get_base_domain(url):
    """Return the last two dot-separated labels of the URL's host name.

    The host is taken from ``hostname`` rather than ``netloc`` so that any
    ``user:password@`` prefix and ``:port`` suffix are excluded, and so the
    result is lower-cased (host names are case-insensitive). A trailing root
    dot (``example.com.``) is ignored.

    Returns an empty string when the URL has no host.

    NOTE: this is a simple "last two labels" heuristic; it does not consult
    a public-suffix list, so ``a.example.co.uk`` yields ``co.uk``.
    """
    # ``hostname`` is None for URLs without a network location.
    host = (urlparse(url).hostname or "").rstrip(".")
    return ".".join(host.split(".")[-2:])

def normalize_url(url):
    """Return *url* with its fragment removed and trailing path slashes stripped.

    Scheme, network location, params and query are passed through as parsed,
    so two links that differ only by ``#anchor`` or a trailing ``/`` map to
    the same string.
    """
    scheme, netloc, path, params, query, _fragment = urlparse(url)
    trimmed_path = path.rstrip('/')
    # The fragment slot is deliberately left empty.
    return urlunparse((scheme, netloc, trimmed_path, params, query, ''))

def recursive_crawl(start_url, delay=0.5, max_pages=None):
    """Breadth-first crawl of every same-domain page reachable from *start_url*.

    Each fetched page is parsed for ``<a href>`` links. Links on the same
    base domain (as computed by ``get_base_domain``) are classified by the
    extension of their URL path into PDF, image, or HTML links; HTML links
    are queued for crawling in turn.

    Args:
        start_url: URL at which the crawl begins.
        delay: Seconds to sleep before every request.
        max_pages: Optional upper bound on the number of pages requested.
            ``None`` (the default) means no limit.

    Returns:
        A tuple ``(html_links, pdf_links, image_links)`` of sets of
        normalized URLs discovered during the crawl.
    """
    base_domain = get_base_domain(start_url)
    html_links = set()
    pdf_links = set()
    image_links = set()
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')
    # Pages already requested. This must be separate from ``html_links``:
    # a link is recorded in ``html_links`` at the moment it is queued, so
    # using that set as the "seen" check would skip every queued page and
    # the crawl would never get past the start page.
    visited = set()
    queue = deque([normalize_url(start_url)])
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    while queue:
        if max_pages is not None and len(visited) >= max_pages:
            break

        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)

        time.sleep(delay)

        # Keep the try body to the network call so unrelated bugs surface.
        try:
            response = requests.get(current_url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"Error fetching {current_url}: {str(e)}")
            continue

        if response.status_code != 200:
            continue

        # Do not feed binary downloads (archives, media, ...) to the HTML
        # parser. A missing Content-Type header is given the benefit of
        # the doubt.
        content_type = response.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        print(f"Crawling: {current_url} (Found {len(soup.find_all('a'))} links)")

        for a_tag in soup.find_all('a', href=True):
            raw_href = a_tag['href']
            try:
                absolute_url = urljoin(current_url, raw_href)
                normalized_url = normalize_url(absolute_url)
                parsed = urlparse(normalized_url)
                same_site = (
                    is_valid_url(normalized_url)
                    and parsed.scheme in ('http', 'https')
                    and get_base_domain(normalized_url) == base_domain
                )
            except ValueError:
                # A malformed href should only cost us that one link,
                # not the remaining links on the page.
                continue

            if not same_site:
                continue

            # Classify on the path so a query string ("file.pdf?dl=1")
            # does not hide the extension.
            path = parsed.path.lower()
            if path.endswith('.pdf'):
                pdf_links.add(normalized_url)
                print(f"PDF Found: {normalized_url}")
            elif path.endswith(image_extensions):
                image_links.add(normalized_url)
                print(f"Image Found: {normalized_url}")
            elif normalized_url not in html_links:
                html_links.add(normalized_url)
                queue.append(normalized_url)
                print(f"HTML Page Found: {normalized_url}")

    return html_links, pdf_links, image_links

if __name__ == "__main__":
    # Script entry point: crawl the site and dump every discovered link
    # into a three-column CSV, one URL per row.
    start_url = "https://www.langchain.com/"
    print(f"Starting recursive crawl on {start_url}...\n")
    html_links, pdf_links, image_links = recursive_crawl(start_url, delay=0.5)

    # Save to CSV
    fieldnames = ['links', 'pdf_links', 'image_links']
    columns = zip(fieldnames, (html_links, pdf_links, image_links))
    # An explicit encoding keeps URLs with non-ASCII characters from failing
    # on platforms whose default text encoding is not UTF-8.
    with open("website_links.csv", "w", newline='', encoding="utf-8") as csvfile:
        # restval='' leaves the other two columns empty on each row.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='')
        writer.writeheader()

        # HTML links first, then PDF links, then image links. Sets have no
        # defined order, so sort to make the output file reproducible.
        for column, urls in columns:
            for url in sorted(urls):
                writer.writerow({column: url})

    print(f"\nTotal links discovered: {len(html_links) + len(pdf_links) + len(image_links)}")
    print("Results saved to website_links.csv")