In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_urls_from_sitemap(sitemap_url, timeout=30, _visited=None):
    """
    Extracts and cleans URLs from a sitemap or a sitemap index.

    A sitemap index is followed recursively: every child sitemap it lists is
    fetched and its URLs are appended to the result. Each sitemap address is
    fetched at most once per crawl.

    Args:
        sitemap_url: Address of a sitemap or sitemap index document.
        timeout: Seconds to wait on a request before giving up.
        _visited: Internal set of sitemap addresses already fetched during
            this crawl; callers should leave it unset.

    Returns:
        A list of whitespace-stripped URL strings. The list is empty when the
        request fails, the response is not served with an XML content type,
        or the document contains neither a ``sitemapindex`` nor a ``urlset``.
    """
    # Track fetched sitemaps so an index that lists itself (directly or via
    # another index) cannot cause endless recursion.
    if _visited is None:
        _visited = set()
    if sitemap_url in _visited:
        return []
    _visited.add(sitemap_url)

    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(sitemap_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL {sitemap_url}: {e}")
        return []

    if 'xml' not in response.headers.get('Content-Type', ''):
        print(f"Unsupported content type for URL {sitemap_url}")
        return []

    soup = BeautifulSoup(response.content, 'lxml-xml')
    urls = []

    if soup.find('sitemapindex'):
        for sitemap in soup.find_all('sitemap'):
            loc = sitemap.find('loc')
            if loc is None:
                # Malformed entry without a <loc>: nothing to follow.
                continue
            # A separate name keeps the function's own argument intact.
            child_url = loc.text.strip()  # Clean the URL
            print(f'Found sitemap index: {child_url}. Adding URLs to list')
            urls.extend(
                extract_urls_from_sitemap(child_url, timeout=timeout, _visited=_visited)
            )
    elif soup.find('urlset'):
        print('No nested indexes found. Adding URLs to list')
        urls = [loc.text.strip() for loc in soup.find_all('loc')]  # Clean each URL

    return urls


def create_dataframe(urls):
    """
    Wraps a list of URLs in a single-column DataFrame.

    Each element of ``urls`` becomes one row under the ``URL`` column.
    """
    return pd.DataFrame(data=urls, columns=['URL'])

def get_urls_from_robots_or_sitemap(url, timeout=30):
    """
    Determines whether the given URL is a robots.txt or a sitemap, and acts accordingly.

    Args:
        url: Address ending in ``robots.txt``, or the address of a sitemap /
            sitemap index.
        timeout: Seconds to wait for the robots.txt request before giving up.

    Returns:
        A DataFrame with a single ``URL`` column holding every URL found.
        If robots.txt cannot be fetched, an empty DataFrame with that column
        is returned.
    """
    if not url.endswith('robots.txt'):
        # Handling sitemap URL
        return create_dataframe(extract_urls_from_sitemap(url))

    # Handling robots.txt
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return pd.DataFrame(columns=['URL'])

    # Collect the value of every "Sitemap" line. Splitting on the first colon
    # only keeps the colon inside "https://..." intact and also accepts lines
    # written without a space after the field name; the field name itself is
    # compared case-insensitively.
    sitemap_urls = []
    for line in response.text.splitlines():
        field, separator, value = line.partition(':')
        value = value.strip()
        if separator and field.strip().lower() == 'sitemap' and value:
            sitemap_urls.append(value)

    all_urls = []
    for sitemap_url in sitemap_urls:
        all_urls.extend(extract_urls_from_sitemap(sitemap_url))

    return create_dataframe(all_urls)

# Example usage
url = 'https://www.askviable.com/robots.txt'  # Replace with either a robots.txt or a sitemap URL
urls_df = get_urls_from_robots_or_sitemap(url)

# Preview the first rows. Leaving the expression as the cell's last line lets
# the notebook render the DataFrame as a table instead of printed text.
urls_df.head()


In [None]:
# Plain Python list of the values in the DataFrame's URL column.
list_of_urls = urls_df["URL"].to_list()


In [None]:
# Show only a sample: the full list can be very long and would flood the output.
list_of_urls[:10]

In [None]:
from pathlib import Path
import openai
from dotenv import load_dotenv
import os

In [None]:
 # Load API keys
# The .env location can be overridden through the DOTENV_PATH environment
# variable so the notebook is not tied to one machine's directory layout;
# the previous location stays as the default.
dotenv_path = Path(os.getenv("DOTENV_PATH", r"C:\Storage\python_projects\ashvin\.env"))
load_dotenv(dotenv_path=dotenv_path)

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail here with a clear message rather than later inside an API call.
    raise RuntimeError(
        f"OPENAI_API_KEY is not set; checked the environment and {dotenv_path}"
    )
openai.api_key = OPENAI_API_KEY
GPT_MODEL = "gpt-4-1106-preview"

In [None]:
# Prototype on a single known page for now: this replaces whatever
# list_of_urls held before with one hard-coded URL.

list_of_urls = ["https://www.askviable.com/about-viable"]

In [None]:
# Display the list that the loader cell below receives.
list_of_urls

In [None]:
# SimpleWebPageReader
# Load the pages in list_of_urls as documents and build an index from them.

from llama_index import VectorStoreIndex, download_loader

# Obtain the reader class by name (presumably download_loader fetches it from
# a remote loader registry — TODO confirm for the installed llama_index version).
SimpleWebPageReader = download_loader("SimpleWebPageReader")

loader = SimpleWebPageReader()
# All URLs are handed to the loader in a single call.
documents = loader.load_data(urls=list_of_urls)
# show_progress=True presumably makes index construction report progress — confirm.
index = VectorStoreIndex.from_documents(documents, show_progress=True)

In [None]:
# Query interface over the index built in the previous cell.
query_engine = index.as_query_engine()

In [None]:
# Ask one question against the indexed page(s); the last line displays the
# result's `response` attribute (presumably the answer text — confirm).
response = query_engine.query("what is the single most burning pain point viable addresses? An elevator pitch")
response.response