In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import openai
from dotenv import load_dotenv
import os
from llama_index import VectorStoreIndex, download_loader

# Function to extract URLs from a sitemap
def extract_urls_from_sitemap(sitemap_url, timeout=30, _visited=None):
    """Collect the URLs listed by a sitemap, following nested sitemap indexes.

    The document at ``sitemap_url`` is downloaded and parsed as XML. If it is
    a ``<sitemapindex>``, each child ``<sitemap><loc>`` is fetched recursively
    and the results are concatenated in document order. If it is a
    ``<urlset>``, the stripped text of every ``<loc>`` element is returned.

    Args:
        sitemap_url: URL of the sitemap (or sitemap index) to read.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.
        _visited: Internal. Set of sitemap URLs already processed during this
            traversal; callers should leave it as ``None``.

    Returns:
        A list of URL strings. The list is empty when the request fails, the
        response's Content-Type does not mention XML, the sitemap was already
        visited in this traversal, or the document is neither a sitemap index
        nor a URL set.
    """
    if _visited is None:
        _visited = set()
    # A sitemap index may (accidentally) reference itself or form a cycle;
    # processing each URL only once keeps the recursion finite.
    if sitemap_url in _visited:
        print(f"Skipping already-visited sitemap: {sitemap_url}")
        return []
    _visited.add(sitemap_url)

    try:
        response = requests.get(sitemap_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL {sitemap_url}: {e}")
        return []

    if 'xml' not in response.headers.get('Content-Type', '').lower():
        print(f"Unsupported content type for URL {sitemap_url}")
        return []

    soup = BeautifulSoup(response.content, 'lxml-xml')
    urls = []

    if soup.find('sitemapindex'):
        for sitemap in soup.find_all('sitemap'):
            loc = sitemap.find('loc')
            child_url = loc.text.strip() if loc is not None else ''
            if not child_url:
                # Malformed entry without a usable <loc>; nothing to follow.
                continue
            print(f'Found sitemap index: {child_url}. Adding URLs to list')
            urls.extend(
                extract_urls_from_sitemap(child_url, timeout=timeout, _visited=_visited)
            )
    elif soup.find('urlset'):
        print('No nested indexes found. Adding URLs to list')
        stripped = (loc.text.strip() for loc in soup.find_all('loc'))
        urls = [page_url for page_url in stripped if page_url]

    return urls

# Function to create a DataFrame from a list of URLs
def create_dataframe(urls):
    """Wrap a collection of URLs in a DataFrame with a single ``URL`` column.

    Args:
        urls: The URLs to place in the frame, one per row.

    Returns:
        A ``pandas.DataFrame`` whose only column is named ``URL``.
    """
    return pd.DataFrame(data=urls, columns=['URL'])

# Function to get URLs from robots.txt or sitemap
def get_urls_from_robots_or_sitemap(url, timeout=30):
    """Build a DataFrame of page URLs from a robots.txt or sitemap.xml URL.

    * If ``url`` ends with ``robots.txt`` the file is downloaded, every
      ``Sitemap:`` directive is read, and each referenced sitemap is expanded
      with ``extract_urls_from_sitemap``.
    * If ``url`` ends with ``sitemap.xml`` it is expanded directly.
    * Any other URL yields an empty frame.

    Args:
        url: Address of a robots.txt file or a sitemap.xml file.
        timeout: Seconds to wait for the robots.txt request before giving up.

    Returns:
        A ``pandas.DataFrame`` with a single ``URL`` column. It is empty when
        the URL is of neither kind or the robots.txt download fails.
    """
    if url.endswith('robots.txt'):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching URL {url}: {e}")
            return pd.DataFrame(columns=['URL'])

        # Split on the FIRST colon only: the directive name is matched
        # case-insensitively and the space after the colon is optional, so
        # "Sitemap:https://..." and "sitemap: https://..." are both accepted
        # while the colon inside "https://" stays part of the value.
        sitemap_urls = []
        for line in response.text.splitlines():
            field, sep, value = line.partition(':')
            value = value.strip()
            if sep and field.strip().lower() == 'sitemap' and value:
                sitemap_urls.append(value)

        all_urls = []
        for sitemap_url in sitemap_urls:
            all_urls.extend(extract_urls_from_sitemap(sitemap_url))

        return create_dataframe(all_urls)

    elif url.endswith('sitemap.xml'):
        return create_dataframe(extract_urls_from_sitemap(url))

    else:
        return pd.DataFrame(columns=['URL'])

# Function to check and extract from a domain
def check_and_extract_from_domain(url):
    """Discover page URLs for a site via its robots.txt and sitemap.xml.

    Both ``<url>/robots.txt`` and ``<url>/sitemap.xml`` are consulted (any
    trailing slash on ``url`` is ignored when building those addresses). The
    results are merged with duplicates removed, keeping first-seen order; a
    robots.txt that advertises ``/sitemap.xml`` would otherwise contribute
    every URL twice.

    Args:
        url: Base URL of the site, e.g. ``'https://www.example.com'``.

    Returns:
        A list of unique URL strings. If neither source yields anything, the
        list contains only ``url`` itself, exactly as it was passed in.
    """
    base = url.rstrip('/')
    robots_url = base + '/robots.txt'
    sitemap_url = base + '/sitemap.xml'
    urls = []

    robots_df = get_urls_from_robots_or_sitemap(robots_url)
    if not robots_df.empty:
        urls.extend(robots_df['URL'].tolist())

    urls.extend(extract_urls_from_sitemap(sitemap_url))

    # dict.fromkeys keeps insertion order, so this is an order-preserving dedupe.
    urls = list(dict.fromkeys(urls))

    if not urls:
        urls = [url]

    return urls

# Load API keys
# The .env location can be overridden with the DOTENV_PATH environment
# variable; the fallback is the original machine-specific path, so existing
# setups keep working unchanged.
dotenv_path = Path(os.getenv("DOTENV_PATH", r"C:\Storage\python_projects\ashvin\.env"))
load_dotenv(dotenv_path=dotenv_path)

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Fail before the crawl and the embedding run instead of deep inside the
# first API call.
if not OPENAI_API_KEY:
    raise RuntimeError(
        f"OPENAI_API_KEY is not set (checked the environment and {dotenv_path})"
    )
openai.api_key = OPENAI_API_KEY
# NOTE(review): GPT_MODEL is defined here but not passed to the index or the
# query engine in the visible cells — confirm whether it should be.
GPT_MODEL = "gpt-4-1106-preview"

# Example usage
base_url = 'https://www.kraftful.com'
list_of_urls = check_and_extract_from_domain(base_url)

# RAG process
# Load every discovered page, embed it into a vector index, and expose the
# index through a query engine.
SimpleWebPageReader = download_loader("SimpleWebPageReader")
loader = SimpleWebPageReader()
documents = loader.load_data(urls=list_of_urls)
index = VectorStoreIndex.from_documents(documents, show_progress=True)
query_engine = index.as_query_engine()

# Example query
response = query_engine.query("What does this product do?")
print(response.response)


Error fetching URL https://www.kraftful.com/robots.txt: HTTPSConnectionPool(host='www.kraftful.com', port=443): Max retries exceeded with url: /robots.txt (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
No nested indexes found. Adding URLs to list


  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 36/36 [00:01<00:00, 19.15it/s]
Generating embeddings: 100%|██████████| 386/386 [06:48<00:00,  1.06s/it]


This product provides AI tools for product managers and UX researchers to analyze user feedback and data, allowing them to gain insights and make informed decisions. It also helps in summarizing user feedback quickly and efficiently, reducing the time it takes to answer user queries.


In [2]:
# as_chat_engine must be CALLED: without the parentheses this name is bound to
# the method object itself rather than to a chat engine.
query_engine = index.as_chat_engine()


"I'm sorry, but I cannot provide a list of competitors for the product mentioned in the context information. My responses are based solely on the given context and not on any external knowledge or information."