# Web Crawling

In [1]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Print the current working directory
print("Current working directory:", current_directory)


Current working directory: /workspace/scraping + llm


In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

# To avoid scraping the same URL multiple times, we'll use a set.
visited_links = set()
scraped_data = []

def scrape_data(url, depth):
    if depth > 5 or url in visited_links:
        return
    
    visited_links.add(url)
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the data from the current URL
        print(f"Scraping URL: {url}")
        page_content = soup.get_text(separator="\n")
        # scraped_data.append(f"URL: {url}\n{page_content[:500]}\n{'-'*80}\n") #(for testing) I am saving only 500 characters from each page since it is taking too much time to save entire content
        scraped_data.append(f"URL: {url}\n{page_content}\n{'-'*80}\n")
        # Find all sub-links on the current page
        for link in soup.find_all('a', href=True):
            sub_link = urljoin(url, link['href'])
            parsed_url = urlparse(sub_link)

            # Ensure the sub_link is within the same domain and is a valid URL
            if parsed_url.netloc == urlparse(url).netloc and parsed_url.scheme in ["http", "https"]:
                scrape_data(sub_link, depth + 1)
    
    except requests.RequestException as e:
        print(f"Failed to retrieve URL: {url} due to {e}")
    except Exception as e:
        print(f"An error occurred while scraping {url}: {e}")

# Start scraping from the parent URL
parent_url = "https://docs.nvidia.com/cuda/"
scrape_data(parent_url, 0)

# Save the parsed data into a text file
with open('scraped_data.txt', 'w', encoding='utf-8') as file:
    for data in scraped_data:
        file.write(data)

print('Scraping completed. Data saved to scraped_data.txt')


Scraping URL: https://docs.nvidia.com/cuda/
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/contents.html
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-libraries
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-toolkit-major-component-versions
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#new-features
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#general-cuda
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-compiler
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-developer-tools
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#resolved-issues
Scraping URL: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id2
Scraping 

KeyboardInterrupt: 

# Data Chunking

In [4]:
# !python -m spacy download en_core_web_sm

import spacy
from sentence_transformers import SentenceTransformer
import numpy as np

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000  # Increase max length to 2 million characters
model = SentenceTransformer('all-MiniLM-L6-v2')

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    
    # Split the data by the separator used in the scraped data file
    pages = data.split("\n" + "-"*80 + "\n")
    parsed_data = []
    
    for page in pages:
        if page.strip():
            parts = page.split("\n", 1)
            url = parts[0].replace("URL: ", "")
            text = parts[1] if len(parts) > 1 else ""
            parsed_data.append({'url': url, 'text': text})
    
    return parsed_data

def chunk_text(text, max_length=1000000):
    """Split text into smaller chunks of specified maximum length."""
    for i in range(0, len(text), max_length):
        yield text[i:i + max_length]

def chunk_data(scraped_data):
    chunked_data = []
    
    for page_data in scraped_data:
        chunks = []
        text = page_data.get('text', '')
        
        # Split text into smaller chunks if it's too long
        text_chunks = chunk_text(text, max_length=1000000)
        
        for text_chunk in text_chunks:
            # Process text using spaCy for sentence segmentation
            doc = nlp(text_chunk)
            
            # Chunk sentences based on similarity
            current_chunk = []
            for sent in doc.sents:
                if current_chunk:
                    # Check semantic similarity between current chunk and new sentence
                    chunk_embedding = model.encode(" ".join([str(s) for s in current_chunk]))
                    sent_embedding = model.encode(sent.text)
                    similarity = np.dot(chunk_embedding, sent_embedding) / (np.linalg.norm(chunk_embedding) * np.linalg.norm(sent_embedding))
                    
                    if similarity < 0.7:  # Threshold for semantic similarity
                        chunks.append(" ".join([str(s) for s in current_chunk]))
                        current_chunk = []
                
                current_chunk.append(sent)
            
            if current_chunk:
                chunks.append(" ".join([str(s) for s in current_chunk]))
        
        page_data['chunks'] = chunks
        chunked_data.append(page_data)
    
    return chunked_data

def save_chunked_data(chunked_data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for page_data in chunked_data:
            for chunk in page_data['chunks']:
                f.write(chunk + '\n')

if __name__ == '__main__':
    input_file = 'scraped_data.txt'
    chunked_text_file = 'chunked_text.txt'
    
    # Load scraped data
    scraped_data = load_data(input_file)
    
    # Chunk the data based on semantic similarity or topics
    chunked_data = chunk_data(scraped_data)
    
    # Save chunked data to text file
    save_chunked_data(chunked_data, chunked_text_file)
    
    print(f'Chunking completed. Data saved to {chunked_text_file}')

Chunking completed. Data saved to chunked_text.txt


In [3]:
import spacy
from sentence_transformers import SentenceTransformer
import numpy as np

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1000000  # Reduced max length for faster processing
model = SentenceTransformer('all-MiniLM-L6-v2')

def load_data(file_path, max_pages=5):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    
    # Split the data by the separator used in the scraped data file
    pages = data.split("\n" + "-"*80 + "\n")
    parsed_data = []
    
    for page in pages[:max_pages]:  # Limit to max_pages
        if page.strip():
            parts = page.split("\n", 1)
            url = parts[0].replace("URL: ", "")
            text = parts[1] if len(parts) > 1 else ""
            parsed_data.append({'url': url, 'text': text})
    
    return parsed_data

def chunk_data(scraped_data, max_sentences=50):
    chunked_data = []
    
    for page_data in scraped_data:
        chunks = []
        text = page_data.get('text', '')
        
        # Process text using spaCy for sentence segmentation
        doc = nlp(text[:100000])  # Limit text length for faster processing
        
        # Chunk sentences based on similarity
        current_chunk = []
        for sent in list(doc.sents)[:max_sentences]:  # Limit number of sentences
            if current_chunk:
                # Check semantic similarity between current chunk and new sentence
                chunk_embedding = model.encode(" ".join([str(s) for s in current_chunk]))
                sent_embedding = model.encode(sent.text)
                similarity = np.dot(chunk_embedding, sent_embedding) / (np.linalg.norm(chunk_embedding) * np.linalg.norm(sent_embedding))
                
                if similarity < 0.7:  # Threshold for semantic similarity
                    chunks.append(" ".join([str(s) for s in current_chunk]))
                    current_chunk = []
            
            current_chunk.append(sent)
        
        if current_chunk:
            chunks.append(" ".join([str(s) for s in current_chunk]))
        
        page_data['chunks'] = chunks
        chunked_data.append(page_data)
    
    return chunked_data

def save_chunked_data(chunked_data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for page_data in chunked_data:
            for chunk in page_data['chunks']:
                f.write(chunk + '\n')

if __name__ == '__main__':
    input_file = 'scraped_data.txt'
    chunked_text_file = 'chunked_text.txt'
    
    # Load scraped data (limited to 5 pages)
    scraped_data = load_data(input_file, max_pages=5)
    
    # Chunk the data based on semantic similarity or topics (limited to 50 sentences per page)
    chunked_data = chunk_data(scraped_data, max_sentences=50)
    
    # Save chunked data to text file
    save_chunked_data(chunked_data, chunked_text_file)
    
    print(f'Chunking completed. Data saved to {chunked_text_file}')

Chunking completed. Data saved to chunked_text_small.txt


# Chunks to Embeddings

In [4]:
import gc
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import re
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = SentenceTransformer('all-MiniLM-L6-v2')
model.to(device)

def chunk_generator(file_path, batch_size=1000):
    with open(file_path, 'r', encoding='utf-8') as f:
        chunks = []
        for line in f:
            chunks.append(line.strip())
            if len(chunks) == batch_size:
                yield chunks
                chunks = []
        if chunks:
            yield chunks

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return [word for word in text.split() if word not in STOPWORDS]

def create_topic_model(chunk_gen, num_topics=10):
    dictionary = corpora.Dictionary()
    corpus = []
    for chunks in chunk_gen:
        preprocessed_chunks = [preprocess_text(chunk) for chunk in chunks]
        dictionary.add_documents(preprocessed_chunks)
        corpus.extend([dictionary.doc2bow(text) for text in preprocessed_chunks])
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
    return lda_model, dictionary

def get_chunk_topic(chunk, lda_model, dictionary):
    bow = dictionary.doc2bow(preprocess_text(chunk))
    topics = lda_model.get_document_topics(bow)
    return max(topics, key=lambda x: x[1])[0] if topics else None

@torch.no_grad()
def process_and_save_chunks(chunks, lda_model, dictionary, output_file, batch_size=64):
    with open(output_file, 'a', encoding='utf-8') as f:
        for i in tqdm(range(0, len(chunks), batch_size), desc="Processing batches"):
            batch_chunks = chunks[i:i + batch_size]
            try:
                embeddings = model.encode(batch_chunks, convert_to_tensor=True, device=device)
                topics = [get_chunk_topic(chunk, lda_model, dictionary) for chunk in batch_chunks]
                
                for chunk, embedding, topic in zip(batch_chunks, embeddings, topics):
                    embedding_str = ' '.join(map(str, embedding.cpu().numpy().tolist()))
                    f.write(f"{chunk}\t{embedding_str}\t{topic}\n")
            except Exception as e:
                print(f"Error processing batch {i//batch_size}: {e}")
            
            torch.cuda.empty_cache()

if __name__ == '__main__':
    chunked_text_file = 'chunked_text.txt'
    output_file = 'chunked_data_with_embeddings.txt'
    
    try:
        print("Creating topic model...")
        lda_model, dictionary = create_topic_model(chunk_generator(chunked_text_file))
        
        print("Processing chunks and saving data...")
        for chunks in chunk_generator(chunked_text_file):
            process_and_save_chunks(chunks, lda_model, dictionary, output_file, batch_size=64)
            gc.collect()
        
        print(f'Embedding conversion and topic modeling completed. Data saved to {output_file}')
    except Exception as e:
        print(f"Error: {e}")

Using device: cuda
Creating topic model...
Processing chunks and saving data...


Processing batches: 100%|██████████| 3/3 [00:00<00:00, 19.38it/s]


Embedding conversion and topic modeling completed. Data saved to chunked_data_with_embeddings.txt


In [68]:
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import wordnet
import re

# Download required NLTK data
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def load_index_and_metadata(index_type):
    index = faiss.read_index(f"cuda_docs_{index_type.lower()}.index")
    with open(f"cuda_docs_{index_type.lower()}_metadata.pkl", 'rb') as f:
        metadata = pickle.load(f)
    return index, metadata

def create_bm25_index(metadata):
    tokenized_corpus = [doc[0].split() for doc in metadata]
    return BM25Okapi(tokenized_corpus)

def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def expand_query(query, top_k=3):
    expanded_terms = []
    for word in query.split():
        synsets = wordnet.synsets(word)
        word_expanded = []
        for synset in synsets:
            for lemma in synset.lemmas():
                if lemma.name() != word and lemma.name() not in word_expanded:
                    word_expanded.append(lemma.name())
                    if len(word_expanded) == top_k:
                        break
            if len(word_expanded) == top_k:
                break
        expanded_terms.extend(word_expanded)
    return query + ' ' + ' '.join(expanded_terms)

def pseudo_relevance_feedback(query_vector, index, metadata, top_k=5, alpha=0.3):
    # Perform initial search
    distances, indices = index.search(query_vector.reshape(1, -1), top_k)
    
    # Get the top-k documents
    top_docs = [metadata[i][0] for i in indices[0]]
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(top_docs)
    
    # Calculate centroid of top-k documents
    centroid = tfidf_matrix.mean(axis=0)
    
    # Expand query vector
    expanded_query_vector = query_vector + alpha * model.encode(vectorizer.get_feature_names_out()[centroid.argmax()])
    
    return expanded_query_vector

def hybrid_search(dense_index, bm25_index, metadata, query, k=5, alpha=0.5, use_query_expansion=True, use_prf=True):
    # Preprocess and optionally expand the query
    preprocessed_query = preprocess_text(query)
    if use_query_expansion:
        expanded_query = expand_query(preprocessed_query)
    else:
        expanded_query = preprocessed_query
    
    # Dense retrieval
    query_vector = model.encode(expanded_query)
    if use_prf:
        query_vector = pseudo_relevance_feedback(query_vector, dense_index, metadata)
    dense_distances, dense_indices = dense_index.search(query_vector.reshape(1, -1), k*2)
    
    # BM25 retrieval
    bm25_scores = bm25_index.get_scores(expanded_query.split())
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:k*2]
    
    # Combine results
    combined_scores = {}
    for i, idx in enumerate(dense_indices[0]):
        combined_scores[idx] = alpha * (1 - dense_distances[0][i])  # Convert distance to similarity
    
    for i, idx in enumerate(bm25_top_indices):
        if idx in combined_scores:
            combined_scores[idx] += (1 - alpha) * (bm25_scores[idx] / max(bm25_scores))
        else:
            combined_scores[idx] = (1 - alpha) * (bm25_scores[idx] / max(bm25_scores))
    
    # Sort and get top k results
    top_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)[:k]
    
    results = []
    for idx in top_indices:
        chunk, topic, url = metadata[idx]
        results.append({
            "chunk": chunk,
            "topic": topic,
            "url": url,
            "score": combined_scores[idx]
        })
    return results

def main():
    index_type = "FLAT"  # or "IVF"
    dense_index, metadata = load_index_and_metadata(index_type)
    bm25_index = create_bm25_index(metadata)
    
    while True:
        query = input("Enter your query (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break
        
        results = hybrid_search(dense_index, bm25_index, metadata, query, k=5, alpha=0.5, use_query_expansion=True, use_prf=True)
        
        print("\nSearch Results:")
        for i, result in enumerate(results, 1):
            print(f"{i}. Chunk: {result['chunk'][:100]}...")
            print(f"   Topic: {result['topic']}")
            print(f"   URL: {result['url']}")
            print(f"   Score: {result['score']}")
            print("---")

if __name__ == "__main__":
    main()

Enter your query (or 'quit' to exit):  What is Cuda



Search Results:
1. Chunk: Maxwell is NVIDIAâs 4th-generation architecture for CUDA compute applications....
   Topic: 9
   URL: https://docs.nvidia.com/cuda/
   Score: 0.5
---
2. Chunk: Pascal is NVIDIAâs 5th-generation architecture for CUDA compute applications....
   Topic: 2
   URL: https://docs.nvidia.com/cuda/
   Score: 0.5
---
3. Chunk: Volta is NVIDIAâs 6th-generation architecture for CUDA compute applications....
   Topic: 5
   URL: https://docs.nvidia.com/cuda/
   Score: 0.5
---
4. Chunk: Turing is NVIDIAâs 7th-generation architecture for CUDA compute applications....
   Topic: 9
   URL: https://docs.nvidia.com/cuda/
   Score: 0.5
---
5. Chunk: This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by le...
   Topic: 8
   URL: https://docs.nvidia.com/cuda/
   Score: 0.45411295605086405
---


Enter your query (or 'quit' to exit):  quit


In [70]:
from openai import OpenAI
import os
from typing import List, Dict
import numpy as np
from sentence_transformers import SentenceTransformer

# Set your OpenAI API key directly here
openai_key = "OPENAI_KEY"

# Initialize OpenAI client
client = OpenAI(api_key=openai_key)

# Initialize SentenceTransformer model
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def prepare_context(results: List[Dict], query: str) -> str:
    """Prepare the context for the LLM from the search results, ranking by relevance to the query."""
    # Encode the query and chunks
    query_embedding = sentence_model.encode(query)
    chunk_embeddings = sentence_model.encode([result['chunk'] for result in results])
    
    # Calculate cosine similarities
    similarities = np.dot(chunk_embeddings, query_embedding) / (np.linalg.norm(chunk_embeddings, axis=1) * np.linalg.norm(query_embedding))
    
    # Sort results by similarity
    sorted_results = [result for _, result in sorted(zip(similarities, results), key=lambda x: x[0], reverse=True)]
    
    context = "Here are some relevant passages from the CUDA documentation, ordered by relevance:\n\n"
    for i, result in enumerate(sorted_results, 1):
        context += f"{i}. {result['chunk']}\n\n"
    return context

def answer_question(query: str, results: List[Dict]) -> str:
    """Use GPT to answer the question based on the retrieved and ranked results."""
    context = prepare_context(results, query)
    
    messages = [
        {"role": "system", "content": """You are a helpful assistant that answers questions about CUDA based on the provided context. 
        Follow these guidelines:
        1. Always base your answers on the information provided in the context.
        2. If the answer cannot be found in the context, clearly state that you don't have enough information to answer accurately.
        3. If the context contains conflicting information, mention this and explain the different viewpoints.
        4. Use technical terms correctly and explain them if they're complex.
        5. If appropriate, structure your answer with bullet points or numbered lists for clarity.
        6. Cite the relevant passage numbers from the context to support your answer.
        7. If the user's question is unclear, ask for clarification before attempting to answer."""},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"}
    ]
    
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.3,  # Lower temperature for more focused answers
    )
    
    return response.choices[0].message.content.strip()

def main():
    index_type = "FLAT"  # or "IVF"
    dense_index, metadata = load_index_and_metadata(index_type)  # Assuming these functions are defined elsewhere
    bm25_index = create_bm25_index(metadata)  # Assuming this function is defined elsewhere
    
    while True:
        query = input("Enter your question about CUDA (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break
        
        results = hybrid_search(dense_index, bm25_index, metadata, query, k=5, alpha=0.5, use_query_expansion=True, use_prf=True)
        
        print("\nRetrieved Passages:")
        for i, result in enumerate(results, 1):
            print(f"{i}. {result['chunk'][:100]}...")
        
        answer = answer_question(query, results)
        print("\nAnswer:")
        print(answer)
        print("---")

if __name__ == "__main__":
    main()

Enter your question about CUDA (or 'quit' to exit):  What is CUDA



Retrieved Passages:
1. Maxwell is NVIDIAâs 4th-generation architecture for CUDA compute applications....
2. Pascal is NVIDIAâs 5th-generation architecture for CUDA compute applications....
3. Volta is NVIDIAâs 6th-generation architecture for CUDA compute applications....
4. Turing is NVIDIAâs 7th-generation architecture for CUDA compute applications....
5. This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by le...

Answer:
CUDA stands for Compute Unified Device Architecture. It is a parallel computing platform and application programming interface (API) model created by NVIDIA. CUDA allows developers to utilize the power of NVIDIA GPUs for general-purpose processing tasks, enabling them to accelerate computations that can benefit from parallel processing. 

In the context provided, CUDA is specifically mentioned in relation to different generations of NVIDIA GPU architectures designed for CUDA compute applications. These architectur

Enter your question about CUDA (or 'quit' to exit):  quit


In [73]:
import gradio as gr
from QALLM import answer
from VectorDatabase-Retrieval-Reranking import hybrid_search
def main():
    index_type = "FLAT"  # or "IVF"
    dense_index, metadata = load_index_and_metadata(index_type)  # Assuming these functions are defined elsewhere
    bm25_index = create_bm25_index(metadata)  # Assuming this function is defined elsewhere
    
    def inference(query):
        nonlocal dense_index, bm25_index, metadata
        results = hybrid_search(dense_index, bm25_index, metadata, query, k=5, alpha=0.5, use_query_expansion=True, use_prf=True)
        answer = answer_question(query, results)
        return answer

    iface = gr.Interface(
        fn=inference,
        inputs="text",
        outputs="text",
        title="CUDA Documentation Assistant",
        description="Ask a question about CUDA documentation.",
        theme="huggingface",
        examples=[["How to use CUDA with Python?"]],
    )
    iface.launch(share=True)

if __name__ == "__main__":
    main()











Sorry, we can't find the page you are looking for.


Running on local URL:  http://127.0.0.1:7874
Running on public URL: https://7a6481bb5c5242d47f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
