<a href="https://colab.research.google.com/github/sooryendhu/Steps_ai/blob/main/steps_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install requests beautifulsoup4




In [1]:
# WEB SCRAPING/CRAWLING

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

def get_links(url, level, max_depth, visited):
    if level > max_depth or url in visited:
        return []

    visited.add(url)
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a.get('href') for a in soup.find_all('a', href=True)]
    full_links = [urljoin(url, link) for link in links]

    return full_links

def scrape_website(start_url, max_depth):
    visited = set()
    to_visit = [(start_url, 0)]
    scraped_data = []

    while to_visit:
        current_url, level = to_visit.pop(0)
        if current_url in visited:
            continue

        print(f"Scraping {current_url} at level {level}")
        visited.add(current_url)

        try:
            response = requests.get(current_url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {current_url}: {e}")
            continue

        page_content = response.text
        scraped_data.append({
            'url': current_url,
            'content': page_content,
        })

        if level < max_depth:
            links = get_links(current_url, level, max_depth, visited)
            to_visit.extend([(link, level + 1) for link in links])


        time.sleep(1)

    return scraped_data

if __name__ == "__main__":
    start_url = "https://docs.nvidia.com/cuda/"
    max_depth = 5
    data = scrape_website(start_url, max_depth)


    with open("scraped_data.json", "w") as f:
        import json
        json.dump(data, f, indent=4)

    print("Scraping completed. Data saved to scraped_data.json.")


Scraping https://docs.nvidia.com/cuda/ at level 0
Scraping completed. Data saved to scraped_data.json.


In [4]:
import json

with open("scraped_data.json", "r") as f:
    data = json.load(f)


for entry in data[:5]:
    print(f"URL: {entry['url']}")
    print(f"Content snippet: {entry['content'][:200]}...")  # Print the first 200 characters
    print("\n")


URL: https://docs.nvidia.com/cuda/
Content snippet: <!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />

  <meta name="viewpo...




In [24]:
#DATA CHUNKING

In [5]:
import json
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')


def get_embeddings(sentences):
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.numpy()


with open("scraped_data.json", "r") as f:
    data = json.load(f)

chunks = []


def chunk_data(content):
    sentences = content.split('. ')
    embeddings = get_embeddings(sentences)

    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit(embeddings)
    labels = clustering.labels_

    chunk_dict = {}
    for label, sentence in zip(labels, sentences):
        if label not in chunk_dict:
            chunk_dict[label] = []
        chunk_dict[label].append(sentence)

    for label, sentences in chunk_dict.items():
        chunks.append(' '.join(sentences))

for entry in data:
    chunk_data(entry['content'])


with open("chunks.json", "w") as f:
    json.dump(chunks, f, indent=4)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [25]:
#DATA EMBEDDING

In [6]:

chunk_embeddings = get_embeddings(chunks)


np.save("chunk_embeddings.npy", chunk_embeddings)


In [7]:
pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


In [26]:
#VECTORIZATION

In [8]:
import faiss
import numpy as np
import json


chunk_embeddings = np.load("chunk_embeddings.npy")


urls = [entry['url'] for entry in data for _ in range(len(chunks))]


with open("metadata.json", "w") as f:
    json.dump(urls, f, indent=4)


dimension = chunk_embeddings.shape[1]
index = faiss.IndexHNSWFlat(dimension, 32)

# Add the embeddings to the index
index.add(chunk_embeddings)

# Save the index to a file
faiss.write_index(index, "faiss_index.idx")

print("FAISS index created and saved to faiss_index.idx")


FAISS index created and saved to faiss_index.idx


In [27]:
#RETRIVAL AND RE RANKING

In [9]:
import faiss
import numpy as np
import json

# Load the FAISS index
index = faiss.read_index("faiss_index.idx")

# Load the metadata
with open("metadata.json", "r") as f:
    urls = json.load(f)

# Function to perform a similarity search
def search(query, k=5):
    query_embedding = get_embeddings([query])[0]
    query_embedding = np.expand_dims(query_embedding, axis=0)
    distances, indices = index.search(query_embedding, k)
    results = []
    for idx in indices[0]:
        results.append(urls[idx])
    return results

# Example usage
query = "CUDA installation instructions"
results = search(query)
print("Top 5 similar chunks:")
for result in results:
    print(result)


Top 5 similar chunks:
https://docs.nvidia.com/cuda/
https://docs.nvidia.com/cuda/
https://docs.nvidia.com/cuda/
https://docs.nvidia.com/cuda/
https://docs.nvidia.com/cuda/


In [10]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load pre-trained BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def expand_query(query, top_k=5):
    # Tokenize and encode the query
    inputs = tokenizer(query, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state.mean(dim=1).numpy()

    # Assuming `embedding_index` contains embeddings for possible synonyms or related terms
    distances, indices = index.search(query_embedding, top_k)
    expanded_queries = [index_to_query[idx] for idx in indices[0]]
    return expanded_queries


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Sample documents for TF-IDF
documents = chunks  # Assuming chunks contains your text data

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

def tfidf_retrieve(query, top_k=5):
    query_vector = tfidf_vectorizer.transform([query])
    cosine_similarities = np.dot(tfidf_matrix, query_vector.T).toarray().flatten()
    top_indices = cosine_similarities.argsort()[-top_k:][::-1]
    return [(idx, cosine_similarities[idx]) for idx in top_indices]


In [12]:
import faiss

# Load the FAISS index
index = faiss.read_index("faiss_index.idx")

# Function to get dense retrieval results
def dense_retrieve(query, top_k=5):
    query_embedding = get_embeddings([query])[0]
    query_embedding = np.expand_dims(query_embedding, axis=0)
    distances, indices = index.search(query_embedding, top_k)
    return [(idx, distances[0][i]) for i, idx in enumerate(indices[0])]


In [13]:
def retrieve_with_hybrid(query, top_k=5):
    # Retrieve using TF-IDF
    tfidf_results = tfidf_retrieve(query, top_k)

    # Retrieve using FAISS
    dense_results = dense_retrieve(query, top_k)

    # Combine results
    combined_results = tfidf_results + dense_results

    # Optionally re-rank combined results here if needed
    return sorted(combined_results, key=lambda x: -x[1])


In [14]:
import numpy as np

# Load the precomputed chunk embeddings
dpr_embeddings = np.load('/content/chunk_embeddings.npy')


In [15]:
def rerank_results(results, query_embedding):
    rerank_scores = []
    for idx, _ in results:
        idx = int(idx)  # Ensure proper type conversion
        doc_embedding = dpr_embeddings[idx]
        # Compute cosine similarity
        similarity = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
        rerank_scores.append((idx, similarity))

    return sorted(rerank_scores, key=lambda x: -x[1])


In [16]:
pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [17]:
import numpy as np


embeddings = np.load('chunk_embeddings.npy')

def query_to_embedding(query):
    inputs = tokenizer(query, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def retrieve_with_hybrid(query, top_k=5):
    query_embedding = query_to_embedding(query)
    distances, indices = index.search(query_embedding, top_k)
    return indices, distances

def rerank_results(indices, query_embedding):
    rerank_scores = []
    for idx in indices:
        doc_embedding = embeddings[int(idx)]
        similarity = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
        rerank_scores.append((idx, float(similarity)))
    return sorted(rerank_scores, key=lambda x: -x[1])

query = "CUDA installation instructions"
indices, distances = retrieve_with_hybrid(query)


indices_flat = indices.flatten()
query_embedding = query_to_embedding(query)


re_ranked_results = rerank_results(indices_flat, query_embedding)

print("Top results after re-ranking:")
for idx, score in re_ranked_results[:5]:
    print(f"Document ID: {int(idx)}, Score: {score:.4f}")


Top results after re-ranking:
Document ID: 4, Score: 0.6993
Document ID: 19, Score: 0.6928
Document ID: 38, Score: 0.6911
Document ID: 41, Score: 0.6839
Document ID: 8, Score: 0.6838


  rerank_scores.append((idx, float(similarity)))


In [18]:
import json


file_path = '/content/chunks.json'


with open(file_path, 'r') as file:
    chunk_data = json.load(file)


print("Type of chunk_data:", type(chunk_data))
print("Sample content of chunk_data:", chunk_data[:5] if isinstance(chunk_data, list) else chunk_data)


Type of chunk_data: <class 'list'>
Sample content of chunk_data: ['<!DOCTYPE html>\r\n<html class="writer-html5" lang="en" >\r\n<head>\r\n  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />\r\n\r\n  <meta name="viewport" content="width=device-width, initial-scale=1.0" />\r\n  <title>CUDA Toolkit Documentation 12.5</title>\r\n      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />\r\n      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />\r\n      <link rel="stylesheet" href="_static/design-style.b7bb847fb20b106c3d81b95245e65545.min.css" type="text/css" />\r\n      <link rel="stylesheet" href="_static/omni-style.css" type="text/css" />\r\n      <link rel="stylesheet" href="_static/api-styles.css" type="text/css" />\r\n    <link rel="shortcut icon" href="_static/favicon.ico"/>\r\n  <!--[if lt IE 9]>\r\n    <script src="_static/js/html5shiv.min.js"></script>\r\n  <![endif]-->\r\n  \r\n    

In [19]:
import numpy as np


embeddings_path = '/content/chunk_embeddings.npy'
embeddings = np.load(embeddings_path)

print(f"Loaded embeddings shape: {embeddings.shape}")


Loaded embeddings shape: (48, 768)


In [28]:
#QUESTION-ANSWERING

In [20]:
import numpy as np
import json


embeddings = np.load('/content/chunk_embeddings.npy')


with open('/content/chunks.json', 'r') as file:
    chunk_data = json.load(file)


print(f"Number of chunks: {len(chunk_data)}")

def query_to_embedding(query):

    inputs = tokenizer(query, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def retrieve_with_hybrid(query, top_k=5):
    query_embedding = query_to_embedding(query)
    distances, indices = index.search(query_embedding, top_k)
    return indices.flatten(), distances

def rerank_results(indices, query_embedding):
    rerank_scores = []
    for idx in indices:
        doc_embedding = embeddings[int(idx)]
        similarity = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
        rerank_scores.append((int(idx), float(similarity)))  # Convert idx to int
    return sorted(rerank_scores, key=lambda x: -x[1])


query = "CUDA installation instructions"


indices, distances = retrieve_with_hybrid(query)


indices_flat = indices.flatten()
query_embedding = query_to_embedding(query)


re_ranked_results = rerank_results(indices_flat, query_embedding)


print("Top results after re-ranking:")
for idx, score in re_ranked_results[:5]:
    idx = int(idx)
    print(f"Document ID: {idx}, Score: {score:.4f}")
    print(f"Content: {chunk_data[idx]}")
    print()


Number of chunks: 48
Top results after re-ranking:
Document ID: 4, Score: 0.6993
Content: The appendices include a list of all CUDA-enabled devices, detailed description of all extensions to the C++ language, listings of supported mathematical functions, C++ features supported in host and device code, details on texture fetching, technical specifications of various devices, and concludes by introducing the low-level driver API.</p>
</dd>
<dt><a class="reference internal" href="cuda-c-best-practices-guide/index.html"><span class="doc">Best Practices Guide</span></a></dt><dd><p>This guide presents established parallelization and optimization techniques and explains coding metaphors and idioms that can greatly simplify programming for CUDA-capable GPU architectures

Document ID: 19, Score: 0.6928
Content: This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by leveraging Turing architectural features.</p>
</dd>
<dt><a class="reference internal

  rerank_scores.append((int(idx), float(similarity)))  # Convert idx to int


In [21]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits) + 1
    answer_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0][start_index:end_index])
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    return answer


top_results = [
    ("The appendices include a list of all CUDA-enabled devices, detailed description of all extensions to the C++ language, listings of supported mathematical functions, C++ features supported in host and device code, details on texture fetching, technical specifications of various devices, and concludes by introducing the low-level driver API.", 0.6993),
    ("This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by leveraging Turing architectural features.", 0.6928),
    ("nvcc accepts a range of conventional compiler options, such as for defining macros and include/library paths, and for steering the compilation process.", 0.6911),
    ("The purpose of this white paper is to discuss the most common issues related to NVIDIA GPUs and to supplement the documentation in the CUDA C++ Programming Guide.", 0.6839),
    ("This document provides guidance to ensure that your software applications are compatible with Turing.", 0.6838)
]

query = "What are the CUDA installation instructions?"


answers = [answer_question(query, content) for content, _ in top_results]


print("Answers to the query:")
for idx, answer in enumerate(answers):
    print(f"Answer {idx + 1}: {answer}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Answers to the query:
Answer 1: low - level driver api
Answer 2: turing architectural features
Answer 3: defining macros and include / library paths
Answer 4: c + + programming guide
Answer 5: guidance to ensure that your software applications are compatible with turing


In [22]:
pip install gradio

Collecting gradio
  Downloading gradio-4.38.1-py3-none-any.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting altair<6.0,>=5.0 (from gradio)
  Downloading altair-5.3.0-py3-none-any.whl (857 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m857.8/857.8 kB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi (from gradio)
  Downloading fastapi-0.111.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.1.0 (from gradio)
  Downloading gradio_client-1.1.0-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [23]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits) + 1
    answer_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0][start_index:end_index])
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    return answer


top_results = [
    ("The appendices include a list of all CUDA-enabled devices, detailed description of all extensions to the C++ language, listings of supported mathematical functions, C++ features supported in host and device code, details on texture fetching, technical specifications of various devices, and concludes by introducing the low-level driver API.", 0.6993),
    ("This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by leveraging Turing architectural features.", 0.6928),
    ("nvcc accepts a range of conventional compiler options, such as for defining macros and include/library paths, and for steering the compilation process.", 0.6911),
    ("The purpose of this white paper is to discuss the most common issues related to NVIDIA GPUs and to supplement the documentation in the CUDA C++ Programming Guide.", 0.6839),
    ("This document provides guidance to ensure that your software applications are compatible with Turing.", 0.6838)
]

def gradio_interface(query):

    answers = [answer_question(query, content) for content, _ in top_results]
    return answers


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=gr.Textbox(label="Answers"),
    title="CUDA Documentation QA",
    description="Enter your question to get answers from CUDA documentation based on the most relevant sections."
)


iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://fd3b86a6839969174c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


