<a href="https://colab.research.google.com/github/sowjanyasajibilli/SithafalTask2/blob/main/SithafalTask2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""Chat with Website Using RAG Pipeline.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1GgnRSWQ9R4WnyDc0OjQZaRgG9-xVx7vO

# 2. Chat with Website Using RAG Pipeline
"""

!pip install requests beautifulsoup4 sentence-transformers faiss-cpu

import os

import faiss
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# API key: read from the environment so the secret is never stored in the
# notebook itself. Set it before running this cell, e.g. `%env GEMINI_API_KEY=...`.
# Falls back to an empty string when the variable is not set.
# NOTE: any key that was previously committed in this file should be treated
# as compromised and rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

# Gemini Authentication
def auth_gemini(timeout=10):
    """Probe the Gemini REST endpoint and fail loudly unless it answers HTTP 200.

    Sends a GET request to ``https://api.gemini.com/v1/pubticker/btcusd`` with
    the module-level ``GEMINI_API_KEY`` in the ``X-GEMINI-APIKEY`` header and
    prints a confirmation message on success.

    NOTE(review): the URL looks like a public market-data (ticker) endpoint, so
    a 200 response presumably shows only that the host is reachable, not that
    the key is valid. Confirm against the provider's API docs, and confirm that
    this key is meant to be sent to this host at all.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block the notebook indefinitely.

    Raises:
        Exception: If the response status code is not 200. The message carries
            the decoded JSON body when there is one, otherwise the raw text.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    headers = {"Content-Type": "application/json", "X-GEMINI-APIKEY": GEMINI_API_KEY}
    res = requests.get(
        "https://api.gemini.com/v1/pubticker/btcusd",
        headers=headers,
        timeout=timeout,
    )
    if res.status_code != 200:
        # Error responses are not guaranteed to be JSON (proxies and gateways
        # return HTML); fall back to the raw body so a decode error does not
        # mask the real failure.
        try:
            detail = res.json()
        except ValueError:
            detail = res.text
        raise Exception(f"Auth failed: {detail}")
    print("Authenticated with Gemini!")

# Fetch website content
def fetch_url(url, timeout=15):
    """Download a web page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive site can stall the whole run.

    Returns:
        The text of every ``<p>`` tag in document order, joined with single
        spaces. Empty string when the page has no ``<p>`` tags.

    Raises:
        Exception: If the server responds with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # Some sites reject the default python-requests User-Agent, so send a
    # browser-like one instead.
    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(url, headers=headers, timeout=timeout)
    if res.status_code != 200:
        raise Exception(f"Failed to fetch {url}: {res.status_code}")
    soup = BeautifulSoup(res.text, "html.parser")
    return " ".join(p.get_text() for p in soup.find_all("p"))

# Split text into chunks
def split_text(txt, size=300):
    """Split text into consecutive chunks of at most ``size`` words each.

    Words are whatever ``str.split()`` yields (runs of non-whitespace), and
    each chunk is re-joined with single spaces, so original whitespace is
    normalised.

    Args:
        txt: The text to split.
        size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings. Empty when ``txt`` contains no words.

    Raises:
        ValueError: If ``size`` is less than 1. A negative step would otherwise
            make ``range`` empty and silently discard all of the text.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    words = txt.split()
    return [' '.join(words[i:i + size]) for i in range(0, len(words), size)]

# Create embeddings and store in FAISS
# Checkpoint name passed to SentenceTransformer. The resulting `model` is
# shared by the indexing and the search helpers, so both embed text with the
# same encoder.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def create_embeddings(chunks):
    """Embed text chunks and load them into a new FAISS L2 index.

    Args:
        chunks: Sequence of text strings to embed with the module-level
            ``model``.

    Returns:
        A ``(index, emb)`` tuple: the ``faiss.IndexFlatL2`` holding the
        vectors, and the embedding array returned by ``model.encode``. Vector
        ``i`` in the index corresponds to ``chunks[i]``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # With nothing to embed there is no embedding dimension to read from
    # `emb.shape[1]`; fail here with a message that names the actual problem.
    if len(chunks) == 0:
        raise ValueError("No text chunks to embed; cannot build a search index.")
    emb = model.encode(chunks)
    index = faiss.IndexFlatL2(emb.shape[1])
    index.add(emb)
    return index, emb

# Search in FAISS
def search_query(q, index, chunks, top_k=5):
    """Return the stored chunks closest to a query, nearest first.

    Args:
        q: The query string; embedded with the module-level ``model``.
        index: Search index exposing ``search(vectors, k)`` (a FAISS index).
        chunks: The texts that were indexed, in the same order as their
            vectors, so that a returned label ``i`` maps to ``chunks[i]``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings in the order the index returned them.
        Fewer are returned when the index holds fewer than ``top_k`` vectors.
    """
    q_vec = model.encode([q])
    _, idxs = index.search(q_vec, top_k)
    # FAISS pads the label row with -1 when it has fewer than `top_k` results.
    # A -1 used as a Python index would silently return the *last* chunk, so
    # drop the padding instead.
    return [chunks[i] for i in idxs[0] if i >= 0]

# Generate response
def gen_response(q, ctxt):
    """Format the query and its retrieved context into one printable string.

    No language model is called here; the result is the query followed by the
    context under fixed labels.

    Args:
        q: The user's query.
        ctxt: The retrieved context text.

    Returns:
        A string of the form ``"Query: <q>\\n\\nContext:\\n<ctxt>"``.
    """
    template = "Query: {q}\n\nContext:\n{ctxt}"
    return template.format(q=q, ctxt=ctxt)

# Main workflow
try:
    # Authenticate
    auth_gemini()

    # Websites to scrape
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/"
    ]

    # Scrape websites
    all_chunks = []
    for url in urls:
        print(f"Fetching: {url}")
        try:
            txt = fetch_url(url)
        except Exception as fetch_err:
            # One unreachable or blocking site should not throw away the text
            # already collected from the others; report it and move on.
            print(f"Skipping {url}: {fetch_err}")
            continue
        chunks = split_text(txt)
        all_chunks.extend(chunks)

    # Without any text there is nothing to index or search; stop with a clear
    # message instead of failing deeper inside the embedding step.
    if not all_chunks:
        raise RuntimeError("No text could be extracted from any of the configured websites.")

    # Store embeddings
    index, embeddings = create_embeddings(all_chunks)

    # Query
    query = input("Enter your query: ")
    retrieved = search_query(query, index, all_chunks)

    # Generate and print response
    context = "\n".join(retrieved)
    print("\nResponse:")
    print(gen_response(query, context))

except Exception as e:
    # Top-level boundary of the demo: report the failure instead of dumping a
    # traceback into the notebook output.
    print("Error:", e)


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Authenticated successfully with Gemini API!
Scraping: https://www.uchicago.edu/
Scraping: https://www.washington.edu/
Scraping: https://www.stanford.edu/
Scraping: https://und.edu/
