<a href="https://colab.research.google.com/github/pokasanthi/Sithafal-tasks/blob/main/sithafal_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install requests beautifulsoup4 sentence-transformers faiss-cpu transformers




In [7]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# 1. Function to fetch and parse content from websites
def fetch_website_text(url, timeout=10):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The text of all paragraph tags joined by single spaces, or an empty
        string if the request fails, times out, or the response status is
        not 200. Failures are reported on stdout rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return ""
        soup = BeautifulSoup(response.content, "html.parser")
        # Only paragraph tags are kept; navigation, scripts, etc. are ignored.
        return " ".join(p.get_text() for p in soup.find_all("p"))
    except Exception as e:
        # Deliberately broad: fetching is best-effort, and one bad site
        # (network error, timeout, unparsable markup) must not stop the run.
        print(f"Error fetching {url}: {e}")
        return ""

# List of example websites
websites = [
    "https://www.uchicago.edu/",
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Download each site in turn; sites that yield no text are left out.
website_texts = []
for url in websites:
    print(f"Fetching content from: {url}")
    text = fetch_website_text(url)
    if not text:
        continue
    # The reported size is the full download; only the first 1000
    # characters are stored for later processing.
    print(f"Fetched {len(text)} characters from {url}\n")
    website_texts.append((url, text[:1000]))

# 2. Chunk the text for embeddings
def chunk_text(data, max_length=512, min_length=50):
    """Split each document into consecutive fixed-size character chunks.

    Args:
        data: Iterable of ``(url, text)`` pairs.
        max_length: Maximum number of characters per chunk. Must be positive.
        min_length: Chunks are kept only if they are strictly longer than
            this many characters, which drops short trailing fragments.
            Note that a ``max_length`` not greater than ``min_length``
            therefore yields no chunks at all.

    Returns:
        A list of ``(url, chunk)`` pairs in document order.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently produces nothing; reject both up front.
        raise ValueError("max_length must be a positive integer")

    chunks = []
    for url, text in data:
        for start in range(0, len(text), max_length):
            chunk = text[start:start + max_length]
            if len(chunk) > min_length:  # Only include meaningful chunks
                chunks.append((url, chunk))
    return chunks

# Chunk the website content using chunk_text's default size limits.
chunked_data = chunk_text(website_texts)
print(f"Total chunks created: {len(chunked_data)}")

# 3. Generate embeddings for each chunk
# The same model instance is used for both the chunks (generate_embeddings)
# and the queries (query_and_search), so their vectors are comparable.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(chunked_data):
    """Embed every chunk with the module-level sentence-transformer model.

    Args:
        chunked_data: Iterable of ``(url, chunk)`` pairs.

    Returns:
        A list of ``(url, embedding)`` pairs in the same order as the input,
        where each embedding is the vector the model produced for that chunk.
        An empty input yields an empty list without calling the model.
    """
    pairs = list(chunked_data)
    if not pairs:
        return []

    urls = [url for url, _ in pairs]
    chunks = [chunk for _, chunk in pairs]
    # One batched call lets the model process the chunks together instead of
    # paying the per-call overhead once for every chunk.
    vectors = model.encode(chunks)
    return list(zip(urls, vectors))

embeddings_with_sources = generate_embeddings(chunked_data)

# Fail early with a clear message: if every fetch failed there is nothing to
# index, and building the matrix below would otherwise die on shape[1].
if not embeddings_with_sources:
    raise RuntimeError("No text chunks were embedded; cannot build the FAISS index.")

# Prepare embeddings for FAISS (it expects a 2-D float32 matrix)
embeddings = np.array([emb for _, emb in embeddings_with_sources]).astype('float32')
# sources[i] is the URL that row i of the index came from.
sources = [url for url, _ in embeddings_with_sources]

# 4. Store embeddings in FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
index.add(embeddings)
print(f"FAISS index created with {index.ntotal} embeddings.")

# Save sources (to map indices back to websites)
np.save("sources.npy", sources)

# 5. Query and search the FAISS index
def query_and_search(query, k=3):
    """Embed *query* and look up its ``k`` nearest chunks in the FAISS index.

    Returns the ``(distances, indices)`` pair produced by ``index.search``.
    """
    query_vector = model.encode([query])
    query_vector = query_vector.astype('float32')
    nearest = index.search(query_vector, k)
    hit_distances, hit_indices = nearest
    return hit_distances, hit_indices

# 6. Summarize the top-k results
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint;
# created once here and reused by every summarize_results() call.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_results(query, k=3):
    """Retrieve the chunks closest to *query*, print them, and print a summary.

    Args:
        query: Natural-language query string.
        k: Number of nearest chunks to request from the index.

    Returns:
        None. The retrieved sources and the summary are written to stdout.
        If nothing could be retrieved, a notice is printed and no summary
        is attempted.
    """
    _, indices = query_and_search(query, k)

    # Retrieve and print top-k results
    top_chunks = []
    for i in indices[0]:
        # FAISS fills missing neighbours with -1 when the index holds fewer
        # than k vectors. A negative index would silently wrap around to the
        # last chunk, so both bounds are checked.
        if not 0 <= i < len(chunked_data):
            continue
        url, chunk = chunked_data[i]
        top_chunks.append(chunk)
        print(f"Source: {url}\nContent: {chunk[:200]}...\n")

    if not top_chunks:
        print("No matching content found to summarize.")
        return

    # Summarize combined top-k chunks
    combined_text = " ".join(top_chunks)
    # truncation=True keeps an over-long combined text within the model's
    # input limit instead of failing inside the model.
    summary = summarizer(
        combined_text,
        max_length=130,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    print("\n**Summary of Retrieved Content:**")
    print(summary[0]['summary_text'])

# 7. Example Query
# Runs the retrieve-then-summarize flow once for a sample question; all
# results are printed by summarize_results().
query = "What can I learn about universities?"
print(f"Query: {query}")
summarize_results(query)


Fetching content from: https://www.uchicago.edu/
Failed to fetch https://www.uchicago.edu/: 403
Fetching content from: https://www.washington.edu/
Fetched 1416 characters from https://www.washington.edu/

Fetching content from: https://www.stanford.edu/
Fetched 3519 characters from https://www.stanford.edu/

Fetching content from: https://und.edu/
Fetched 2753 characters from https://und.edu/

Total chunks created: 6
FAISS index created with 6 embeddings.
Query: What can I learn about universities?
Source: https://und.edu/
Content: The University of North Dakota is the state's oldest and largest university. We offer
                           225+ highly accredited on-campus and online degrees. Explore the causes and impact of c...

Source: https://www.stanford.edu/
Content:  Farm Science & Engineering Health & Medicine Science & Engineering Awards Science & Engineering Science & Engineering Preparing students to make meaningful contributions to society as engaged citizen...

Source: h