<a href="https://colab.research.google.com/github/saiharshith426/Project-Tasks/blob/main/Sitafal_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 langchain langchain-community faiss-cpu transformers


In [None]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from transformers import pipeline
import os

# Configure the Hugging Face API token.
# setdefault keeps a token that is already exported in the environment; the
# placeholder below is only a fallback and must be replaced with a real key
# (never commit a real key alongside the notebook).
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', "Hugging_face_API_KEY")

# Function to fetch website content
def fetch_website_content(url, timeout=30):
    """Download a web page and return the text content of its HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The string produced by ``BeautifulSoup.get_text()`` on the response
        body parsed with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the connection fails, the
            request times out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never indexed as if it
    # were the site's real content.
    response.raise_for_status()
    parsed_html = BeautifulSoup(response.text, 'html.parser')
    return parsed_html.get_text()  # Extract the raw text content

# Download each site's home page and pair its text with a readable label.
# Pages are fetched in the order listed.
web_pages = [
    {"content": fetch_website_content(address), "source": label}
    for address, label in (
        ("https://www.uchicago.edu/", "University of Chicago"),
        ("https://www.washington.edu/", "University of Washington"),
        ("https://www.stanford.edu/", "Stanford University"),
        ("https://und.edu/", "University of North Dakota"),
    )
]

# Cut every page into overlapping chunks (1000 characters, 200 overlap) and
# wrap each chunk in a Document tagged with the site it came from.
content_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
split_content_list = [
    Document(metadata={"source": entry['source']}, page_content=piece)
    for entry in web_pages
    for piece in content_splitter.split_text(entry['content'])
]

# Set up the BGE embedding model on the CPU with normalized output vectors
embedding_model = HuggingFaceBgeEmbeddings(
    encode_kwargs={"normalize_embeddings": True},
    model_kwargs={"device": "cpu"},
    model_name="BAAI/bge-small-en-v1.5",
)

# Embed every chunk and load the vectors into a FAISS index
vector_database = FAISS.from_documents(
    documents=split_content_list,
    embedding=embedding_model,
)

# Initialize the summarization pipeline
text_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Get the query from the user
user_query = input("Please enter your query: ")

# Search for the three chunks most similar to the query
retrieved_documents = vector_database.similarity_search(user_query, k=3)

# Combine the relevant content into a single passage for the summarizer
combined_content = " ".join(doc.page_content for doc in retrieved_documents)

if combined_content.strip():
    # truncation=True clips the passage to the model's maximum input length;
    # scraped, whitespace-heavy text can tokenize to more than the model
    # accepts, which would otherwise raise inside the model.
    result_summary = text_summarizer(
        combined_content,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    summary_text = result_summary[0]['summary_text']
else:
    # Nothing to summarize: skip the model call rather than feed it an empty string
    result_summary = []
    summary_text = "No relevant content was found for this query."

# Display the final summary
print("\nResponse Summary:")
print(summary_text)
