In [52]:
import html
import math
import os
import re
import time
import unicodedata

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from google.generativeai import configure, GenerativeModel
from langchain.docstore.document import Document
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

In [125]:
# Gemini API configuration.
# The key is read from the GOOGLE_API_KEY environment variable so that it never has
# to be written into (and committed with) the notebook. When the variable is unset
# the previous empty-string placeholder is used, so the cell behaves as before.
configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
model = GenerativeModel("gemini-1.5-pro")

In [126]:
def gemini_text_analysis(prompt, max_retries=3, retry_delay=40.0):
    """Send ``prompt`` to the module-level Gemini ``model`` and return its reply.

    Rate-limit failures (``ResourceExhausted``, the HTTP 429 quota error) are
    retried instead of aborting the whole run on the first one.

    Args:
        prompt: Text passed unchanged to ``model.generate_content``.
        max_retries: How many additional attempts to make after a rate-limit
            error. Values below zero are treated as zero.
        retry_delay: Base pause in seconds before a retry; the pause grows
            linearly (``retry_delay``, ``2 * retry_delay``, ...).

    Returns:
        The response text with leading and trailing whitespace removed.

    Raises:
        Exception: Any error raised by the model call that is not a rate-limit
            error is re-raised immediately; a rate-limit error is re-raised
            once the retries are used up.
    """
    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            response = model.generate_content(prompt)
            return response.text.strip()
        except Exception as exc:
            # The exception class lives in a package this notebook does not
            # import, so it is recognised by its class name.
            rate_limited = type(exc).__name__ == "ResourceExhausted"
            if not rate_limited or attempt == attempts - 1:
                raise
            time.sleep(retry_delay * (attempt + 1))


In [127]:
# Scraping Website
def scrape_website(url, timeout=10, max_scrolls=3, scroll_pause=1.5):
    """Load ``url`` in headless Chrome and return the text of its ``<body>``.

    The page is scrolled to the bottom up to ``max_scrolls`` times, stopping
    early once the document height no longer changes. ``script``, ``style``,
    ``noscript`` and ``iframe`` elements are removed before the text is read.

    Args:
        url: Address of the page to load.
        timeout: Seconds to wait for the ``<body>`` element to be present.
        max_scrolls: Maximum number of scroll-to-bottom rounds.
        scroll_pause: Seconds to sleep after each scroll.

    Returns:
        The body text, or a string starting with ``"Error during scraping: "``
        when waiting, scrolling or extraction fails.

    Raises:
        Exception: Errors from creating the driver or from ``driver.get`` are
            not converted into the error string; they propagate to the caller.
    """
    options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
    ):
        options.add_argument(flag)

    driver = webdriver.Chrome(options=options)
    try:
        # Navigation happens inside the try/finally so that a failing
        # driver.get() can no longer leave the browser process running.
        driver.get(url)

        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Scroll until the page height stops changing or the round limit is hit.
            last_height = driver.execute_script("return document.body.scrollHeight")
            for _ in range(max_scrolls):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scroll_pause)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            driver.execute_script("""
            const elements = document.querySelectorAll('script, style, noscript, iframe');
            elements.forEach(el => el.remove());
        """)

            text = driver.find_element(By.TAG_NAME, "body").text

        except Exception as e:
            # Best-effort contract kept from the original: report the failure as text.
            text = f"Error during scraping: {str(e)}"

    finally:
        driver.quit()

    return text

In [128]:
# Preprocessing Text
def preprocess_text(raw_text):
    """Turn scraped page text into de-duplicated, ASCII-only content lines.

    HTML entities are unescaped and the text is NFKD-normalised and encoded to
    ASCII, dropping every character that has no ASCII form. Each line is then
    stripped and discarded when it is empty, a repeat of an earlier line,
    shorter than 10 characters, contains one of the boilerplate keywords
    (case-insensitive), or consists only of digits, ``:``, ``-``, ``/`` and
    whitespace.

    Args:
        raw_text: Text as returned by the scraper.

    Returns:
        The surviving lines joined with single newlines.
    """
    boilerplate = re.compile(
        r"(copyright|privacy|terms|cookies|login|signup|subscribe)", re.I
    )
    digits_only = re.compile(r"^[\d:\-/\s]+$")

    unescaped = html.unescape(raw_text)
    ascii_text = (
        unicodedata.normalize("NFKD", unescaped).encode("ascii", "ignore").decode()
    )

    already_seen = set()
    kept = []
    for raw_line in ascii_text.splitlines():
        candidate = raw_line.strip()
        if candidate == "" or candidate in already_seen:
            continue
        # Recorded before the filters below, so a rejected line also counts as seen.
        already_seen.add(candidate)
        is_noise = (
            len(candidate) < 10
            or boilerplate.search(candidate) is not None
            or digits_only.match(candidate) is not None
        )
        if not is_noise:
            kept.append(candidate)

    # Kept lines are never empty, so this collapse is only a safety net.
    return re.sub(r"\n{2,}", "\n", "\n".join(kept))

In [129]:
# Splitting Cleaned Text into Chunks
def split_text_into_chunks(cleaned_text, max_len=5000, overlap=50):
    """Cut ``cleaned_text`` into consecutive windows with surrounding context.

    Windows start every ``max_len`` characters. Each stored chunk is its window
    extended by ``overlap`` characters on both sides (clamped to the text
    bounds) and then stripped of surrounding whitespace.

    Args:
        cleaned_text: Text to split.
        max_len: Window size in characters; must be positive.
        overlap: Extra characters taken before and after each window.

    Returns:
        A dict mapping ``"chunk_1"``, ``"chunk_2"``, ... to the chunk text in
        document order. Empty input gives an empty dict.

    Raises:
        ValueError: If ``max_len`` is not positive. Such a value previously made
            the loop run forever because the window start never advanced.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")

    total = len(cleaned_text)
    chunks = {}
    for chunk_id, start in enumerate(range(0, total, max_len), start=1):
        end = min(start + max_len, total)
        buffer_start = max(0, start - overlap)
        buffer_end = min(total, end + overlap)
        chunks[f"chunk_{chunk_id}"] = cleaned_text[buffer_start:buffer_end].strip()
    return chunks

In [130]:
# Storing Chunks in Vector DB
def store_chunks_in_vector_db(text_chunks):
    """Build a FAISS vector store with one document per chunk.

    Args:
        text_chunks: Mapping of chunk name to chunk text. Each name is stored
            in the document metadata under the ``"id"`` key.

    Returns:
        The object returned by ``FAISS.from_documents`` for these documents,
        embedded with the ``all-MiniLM-L6-v2`` sentence-transformer model.
    """
    documents = []
    for chunk_name, chunk_text in text_chunks.items():
        documents.append(
            Document(page_content=chunk_text, metadata={"id": chunk_name})
        )

    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_documents(documents, embedding=embedder)

In [131]:
# Extracting key information and analyzing the documents
def analyze_chunks_from_vector_db(vector_db):
    """Ask Gemini for a structured analysis of every document in the store.

    The documents are read from the docstore's private ``_dict`` mapping and
    processed in that mapping's iteration order, one model call per document.

    Args:
        vector_db: Vector store whose ``docstore._dict`` values expose a
            ``page_content`` attribute.

    Returns:
        A list with one whitespace-stripped analysis string per document.
    """
    def build_prompt(doc):
        # The section text is embedded verbatim between fixed instructions.
        return f"""
You are an expert content analyst. Carefully read the following portion of a webpage and provide a structured analysis.

### Section Content:
{doc.page_content}

### Instructions:
1. Identify the main themes or topics discussed.
2. Extract any specific names, organizations, tools, or locations ,i.e, Named Entities.
3. Explain the purpose or intent of this section in the broader context of the website.

### Output Format:
- Main Topics: ...
- Named Entities: ...
- Purpose/Intent: ...
"""

    stored_docs = vector_db.docstore._dict.values()
    return [gemini_text_analysis(build_prompt(doc)).strip() for doc in stored_docs]


In [132]:
# Generating Final Summary
def generate_final_summary_from_chunks(summaries):
    """Merge the per-section analyses into one summary with a single model call.

    Args:
        summaries: Iterable of analysis strings; they are joined with blank
            lines and embedded in the synthesis prompt.

    Returns:
        Whatever ``gemini_text_analysis`` returns for the synthesis prompt.
    """
    combined = "\n\n".join(summaries)

    def build_prompt():
        # Fixed instructions wrapped around the joined section summaries.
        return f"""
You are a skilled summarization expert. You are provided with separate summaries of various website areas. Your task is to synthesize a final, coherent summary.

### Section-Wise Summaries:
{combined}

### Instructions:
- Remove redundant points.
- Merge related insights.
- Present the summary in a concise, clear and well-organized format.
- Give the full summary in one paragraph

### Final Summary:
"""

    synthesis_prompt = build_prompt()
    final_summary = gemini_text_analysis(synthesis_prompt)
    return final_summary

In [133]:
# Full Pipeline
def full_pipeline(url):
    """Run scrape -> clean -> chunk -> index -> analyse -> summarise for ``url``.

    Args:
        url: Address of the web page to summarise.

    Returns:
        The final summary produced from the per-chunk analyses.
    """
    page_text = preprocess_text(scrape_website(url))
    index = store_chunks_in_vector_db(split_text_into_chunks(page_text))
    section_reports = analyze_chunks_from_vector_db(index)
    return generate_final_summary_from_chunks(section_reports)


In [134]:
# Run the pipeline on one example page and print the resulting summary.
url = "https://en.wikipedia.org/wiki/Indian_Premier_League"
summary = full_pipeline(url)
print("\n Final Webpage Summary:\n", summary, sep="\n")


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
violations {
}
violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 40
}
]