In [1]:
import re
import asyncio
from typing import List, Dict
from urllib.parse import urljoin, urlparse

import trafilatura
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import tiktoken


# =========================
# CONFIG
# =========================
BASE_URL = "https://www.pminterviewprep.club"  ## Pass the website of whom you wish you train the model on. In this version, I am scraping from the website. However, we can also train the model via various knowledge sources including PDFs, FAQs etc.,
MAX_PAGES = 30

CHUNK_SIZE = 500     # tokens
OVERLAP = 100        # tokens
ENCODING = tiktoken.get_encoding("cl100k_base")


# =========================
# LINK FILTERING
# =========================
def is_valid_internal_link(link: str) -> bool:
    if not link:
        return False

    parsed = urlparse(link)
    base_netloc = urlparse(BASE_URL).netloc

    if parsed.netloc and parsed.netloc != base_netloc:
        return False

    skip_keywords = [
        "login", "signup", "dashboard",
        "privacy", "terms", "auth",
        "#"
    ]

    return not any(k in link.lower() for k in skip_keywords)


# =========================
# PAGE FETCH
# =========================
async def fetch_page(page, url: str) -> str | None:
    try:
        await page.goto(url, timeout=20000)
        await page.wait_for_load_state("networkidle")
        return await page.content()
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return None


# =========================
# CLEAN TEXT EXTRACTION
# =========================
def extract_clean_text(html: str) -> str:
    # --- Primary: Trafilatura (best for blogs/guides) ---
    text = trafilatura.extract(
        html,
        favor_recall=True,
        include_tables=True,
        include_comments=False
    )

    if text and len(text) > 400:
        return post_process(text)

    # --- Fallback: BeautifulSoup (marketing pages) ---
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup([
        "script", "style", "noscript",
        "header", "footer", "nav",
        "aside", "form", "button",
        "svg", "iframe"
    ]):
        tag.decompose()

    for tag in soup.find_all(
        attrs={
            "class": re.compile(
                r"(nav|footer|header|menu|cookie|banner|cta|subscribe|pricing|popup)",
                re.I
            )
        }
    ):
        tag.decompose()

    text = soup.get_text(separator="\n")
    return post_process(text)


def post_process(text: str) -> str:
    # Normalize whitespace
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"[ \t]+", " ", text)

    cleaned_lines = []
    for line in text.split("\n"):
        line = line.strip()
        if len(line) < 30:
            continue
        if re.search(
            r"(sign up|subscribe|get started|join now|buy now|book a call)",
            line,
            re.I
        ):
            continue
        cleaned_lines.append(line)

    # Deduplicate consecutive lines
    deduped = []
    prev = None
    for line in cleaned_lines:
        if line != prev:
            deduped.append(line)
        prev = line

    return "\n".join(deduped).strip()


# =========================
# TOKEN-BASED CHUNKING
# =========================
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = OVERLAP
) -> List[str]:
    tokens = ENCODING.encode(text)
    chunks = []

    start = 0
    total = len(tokens)

    while start < total:
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunk = ENCODING.decode(chunk_tokens).strip()

        if len(chunk) > 200:
            chunks.append(chunk)

        start += chunk_size - overlap

    return chunks


# =========================
# CRAWLER (RETURNS CHUNKS)
# =========================
async def crawl_site() -> List[Dict]:
    visited = set()
    to_visit = [BASE_URL]
    chunked_results: List[Dict] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        while to_visit and len(visited) < MAX_PAGES:
            url = to_visit.pop(0)

            if url in visited:
                continue

            print(f"Crawling: {url}")
            visited.add(url)

            html = await fetch_page(page, url)
            if not html:
                continue

            text = extract_clean_text(html)
            if not text or len(text) < 500:
                continue

            chunks = chunk_text(text)

            for i, chunk in enumerate(chunks):
                chunked_results.append({
                    "url": url,
                    "chunk_id": i,
                    "text": chunk
                })

            soup = BeautifulSoup(html, "html.parser")
            for a in soup.find_all("a", href=True):
                link = urljoin(url, a["href"]).split("?")[0]
                if is_valid_internal_link(link) and link not in visited:
                    to_visit.append(link)

        await browser.close()

    return chunked_results


# =========================
# RUN (SCRIPT SAFE)
# =========================
if __name__ == "__main__":
    # Use this only when running as a script:
    results = await crawl_site()

    print(f"\nTotal chunks: {len(results)}\n")

    # Sanity check
    for r in results[:2]:
        print("URL:", r["url"])
        print("CHUNK:", r["text"][:500])
        print("-" * 80)


Crawling: https://www.pminterviewprep.club
Crawling: https://www.pminterviewprep.club/challenge/
Crawling: https://www.pminterviewprep.club/learn/
Crawling: https://www.pminterviewprep.club/jobs/
Crawling: https://www.pminterviewprep.club/resume/
Crawling: https://www.pminterviewprep.club/portfolio/
Crawling: https://www.pminterviewprep.club/community/
Crawling: https://www.pminterviewprep.club/mentorship/
Crawling: https://www.pminterviewprep.club/
Crawling: https://www.pminterviewprep.club/blog/
Crawling: https://www.pminterviewprep.club/resources/
Crawling: https://www.pminterviewprep.club/faq/
Crawling: https://www.pminterviewprep.club/careers/
Crawling: https://www.pminterviewprep.club/subscription/
Crawling: https://www.pminterviewprep.club/refund-policy/
Crawling: https://www.pminterviewprep.club/career-clarity/
Crawling: https://www.pminterviewprep.club/guides/
Crawling: https://www.pminterviewprep.club/guides/how-to-get-hired-at-an-indian-startup-the-inside-scoop/
Crawling: ht

In [2]:
import os
os.environ["GOOGLE_API_KEY"] = ## Pass your Google gemini API key


In [5]:
import os
import numpy as np
import faiss
from typing import List, Dict
import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

EMBEDDING_DIM = 768  # Gemini embedding size


class VectorStore:
    def __init__(self):
        self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
        self.metadata: List[Dict] = []

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        embeddings = []
        for text in texts:
            res = genai.embed_content(
                model="models/text-embedding-004",
                content=text,
                task_type="retrieval_document"
            )
            embeddings.append(res["embedding"])
        return embeddings

    def add_chunks(self, chunks: List[Dict]):
        texts = [c["text"] for c in chunks]
        vectors = np.array(self.embed_texts(texts)).astype("float32")
        self.index.add(vectors)
        self.metadata.extend(chunks)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        q = genai.embed_content(
            model="models/text-embedding-004",
            content=query,
            task_type="retrieval_query"
        )["embedding"]

        q_vec = np.array([q]).astype("float32")
        _, idxs = self.index.search(q_vec, top_k)
        return [self.metadata[i] for i in idxs[0]]


In [17]:
import google.generativeai as genai

# model = genai.GenerativeModel("gemini-1.5-pro")
model = genai.GenerativeModel("models/gemini-2.5-flash")

def answer_with_gemini(query: str, context_chunks):
    context = "\n\n".join([c["text"] for c in context_chunks])

    prompt = f"""
You are a website assistant.
Answer ONLY from the context below.
If the answer is not present, say "I don't know".

Context:
{context}

Question:
{query}
"""

## You can change the prompt as required. I have used a very basic prompt
    
    response = model.generate_content(prompt)
    return response.text

In [26]:
store = VectorStore()
store.add_chunks(results)   # results from your crawler

query = "I a senior product manager. Is this relevant for me?"   ## Pass your query here

hits = store.search(query)
print(answer_with_gemini(query, hits))


Yes, many parts of the context are relevant for a senior product manager.

The context mentions:
*   "with 7 years of experience submitted."
*   Connecting with "experienced Product Leaders from top tech companies."
*   "1:1 sessions tailored to your goals and prep stage" and building a "career roadmap that fits your story and aspirations."
*   Strategies for applying to companies, such as writing a Product Strategy Memo, which "secured a call directly with the founders" for a mentee.
*   The "How to Be Valuable" framework (Acquire New Users, Increase Engagement/Retention, Improve Unit Economics) is relevant for strategic thinking at all levels.
*   Guidance on "Mastering Outreach & The First Call," "Finding & Pitching the Right Opportunities," and "16 Ways to Stand Out."
