In [None]:
# === STEP 0: Install dependencies ===
!pip install -q sentence-transformers faiss-cpu scikit-learn langchain google-generativeai requests beautifulsoup4

# === STEP 1: Imports ===
import os
import pickle
import numpy as np
import re
from collections import Counter
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import google.generativeai as genai
import faiss


# === STEP 2: Mount Drive for agent caching ===
# Agent pickles are written to and read from CACHE_DIR on the mounted Drive.
from google.colab import drive
drive.mount('/content/drive')
CACHE_DIR = "/content/drive/MyDrive/agents_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# === STEP 3: Initialize models ===
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# SECURITY: never embed the API key in the notebook source. It is read from the
# GOOGLE_API_KEY environment variable, falling back to a hidden interactive
# prompt so the secret is never stored in the file or echoed into cell output.
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked/rotated.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    from getpass import getpass
    _gemini_api_key = getpass("Enter your Gemini API key: ")
genai.configure(api_key=_gemini_api_key)
model = genai.GenerativeModel("models/gemini-2.5-flash")

# === STEP 4: Utility functions ===

def clean_text(text):
    """Return the lowercase word tokens of *text* as a list.

    A token is a maximal run of regex word characters (letters, digits,
    underscore); punctuation and whitespace act purely as separators.
    """
    lowered = text.lower()
    tokens = re.findall(r'\b\w+\b', lowered)
    return tokens

# Tokens that carry no topical meaning and must not be used as domain labels:
# common English function words plus URL boilerplate.
_KEYWORD_STOPWORDS = frozenset({
    "a", "an", "the", "and", "or", "but", "if", "then", "of", "to", "in",
    "on", "at", "by", "for", "with", "from", "as", "is", "are", "was", "were",
    "be", "been", "being", "it", "its", "this", "that", "these", "those",
    "you", "he", "she", "we", "they", "your", "our", "their", "my", "me",
    "us", "them", "his", "her", "not", "no", "do", "does", "did", "have",
    "has", "had", "can", "could", "will", "would", "should", "may", "might",
    "about", "into", "over", "more", "most", "so", "than", "too", "very",
    "what", "which", "who", "how", "when", "where", "why", "all", "any",
    "each", "also", "just", "up", "out", "there", "here",
    "http", "https", "www", "com",
})


def get_top_keywords(texts, top_n=3, stopwords=None):
    """Return the *top_n* most frequent meaningful words in *texts*.

    Words are tokenized with ``clean_text`` (lowercased). Stop words,
    single-character tokens and purely numeric tokens are ignored; without
    this filter the most frequent words are almost always function words,
    URL fragments or years, which make useless domain labels.

    Args:
        texts: Iterable of strings to count words in.
        top_n: Maximum number of keywords to return.
        stopwords: Optional iterable of lowercase words to ignore instead of
            the built-in list. Single-character and numeric tokens are
            dropped regardless.

    Returns:
        The keywords joined by ", ", most frequent first. An empty string is
        returned when no token survives filtering, so callers can fall back
        to a default name with ``or``.
    """
    ignored = _KEYWORD_STOPWORDS if stopwords is None else frozenset(stopwords)
    counts = Counter(
        word
        for text in texts
        for word in clean_text(text)
        if len(word) > 1 and not word.isdigit() and word not in ignored
    )
    return ", ".join(word for word, _ in counts.most_common(top_n))

def extract_text_from_url(url):
    """Download *url* and return its visible text together with the parsed page.

    Args:
        url: Address of the page to fetch (10 second timeout).

    Returns:
        A ``(text, soup)`` tuple. ``text`` is the page text with
        ``<script>``, ``<style>`` and ``<noscript>`` content removed and all
        whitespace runs collapsed to single spaces; ``soup`` is the parsed
        ``BeautifulSoup`` tree (with those tags already removed). On any
        failure, including an HTTP error status, ``("", None)`` is returned
        and the error is printed.
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures; otherwise the body of an error
        # page ("404 Not Found", ...) would be indexed as real content.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = soup.get_text(separator=' ')
        return ' '.join(text.split()), soup
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort a whole crawl.
        print(f"Failed to fetch {url}: {e}")
        return "", None

def cot_scrape(start_url, max_pages=10):
    """Breadth-first crawl of one site, returning the text of each page.

    Starting at *start_url*, pages are fetched with ``extract_text_from_url``
    and their same-host links are queued, until *max_pages* pages have been
    attempted or no links remain. Pages that fail to load still count toward
    *max_pages*.

    Args:
        start_url: URL where the crawl begins; its host restricts the crawl.
        max_pages: Maximum number of pages to attempt.

    Returns:
        List of non-empty page texts in crawl order.
    """
    from collections import deque
    from urllib.parse import urldefrag

    # Drop "#fragment" parts everywhere: "page" and "page#section" are the
    # same document and would otherwise be scraped (and counted) twice.
    start_url = urldefrag(start_url).url
    domain = urlparse(start_url).netloc

    to_visit = deque([start_url])  # O(1) pops from the left
    queued = {start_url}           # every URL ever enqueued: O(1) dedupe
    visited = set()
    all_texts = []

    while to_visit and len(visited) < max_pages:
        url = to_visit.popleft()
        print(f"Scraping: {url}")
        text, soup = extract_text_from_url(url)
        visited.add(url)
        if text:
            all_texts.append(text)
        if soup is None:
            continue
        for link in soup.find_all('a', href=True):
            full_url = urldefrag(urljoin(url, link['href'])).url
            parsed_url = urlparse(full_url)
            # Only follow web links that stay on the starting host.
            if parsed_url.scheme not in ("http", "https"):
                continue
            if parsed_url.netloc != domain or full_url in queued:
                continue
            queued.add(full_url)
            to_visit.append(full_url)
    return all_texts

def create_faiss_index(texts):
    """Embed *texts* and store the vectors in a flat L2 FAISS index.

    Returns:
        ``(index, docs)`` where ``docs[i]`` is a ``Document`` wrapping
        ``texts[i]`` and row ``i`` of the index is its embedding.
    """
    wrapped_docs = [Document(page_content=text) for text in texts]
    vectors = embedder.encode(texts, show_progress_bar=True)
    # The index dimensionality is taken from the first embedding.
    vector_dim = vectors[0].shape[0]
    flat_index = faiss.IndexFlatL2(vector_dim)
    matrix = np.array(vectors).astype('float32')
    flat_index.add(matrix)
    return flat_index, wrapped_docs

def save_agent(domain, index, docs):
    """Pickle one agent (FAISS index + documents) into ``CACHE_DIR``.

    The file is named ``<domain>_agent.pkl`` with spaces replaced by
    underscores and path separators replaced by dashes, so a domain name can
    never escape the cache directory. Because that mapping is lossy (a domain
    may itself contain underscores), the original domain name is stored in
    the payload under the ``"domain"`` key alongside ``"index"`` and
    ``"docs"``.

    Args:
        domain: Human-readable agent name.
        index: FAISS index holding the agent's embeddings.
        docs: Documents aligned with the rows of *index*.
    """
    safe_domain = (
        domain.replace("/", "-").replace("\\", "-").replace(" ", "_")
    )
    path = os.path.join(CACHE_DIR, f"{safe_domain}_agent.pkl")
    payload = {"index": index, "docs": docs, "domain": domain}
    with open(path, "wb") as f:
        pickle.dump(payload, f)
    print(f"Agent saved: {domain} → {path}")

def load_all_agents(agent_dir):
    """Load every ``*_agent.pkl`` file found in *agent_dir*.

    The agent's domain name is taken from the ``"domain"`` entry of the
    pickled payload when present. For payloads without it, the name is
    derived from the file name by removing the ``_agent.pkl`` suffix and
    turning underscores back into spaces.

    Args:
        agent_dir: Directory containing the pickled agents.

    Returns:
        ``(index_map, docs_map)``: two dicts keyed by domain name holding the
        payload's ``"index"`` and ``"docs"`` entries respectively.
    """
    suffix = "_agent.pkl"
    index_map, docs_map = {}, {}
    # Sorted so the agent order does not depend on the file system.
    for filename in sorted(os.listdir(agent_dir)):
        if not filename.endswith(suffix):
            continue
        # SECURITY: pickle.load can execute arbitrary code. Only point this
        # at a cache directory whose files you created yourself.
        with open(os.path.join(agent_dir, filename), "rb") as f:
            data = pickle.load(f)
        # Strip only the trailing suffix, not every occurrence in the name.
        fallback_domain = filename[:-len(suffix)].replace("_", " ")
        domain = data.get("domain") or fallback_domain
        index_map[domain] = data["index"]
        docs_map[domain] = data["docs"]
    return index_map, docs_map

def select_relevant_agents(user_question, domains, threshold=0.75):
    """Choose which agents should answer *user_question*.

    The question and each domain name are embedded and compared by cosine
    similarity. Every domain scoring at least *threshold* is returned, best
    first; if none qualifies, the two best-scoring domains are returned.
    """
    question_vec = embedder.encode([user_question])[0]

    scored = []
    for name, vec in zip(domains, embedder.encode(domains)):
        # The small epsilon guards against division by zero.
        denom = np.linalg.norm(question_vec) * np.linalg.norm(vec) + 1e-10
        scored.append((name, np.dot(question_vec, vec) / denom))
    scored.sort(key=lambda pair: pair[1], reverse=True)

    above_threshold = [name for name, score in scored if score >= threshold]
    if above_threshold:
        return above_threshold
    return [name for name, _ in scored[:2]]

def search_documents(index, docs, query, top_k=3, alpha=0.7):
    """Return the *top_k* documents best matching *query* by a hybrid score.

    ``top_k * 3`` nearest neighbours are fetched from *index* and re-ranked
    with ``alpha * semantic_score + (1 - alpha) * keyword_score``.

    Args:
        index: FAISS index whose rows correspond to *docs*.
        docs: Documents aligned with the rows of *index*.
        query: User query text.
        top_k: Number of documents to return.
        alpha: Weight of the semantic score; the keyword score gets
            ``1 - alpha``.

    Returns:
        Up to *top_k* documents, best first.
    """
    q_vec = embedder.encode([query])[0].astype("float32")
    distances, indices = index.search(q_vec.reshape(1, -1), top_k * 3)
    query_keywords = set(clean_text(query))

    scored = []
    # Walk distances and ids in lockstep instead of looking each id up again
    # with list.index(), which was quadratic in the number of candidates.
    for distance, idx in zip(distances[0], indices[0]):
        # Skip ids outside docs (FAISS reports missing neighbours as -1).
        if not 0 <= idx < len(docs):
            continue
        doc = docs[idx]
        doc_keywords = set(clean_text(doc.page_content))
        # Fraction of query words present in the document, in [0, 1].
        keyword_score = len(query_keywords & doc_keywords) / (len(query_keywords) + 1e-5)
        # Map distance to (0, 1] so both scores share a scale and alpha is a
        # real weighting. The previous 1/(d + 1e-5) was unbounded and swamped
        # the keyword score for near-exact matches. The clamp guards against
        # tiny negative distances from floating-point error.
        semantic_score = 1.0 / (1.0 + max(float(distance), 0.0))
        hybrid_score = alpha * semantic_score + (1 - alpha) * keyword_score
        scored.append((hybrid_score, doc))

    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in scored[:top_k]]

def generate_answer(domain, context, query):
    """Ask the generative model to answer *query* using only *context*.

    The prompt frames the model as a specialist in *domain*. The stripped
    text of the model's response is returned.
    """
    prompt = f"""You are a helpful assistant specialized in the {domain} domain.
Use only the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:"""
    reply = model.generate_content(prompt)
    answer_text = reply.text
    return answer_text.strip()

def synthesize_combined_answer(agent_answers, user_question):
    """Merge several agents' answers into one response via the model.

    Args:
        agent_answers: Iterable of ``(domain, answer, confidence)`` tuples.
        user_question: The question the agents were answering.

    Returns:
        The stripped text of the model's response, which is asked to contain
        its reasoning, a ranked list and a final answer.
    """
    # One section per agent: its answer followed by its confidence.
    sections = []
    for domain, ans, confidence in agent_answers:
        sections.append(f"[{domain} Agent]: {ans}\nConfidence: {confidence:.2f}")
    combined_text = "\n\n".join(sections)

    prompt = f"""
You are an expert assistant that received the following answers from multiple specialized agents for the user's question.

User question: {user_question}

Agent answers: {combined_text}

Compare the answers step by step. Rank them based on relevance and correctness. Then synthesize a final answer by selecting the best one or merging them into a superior response.

Return your reasoning, ranked list, and final answer.
"""
    reply = model.generate_content(prompt)
    return reply.text.strip()

# === STEP 5: Main interactive logic ===

def _cluster_and_save_agents(docs, max_clusters=5):
    """Cluster *docs* by embedding and save one agent per non-empty cluster.

    Each cluster gets its own flat L2 FAISS index and is stored with
    ``save_agent`` under a name built from the cluster's top keywords
    (``Domain_<id>`` when no keyword is available).

    Args:
        docs: Non-empty list of documents exposing ``page_content``.
        max_clusters: Upper bound on the number of clusters/agents.

    Returns:
        List of the domain names that were saved.
    """
    doc_texts = [doc.page_content for doc in docs]
    embeddings = np.asarray(embedder.encode(doc_texts, show_progress_bar=True))
    num_clusters = min(max_clusters, len(docs))
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)

    domain_names = []
    for cluster_id in range(num_clusters):
        cluster_indices = [i for i, label in enumerate(labels) if label == cluster_id]
        if not cluster_indices:
            # Nothing was assigned to this cluster, so there is nothing to index.
            continue
        cluster_texts = [doc_texts[i] for i in cluster_indices]
        cluster_docs = [docs[i] for i in cluster_indices]
        domain_name = get_top_keywords(cluster_texts) or f"Domain_{cluster_id}"
        if domain_name in domain_names:
            # Two clusters can share the same top keywords; disambiguate so
            # the later agent does not overwrite the earlier one's cache file.
            domain_name = f"{domain_name} ({cluster_id})"
        domain_names.append(domain_name)
        cluster_embeddings = embeddings[cluster_indices].astype("float32")
        index = faiss.IndexFlatL2(cluster_embeddings.shape[1])
        index.add(cluster_embeddings)
        save_agent(domain_name, index, cluster_docs)
    return domain_names


def create_agents_from_csv(filename):
    """Build and cache agents from the text file *filename*.

    The file is read as plain UTF-8 text (undecodable bytes are ignored),
    grouped into blocks of five lines, split into overlapping 500-character
    chunks, clustered, and saved as one agent per cluster.
    """
    with open(filename, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.read().splitlines()
    block_size = 5
    all_texts = []
    for start in range(0, len(lines), block_size):
        block = " ".join(lines[start:start + block_size]).strip()
        if block:
            all_texts.append(block)
    print(f"Parsed {len(all_texts)} blocks from CSV.")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    docs = splitter.create_documents(all_texts)
    print(f"Created {len(docs)} documents.")
    if not docs:
        # An empty file would otherwise reach KMeans with n_clusters=0.
        print("No content found in file; no agents created.")
        return
    domain_names = _cluster_and_save_agents(docs)
    print(f"Created agents for domains: {domain_names}")


def create_agents_from_url(url):
    """Build and cache agents from the pages crawled starting at *url*.

    Pages are collected with ``cot_scrape``, split into overlapping
    500-character chunks, clustered, and saved as one agent per cluster.
    """
    all_texts = cot_scrape(url)
    if not all_texts:
        print("No content scraped from URL.")
        return
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    docs = splitter.create_documents(all_texts)
    print(f"Created {len(docs)} documents from URL scraping.")
    if not docs:
        # Guard against KMeans being asked for zero clusters.
        print("No content scraped from URL.")
        return
    domain_names = _cluster_and_save_agents(docs)
    print(f"Created agents for domains: {domain_names}")

def interactive_qa():
    """Run an interactive question/answer loop over the cached agents.

    Agents are loaded from ``CACHE_DIR``. For each question, the relevant
    agents are selected, each answers from its own top documents, and the
    answers are merged into one combined response that is printed. The loop
    ends when the user types ``exit`` or closes/interrupts the input.
    """
    index_map, docs_map = load_all_agents(CACHE_DIR)
    if not index_map:
        print("No agents found in cache. Please create agents first from CSV or URL.")
        return
    chat_history = []  # NOTE(review): recorded but not read anywhere in this function.
    while True:
        try:
            question = input("\nAsk your question (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C or a closed input stream ends the session
            # cleanly instead of with a traceback.
            print("\nGoodbye!")
            break
        if question.lower() == "exit":
            print("Goodbye!")
            break
        if not question:
            # A blank line would otherwise cost one model call per agent
            # plus a synthesis call.
            continue
        selected_domains = select_relevant_agents(question, list(index_map.keys()))
        agent_answers = []
        for domain in selected_domains:
            index = index_map[domain]
            docs = docs_map[domain]
            top_docs = search_documents(index, docs, question)
            context = "\n\n".join(doc.page_content for doc in top_docs)
            answer = generate_answer(domain, context, question)
            agent_answers.append((domain, answer, 0.9))  # skipping confidence scoring for now
        final_answer = synthesize_combined_answer(agent_answers, question)
        chat_history.append(("You", question))
        chat_history.append(("Combined Insight", final_answer))
        print("\n🤖 Combined Insight:\n", final_answer)

# === USAGE ===
# Print the three usage hints, one per line.
print("\n".join((
    "Welcome! To create agents from CSV, call: create_agents_from_csv('yourfile.csv')",
    "To create agents from URL, call: create_agents_from_url('https://example.com')",
    "After agents are created, call interactive_qa() to chat with your multi-agent system.",
)))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Welcome! To create agents from CSV, call: create_agents_from_csv('yourfile.csv')
To create agents from URL, call: create_agents_from_url('https://example.com')
After agents are created, call interactive_qa() to chat with your multi-agent system.


In [None]:
# Build the agents from the local 'kizen_resources.csv' file and cache them.
create_agents_from_csv('kizen_resources.csv')

Parsed 18262 blocks from CSV.
Created 24753 documents.


Batches:   0%|          | 0/774 [00:00<?, ?it/s]

Agent saved: and, to, the → /content/drive/MyDrive/agents_cache/and,_to,_the_agent.pkl
Agent saved: and, ai, the → /content/drive/MyDrive/agents_cache/and,_ai,_the_agent.pkl
Agent saved: kizen, com, https → /content/drive/MyDrive/agents_cache/kizen,_com,_https_agent.pkl
Agent saved: kizen, com, 1 → /content/drive/MyDrive/agents_cache/kizen,_com,_1_agent.pkl
Agent saved: 2023, 2022, kizen → /content/drive/MyDrive/agents_cache/2023,_2022,_kizen_agent.pkl
Created agents for domains: ['and, to, the', 'and, ai, the', 'kizen, com, https', 'kizen, com, 1', '2023, 2022, kizen']


In [None]:
# Start the question/answer loop over the cached agents (type 'exit' to quit).
interactive_qa()


Ask your question (or 'exit' to quit): Who is the ceo of Kizen?

🤖 Combined Insight:
 **Reasoning:**

The user is asking a specific factual question: "Who is the CEO of Kizen?".
*   **Agent 1 ([and, kizen, to Agent]):** States that the information about the CEO is "not available" based on its provided context. It then provides tangential information about other individuals associated with Kizen as users. While this agent is honest about its limitations, it fails to answer the user's core question.
*   **Agent 2 ([kizen, com, 1 Agent]):** Directly provides a specific name, "John Winner," as the CEO of Kizen. This agent directly addresses the user's question with a definitive answer.

Agent 2 is more relevant and useful as it provides a direct answer to the user's query, whereas Agent 1 indicates a lack of information within its specific dataset. Assuming the information provided by Agent 2 is accurate (which is implied by its direct statement), it is superior.

**Ranked List:**

1.  **