In [None]:
!pip install openai requests beautifulsoup4 PyPDF2 gradio matplotlib pandas


In [None]:
!pip install readability-lxml


In [None]:
!pip install faiss-cpu


In [None]:
!pip install playwright
!playwright install


In [None]:
# STEP 1: Install required packages (ensure these are installed once)
# If running on Colab/Kaggle or a fresh environment, run these in your notebook/terminal:
# !pip install openai transformers requests beautifulsoup4 PyPDF2 gradio matplotlib pandas readability-lxml faiss-cpu sentence-transformers playwright openpyxl
# After installing playwright, you MUST also install its browser binaries:
# !playwright install

# STEP 2: Imports and setup
from openai import OpenAI
from transformers import pipeline
import requests
from bs4 import BeautifulSoup
from readability import Document
from PyPDF2 import PdfReader
from urllib.parse import urljoin, urlparse
import io, base64
import pandas as pd
import matplotlib.pyplot as plt
import gradio as gr
from sentence_transformers import SentenceTransformer
import faiss
import textwrap
import torch
from playwright.sync_api import sync_playwright
import threading
import time
from concurrent.futures import ThreadPoolExecutor
import os
import json

# --- Cache location ---
# Root directory for cached knowledge bases. Each product stores two files in
# it, "<product>_faiss_index.bin" and "<product>_chunks.json"; the directory is
# created at import time if it does not exist yet.
BASE_CACHE_DIR = "cached_knowledge_bases"
os.makedirs(BASE_CACHE_DIR, exist_ok=True)

# --- Seed URLs per Zoho product ---
# Each list is handed to the crawler as-is when the matching "Load/Refresh"
# button is clicked; every entry is fetched as a single page.
zoho_books_urls = [
    "https://www.zoho.com/in/books/help/",
    "https://www.zoho.com/in/books/kb/",
    "https://www.zoho.com/books/api/v3/",
    "https://www.zoho.com/in/books/welcome-guide.html",
    "https://www.zoho.com/in/books/accounting-software-features/"
]

zoho_inventory_urls = [
    "https://www.zoho.com/in/inventory/help/",
    "https://www.zoho.com/in/inventory/kb/", # knowledge-base landing page
    "https://www.zoho.com/inventory/api/v1/" # API reference
]

zoho_payroll_urls = [
    "https://www.zoho.com/in/payroll/help/",
    "https://www.zoho.com/in/payroll/kb/" # knowledge-base landing page
]

zoho_creator_urls = [
    "https://www.zoho.com/creator/help/"
]

zoho_analytics_urls = [
    "https://www.zoho.com/analytics/help/overview.html",
    "https://www.zoho.com/analytics/help/",
    "https://www.zoho.com/analytics/api/v2/"
]

# Products whose help content is served from the help.zoho.com portal.
zoho_people_urls = [
    "https://help.zoho.com/portal/en/kb/people/"
]

zoho_recruit_urls = [
    "https://help.zoho.com/portal/en/kb/recruit/"
]

zoho_crm_urls = [
    "https://help.zoho.com/portal/en/kb/crm/"
]

# --- OpenAI API key ---
# Read the key from the OPENAI_API_KEY environment variable so that no secret
# has to be pasted into (and accidentally shared with) the notebook source.
# When the variable is unset the key is an empty string, which the OpenAI
# branch of ask_question() reports as "not configured".
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# STEP 3: QA Pipelines
# Use the first CUDA device when one is available, otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
print(f"Using device for local models: {'GPU (ID 0)' if isinstance(device, int) else 'CPU'}")

# Local, free text-to-text model used for the "Hugging Face (Free)" option.
hf_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=512,
    truncation=True,
    device=device
)

# STEP 4: Text extraction utils
def _parse_html_to_readable_text(html_content):
    """Return the main readable text of an HTML document.

    The markup is reduced to its main content with readability's ``Document``
    and then flattened to newline-separated text with BeautifulSoup.

    This function never raises: every failure is reported as a string that
    starts with ``"[Error"`` (input missing or shorter than 100 characters
    after stripping, no text extracted, or any exception while parsing).
    """
    try:
        too_short = not html_content or len(html_content.strip()) < 100
        if too_short:
            return "[Error: HTML content too short or empty for parsing]"

        main_markup = Document(html_content).summary()
        extracted = BeautifulSoup(main_markup, 'html.parser').get_text(separator="\n", strip=True)
        if extracted:
            return extracted
        return "[Error: No readable text extracted from HTML]"
    except Exception as e:
        return f"[Error parsing HTML]: {e}"

def extract_text_from_file(file_path):
    """Extract text from a PDF or plain-text file.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) of the file to read. A
            path ending in ``.pdf`` (case-insensitive) is parsed with
            ``PdfReader``; anything else is read as UTF-8 text.

    Returns:
        The file's text. For PDFs, the text of every page that yields any
        text, joined with newlines.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a non-PDF file is not valid UTF-8.
    """
    path_text = os.fspath(file_path)
    if path_text.lower().endswith(".pdf"):
        with open(file_path, "rb") as f:
            # Extract each page exactly once; page extraction is the expensive
            # step, so it must not be repeated just to filter out empty pages.
            page_texts = (page.extract_text() for page in PdfReader(f).pages)
            return "\n".join(text for text in page_texts if text)
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

# STEP 5: Crawling utility (still necessary to fetch content from URLs)
def crawl_single_url(base_url):
    """Fetch one page with Playwright and return its readable text.

    Args:
        base_url: Absolute URL to load; must be a string starting with "http".

    Returns:
        A ``(url, text)`` tuple. This function never raises: on an invalid URL
        or any crawling failure ``text`` is a message starting with "[Error".
    """
    if not isinstance(base_url, str) or not base_url.startswith("http"):
        return (base_url, f"[Error: Invalid URL format: {base_url}]")

    print(f"Processing: {base_url}")
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch()
            try:
                page = browser.new_page()
                page.goto(base_url, wait_until='load', timeout=60000)
                page.wait_for_timeout(2000) # Wait for any dynamic JS content
                html_content = page.content()
            finally:
                # Close the browser even when navigation or extraction fails,
                # so a failing page does not skip the explicit shutdown.
                browser.close()
        # Parsing needs only the captured HTML, so it runs after the browser
        # has been released.
        return (base_url, _parse_html_to_readable_text(html_content))
    except Exception as e:
        error_message = f"[Error crawling page]: {e}"
        print(f"   ‚ùå Error at {base_url}: {e}")
        return (base_url, error_message)

# STEP 6: Chart rendering
def generate_chart_from_csv_text(csv_text):
    """Render CSV text as a bar chart and return it as an inline HTML image.

    The first CSV column becomes the x-axis index; every remaining column is
    plotted as one bar series.

    Args:
        csv_text: Text that ``pandas.read_csv`` can parse.

    Returns:
        An ``<img>`` tag with a base64-encoded PNG, or an HTML message starting
        with ``<b>Chart Error:</b>`` when the data has fewer than two columns,
        is empty, or anything fails. This function never raises.
    """
    fig = None
    try:
        # This function assumes CSV data is present directly in the context text.
        # It's an advanced feature and might need fine-tuning based on actual CSV formats.
        df = pd.read_csv(io.StringIO(csv_text))
        if df.empty or len(df.columns) < 2:
            return "<b>Chart Error:</b> Not enough data to generate chart."

        fig, ax = plt.subplots(figsize=(10, 5))
        df.set_index(df.columns[0]).plot(kind="bar", ax=ax)
        # Address the figure and axes explicitly instead of pyplot's implicit
        # "current figure" state, which is shared by every handler thread.
        ax.set_title("Chart from Tabular Data", fontsize=16)
        ax.set_xlabel(df.columns[0], fontsize=12)
        ax.set_ylabel("Value", fontsize=12)
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_horizontalalignment('right')
        ax.legend(title="Metrics", bbox_to_anchor=(1.05, 1), loc='upper left')
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
        encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
        return f'<img src="data:image/png;base64,{encoded}"/>'
    except Exception as e:
        return f"<b>Chart Error:</b> An unexpected error occurred: {e}"
    finally:
        # Release the figure on every path; otherwise a failure after
        # plt.subplots() would leave the figure registered with pyplot.
        if fig is not None:
            plt.close(fig)

# STEP 7: Context store and FAISS setup
# Module-level state describing the currently active knowledge base. These
# names are reassigned by build_faiss_index(), reindex_product_content() and
# load_specific_index_from_disk() whenever a product is (re)loaded.
global_context = {"text": ""} # Combined raw text of the active KB (read by the "chart:" command)
chunk_texts = [] # Text chunks of the active KB; positions line up with the FAISS index rows
vector_index = None # FAISS index of the active KB, or None when nothing is loaded
# Sentence-embedding model shared by indexing and querying; placed on the GPU when CUDA is available.
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda' if torch.cuda.is_available() else 'cpu')

def chunk_text(text, chunk_size=300, overlap=50, min_words=20):
    """Split text into overlapping, whitespace-tokenised word windows.

    Args:
        text: Text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by consecutive chunks. Must be smaller
            than ``chunk_size`` so that the window actually advances.
        min_words: Windows containing this many words or fewer are discarded
            (this mostly drops tiny trailing fragments).

    Returns:
        A list of chunk strings whose words are joined with single spaces.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step would either make range() raise an unrelated
        # "arg 3 must not be zero" error or silently produce no chunks at all.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    # Slice each window once and reuse it for both the length filter and the join.
    windows = (words[start:start + chunk_size] for start in range(0, len(words), step))
    return [" ".join(window) for window in windows if len(window) > min_words]

def build_faiss_index(text_chunks):
    """Embed the given chunks and make them the active FAISS index.

    Side effects: rebinds the module-level ``chunk_texts`` to ``text_chunks``
    and ``vector_index`` to a new flat L2 index (or ``None`` when there is
    nothing to index or the build fails).

    Returns:
        A human-readable status string; this function never raises.
    """
    global vector_index, chunk_texts

    # The active chunk list always follows the most recent build request.
    chunk_texts = text_chunks

    if chunk_texts:
        try:
            vectors = embed_model.encode(chunk_texts, show_progress_bar=False)
            dimension = vectors.shape[1]
            vector_index = faiss.IndexFlatL2(dimension)
            vector_index.add(vectors)
        except Exception as e:
            vector_index = None
            return f"Error building index: {e}"
        return f"Index built with {len(chunk_texts)} chunks."

    vector_index = None
    return "No text found to build index."

def retrieve_top_chunks(query, top_k=5):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: Natural-language question to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        The matching chunks joined by ``"\\n---\\n"``, nearest first, or the
        sentinel message "No document content loaded or indexed for
        retrieval." when no knowledge base is active.
    """
    if vector_index is None or not chunk_texts:
        return "No document content loaded or indexed for retrieval."

    # Never request more neighbours than there are chunks.
    k = min(top_k, len(chunk_texts))
    query_vec = embed_model.encode([query])
    _, indices = vector_index.search(query_vec, k)

    # FAISS pads missing results with -1, which Python would silently treat as
    # "last element"; also skip ids that fall outside the loaded chunk list
    # (e.g. an index file and chunk file that do not belong together).
    hits = [chunk_texts[i] for i in indices[0] if 0 <= i < len(chunk_texts)]
    return "\n---\n".join(hits)

# STEP 8: QA logic
def ask_question(query, model_type):
    """Answer ``query`` from the active knowledge base with the chosen model.

    Args:
        query: The user's question.
        model_type: "Hugging Face (Free)" for the local pipeline or
            "OpenAI (GPT)" for the OpenAI chat API.

    Returns:
        The answer wrapped to 100 columns, or a human-readable message when no
        content is loaded, the OpenAI key is missing, the OpenAI call fails,
        or ``model_type`` is not recognised. Always a string.
    """
    retrieved_context = retrieve_top_chunks(query)
    if "No document content loaded" in retrieved_context:
        return "I can't answer right now. Please ensure content is loaded and indexed correctly by clicking a 'Load' button."

    if model_type == "Hugging Face (Free)":
        # The retrieved context is usually far longer than the local model's
        # input window and the pipeline truncates from the end. Putting the
        # question first means truncation drops trailing context rather than
        # the question itself.
        prompt = (
            "Using the context below, answer the question accurately. If the answer is not in the context, "
            "state that you cannot answer based on the provided information.\n\n"
            f"Question: {query}\n\nContext:\n{retrieved_context}\n\nAnswer:"
        )
        result = hf_pipeline(prompt)
        return textwrap.fill(result[0]['generated_text'].strip(), width=100)

    if model_type == "OpenAI (GPT)":
        if not OPENAI_API_KEY or OPENAI_API_KEY == "YOUR_OPENAI_API_KEY_HERE":
            return "‚ö†Ô∏è OpenAI API key is not configured. Please set it in the code."
        try:
            client = OpenAI(api_key=OPENAI_API_KEY)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "system", "content": "You are a helpful assistant. Answer the question based *only* on the provided context."}, {"role": "user", "content": f"Context:\n{retrieved_context}\n\nQuestion: {query}"}],
                temperature=0.1
            )
            return textwrap.fill(response.choices[0].message.content.strip(), width=100)
        except Exception as e:
            return f"OpenAI Error: {e}"

    # Previously an unrecognised choice fell through and returned None, which
    # the chat UI would display as an empty/None reply.
    return f"Unknown model selection: {model_type!r}. Please choose one of the listed models."

# --- Function to re-index (crawl + process + save) data for a specific product ---
# Arguments now correctly match the inputs from gr.State and gr.File
def reindex_product_content(product_name: str, urls_to_crawl: list, uploaded_file_paths: list, progress=gr.Progress(track_tqdm=True)):
    """Crawl, chunk, index and cache the knowledge base for one product.

    Args:
        product_name: Display name of the product; also used as the prefix of
            the cache file names inside ``BASE_CACHE_DIR``.
        urls_to_crawl: URLs to fetch with ``crawl_single_url``. May be empty.
        uploaded_file_paths: Uploaded PDF/TXT files from the Gradio file
            input, or ``None``. Each item may be a plain path string or an
            object exposing the path as ``.name``.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        A newline-joined log of what was done.

    Side effects:
        Rebinds the module-level ``global_context``, ``chunk_texts`` and
        ``vector_index`` to the newly built knowledge base (or clears them when
        nothing usable was produced) and writes the index and chunk files.
    """
    global global_context, chunk_texts, vector_index

    # Define paths for this specific product's cache
    product_faiss_index = os.path.join(BASE_CACHE_DIR, f"{product_name}_faiss_index.bin")
    product_chunks_file = os.path.join(BASE_CACHE_DIR, f"{product_name}_chunks.json")

    all_raw_texts = []
    status = [f"--- Starting Re-Indexing for {product_name} ---"]

    if urls_to_crawl:
        status.append(f"Crawling {len(urls_to_crawl)} URLs for {product_name}...")
        MAX_CONCURRENT_CRAWLS = 5 # Adjust as needed for your system
        with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_CRAWLS) as executor:
            # Iterate the lazy map directly: wrapping it in list() first would
            # finish every crawl before the progress bar saw a single item.
            crawled_results = list(progress.tqdm(
                executor.map(crawl_single_url, urls_to_crawl),
                total=len(urls_to_crawl),
                desc=f"Crawling {product_name} URLs",
            ))

        successful_crawls = 0
        for url, text_content in crawled_results:
            # crawl_single_url reports failures as "[Error..." strings.
            if not text_content.startswith("[Error"):
                all_raw_texts.append(f"--- Document Source: {url} ---\n{text_content}\n")
                successful_crawls += 1
        status.append(f"Successfully extracted text from {successful_crawls} out of {len(crawled_results)} URLs for {product_name}.")
    else:
        status.append(f"No URLs provided for {product_name} re-crawling.")

    if uploaded_file_paths:
        status.append(f"Processing {len(uploaded_file_paths)} uploaded files...")
        for uploaded in uploaded_file_paths:
            # Depending on the Gradio version/configuration an upload arrives
            # either as a path string or as a file wrapper with a .name path.
            source_path = uploaded if isinstance(uploaded, str) else uploaded.name
            try:
                extracted_text = extract_text_from_file(source_path)
                all_raw_texts.append(f"--- Document Source: {source_path} ---\n{extracted_text}\n")
                status.append(f"Successfully extracted text from: {source_path}")
            except Exception as e:
                status.append(f"Error extracting text from {source_path}: {e}")

    combined_text = "\n\n".join(all_raw_texts)
    global_context["text"] = combined_text # Update global raw text for the currently active KB

    if not combined_text:
        status.append("No content to process. Index will not be built/saved.")
        # Clear current active KB if no content was loaded
        vector_index = None
        chunk_texts = []
        global_context["text"] = ""
        return "\n".join(status)

    processed_chunks = chunk_text(combined_text)

    if not processed_chunks:
        status.append("No valid chunks created from content. Index will not be built/saved.")
        # Clear current active KB if no valid chunks
        vector_index = None
        chunk_texts = []
        global_context["text"] = ""
        return "\n".join(status)

    # Build index and save it for this product
    index_status_message = build_faiss_index(processed_chunks)
    status.append(index_status_message)

    if vector_index is None:
        # The build failed (its message is already in the log). Do not try to
        # save a missing index and do not claim the knowledge base is ready.
        chunk_texts = []
        global_context["text"] = ""
        status.append(f"Knowledge base for {product_name} could not be built; nothing was saved.")
        return "\n".join(status)

    try:
        faiss.write_index(vector_index, product_faiss_index)
        with open(product_chunks_file, "w", encoding="utf-8") as f:
            json.dump(chunk_texts, f) # Save the globally updated chunk_texts
        status.append(f"‚úÖ FAISS index and chunks saved for {product_name} to '{product_faiss_index}' and '{product_chunks_file}'.")
    except Exception as e:
        # Saving is best-effort: the in-memory index stays usable.
        status.append(f"‚ùå Error saving FAISS index or chunks for {product_name}: {e}")

    status.append(f"--- Re-Indexing Process Complete for {product_name} ---")
    status.append(f"Knowledge base for {product_name} is now active and ready for questions.")
    return "\n".join(status)

# --- Function to load a specific pre-built index at startup or after re-indexing ---
def load_specific_index_from_disk(product_name):
    """Activate a product's cached knowledge base, if one exists on disk.

    Looks for ``<product_name>_faiss_index.bin`` and
    ``<product_name>_chunks.json`` inside ``BASE_CACHE_DIR``. When both exist
    they are loaded into the module-level ``vector_index`` / ``chunk_texts``,
    and ``global_context["text"]`` is set to the chunks joined by spaces.
    When they are missing or loading fails, all three are cleared.

    Returns:
        A newline-joined status log; this function never raises.
    """
    global vector_index, chunk_texts, global_context

    product_faiss_index = os.path.join(BASE_CACHE_DIR, f"{product_name}_faiss_index.bin")
    product_chunks_file = os.path.join(BASE_CACHE_DIR, f"{product_name}_chunks.json")
    log = [f"Attempting to load knowledge base for {product_name}..."]

    cache_present = os.path.exists(product_faiss_index) and os.path.exists(product_chunks_file)
    if not cache_present:
        log.append(f"‚ö†Ô∏è Pre-built index files for {product_name} not found.")
        log.append(f"Please click 'Load/Refresh {product_name} Content' to build its knowledge base.")
        vector_index, chunk_texts = None, []
        global_context["text"] = ""
        return "\n".join(log)

    try:
        vector_index = faiss.read_index(product_faiss_index)
        with open(product_chunks_file, "r", encoding="utf-8") as f:
            chunk_texts = json.load(f)

        # Only an approximation of the original text (chunks overlap); it is
        # what the chart command reads.
        global_context["text"] = " ".join(chunk_texts)

        log.append(f"‚úÖ Loaded FAISS index from '{product_faiss_index}' with {len(chunk_texts)} chunks.")
        log.append(f"System ready to answer questions about {product_name}.")
        log.append(f"Total characters in {product_name} index: {len(global_context['text']):,}")
    except Exception as e:
        log.append(f"‚ùå Error loading pre-built index for {product_name}: {e}. Index will be empty.")
        vector_index, chunk_texts = None, []
        global_context["text"] = ""

    return "\n".join(log)


# --- Initial status shown at application startup ---
# No knowledge base is loaded at import time; the user picks one with a
# "Load/Refresh" button. This message is the initial value of the status box
# and is put back by the "Clear Chat & Status" button.
initial_app_status = "Welcome! Please select a Zoho product to load its knowledge base by clicking a button."


# STEP 10: Chat handler
def chatbot_response(user_query, chat_history, model_choice):
    """Handle one chat turn for the Gradio chatbot.

    Args:
        user_query: Text typed by the user. The exact text ``chart:``
            (case-insensitive, surrounding whitespace ignored) renders the
            active knowledge-base text as a chart instead of asking the model.
        chat_history: Existing ``[user, bot]`` pairs, or ``None``/empty.
        model_choice: Model label forwarded to ``ask_question``.

    Returns:
        ``(chat_history, "")`` - the history with the new turn appended, and
        an empty string that clears the input box. A blank query adds no turn.
    """
    chat_history = chat_history or []

    normalized_query = (user_query or "").strip()
    if not normalized_query:
        # Nothing was asked: do not spend a model call or add an empty turn.
        return chat_history, ""

    if vector_index is None: # No knowledge base has been loaded yet
        bot_message_content = "‚ö†Ô∏è The knowledge base is not loaded. Please click a 'Load/Refresh' button for a Zoho product to build/load it."
    elif normalized_query.lower() == "chart:":
        if global_context["text"]:
            bot_message_content = generate_chart_from_csv_text(global_context["text"])
        else:
            bot_message_content = "‚ö†Ô∏è No combined text available for chart generation. Load content first."
    else:
        bot_message_content = ask_question(user_query, model_choice)

    chat_history.append([user_query, bot_message_content])
    return chat_history, "" # Return updated history and empty string to clear the input box

# STEP 11: Gradio UI with specific buttons
with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 1200px !important;}") as demo:
    gr.Markdown("# üìö Zoho Product Intelli-Chat üöÄ")
    gr.Markdown("Select a Zoho product to load its knowledge base and ask questions.")

    with gr.Row(variant="panel"):
        # Left column: knowledge-base controls, uploads, status log, model choice.
        with gr.Column(scale=1, min_width=380):
            gr.Markdown("### ‚öôÔ∏è Controls")

            gr.Markdown("#### Load/Refresh Knowledge Bases")
            gr.Markdown("<sub>Click a button to crawl and index content for a specific Zoho product. This will save the knowledge base for faster loading next time.</sub>")

            with gr.Row():
                load_books_button = gr.Button("üìö Load/Refresh Zoho Books", variant="primary")
                load_inventory_button = gr.Button("üì¶ Load/Refresh Zoho Inventory", variant="secondary")
            with gr.Row():
                load_payroll_button = gr.Button("üí∞ Load/Refresh Zoho Payroll", variant="secondary")
                load_creator_button = gr.Button("üõ†Ô∏è Load/Refresh Zoho Creator", variant="secondary")
            with gr.Row():
                load_analytics_button = gr.Button("üìä Load/Refresh Zoho Analytics", variant="secondary")
            with gr.Row():
                load_people_button = gr.Button("üßë‚Äçü§ù‚Äçüßë Load/Refresh Zoho People", variant="secondary")
                load_recruit_button = gr.Button("üìù Load/Refresh Zoho Recruit", variant="secondary")
            with gr.Row():
                load_crm_button = gr.Button("üíº Load/Refresh Zoho CRM", variant="secondary")

            with gr.Accordion("üìÇ Upload Additional Files (Optional)", open=False):
                file_input = gr.File(
                    file_types=[".pdf", ".txt"],
                    file_count="multiple",
                    label="Upload PDF/TXT Documents (for current selected KB)"
                )

            # Starts with the welcome message; later shows the load/re-index log.
            load_status = gr.Textbox(label="Status & Log", interactive=False, lines=8, value=initial_app_status)

            with gr.Accordion("üß† Choose Your AI Model", open=True):
                model_choice = gr.Dropdown(
                    ["Hugging Face (Free)", "OpenAI (GPT)"],
                    label="Select Generative Model",
                    value="Hugging Face (Free)"
                )

            clear_button = gr.Button("üßπ Clear Chat & Status", variant="secondary")

        # Right column: the conversation itself.
        with gr.Column(scale=2):
            gr.Markdown("### üí¨ Your Conversation")
            chatbot = gr.Chatbot(label="Chat History", height=600)
            with gr.Row():
                question_input = gr.Textbox(show_label=False, placeholder="Ask a question...", scale=8)
                submit_button = gr.Button("Ask", variant="primary", scale=1)

    # --- Event handlers for the product-specific buttons ---
    # Every button runs the same two-step chain: re-crawl and re-index the
    # product, then load that product's cached index from disk. Both steps
    # write to the status box, so the second message replaces the first.
    # The 'progress' argument of reindex_product_content is injected by Gradio
    # from the function signature and therefore is not listed in 'inputs'.
    _product_loaders = (
        (load_books_button, "Zoho Books", zoho_books_urls),
        (load_inventory_button, "Zoho Inventory", zoho_inventory_urls),
        (load_payroll_button, "Zoho Payroll", zoho_payroll_urls),
        (load_creator_button, "Zoho Creator", zoho_creator_urls),
        (load_analytics_button, "Zoho Analytics", zoho_analytics_urls),
        (load_people_button, "Zoho People", zoho_people_urls),
        (load_recruit_button, "Zoho Recruit", zoho_recruit_urls),
        (load_crm_button, "Zoho CRM", zoho_crm_urls),
    )
    for _load_button, _product, _urls in _product_loaders:
        _load_button.click(
            fn=reindex_product_content,
            inputs=[
                gr.State(_product),  # -> product_name
                gr.State(_urls),     # -> urls_to_crawl
                file_input,          # -> uploaded_file_paths
            ],
            outputs=[load_status]
        ).then(
            fn=load_specific_index_from_disk,
            inputs=[gr.State(_product)],
            outputs=[load_status]
        )

    # Pressing Enter in the textbox and clicking "Ask" behave identically.
    question_input.submit(
        fn=chatbot_response,
        inputs=[question_input, chatbot, model_choice],
        outputs=[chatbot, question_input],
    )

    submit_button.click(
        fn=chatbot_response,
        inputs=[question_input, chatbot, model_choice],
        outputs=[chatbot, question_input],
    )

    # Reset the conversation, the input box and the status log.
    clear_button.click(lambda: ([], "", initial_app_status), None, [chatbot, question_input, load_status])

# Start the app. Note that inbrowser=True is NOT passed here, so no browser tab
# is opened automatically; add it to the call if that is wanted locally.
demo.launch(debug=True, max_threads=10)