# Step 1: Environment Setup

## General setup

In [1]:
# Install everything the notebook and the generated Streamlit app import:
# the Gemini SDK, Streamlit, FAISS, pyngrok, document-parsing helpers, and
# LlamaIndex (>=0.10) with its Google GenAI LLM/embedding and FAISS integrations.
# -q keeps pip's own output short.
!pip install google-generativeai streamlit faiss-cpu python-docx tiktoken PyMuPDF pyngrok pandas docx2txt \
  "llama-index>=0.10" llama-index-llms-google-genai llama-index-embeddings-google-genai llama-index-vector-stores-faiss \
  -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os

import google.generativeai as genai
from google.colab import drive, userdata

# Make Google Drive visible to the Colab VM under /content/drive.
drive.mount('/content/drive')

# Project folder on Drive; edit the last path component to use a different project directory.
PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Internal_Chatbot'

# Switch into the project folder (optional, but it lets later cells use relative paths).
os.chdir(PROJECT_DIR)
print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Colab Notebooks/MR_Chatbot


In [3]:
# --- Retrieve API key: Gemini version ---
# Reset first: if the lookup below raises, a key left over from an earlier run of this
# cell must not be silently reused when exporting GOOGLE_API_KEY further down.
GEMINI_API_KEY = None
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    # Treat an empty secret the same as a missing one.
    if not GEMINI_API_KEY:
        raise ValueError("Gemini API key not found in Colab Secrets. Please ensure it's added correctly with the name GEMINI_API_KEY.")
    genai.configure(api_key=GEMINI_API_KEY)
    print("Gemini API Key configured successfully.")
except Exception as e:
    print(f"Error configuring Gemini API Key: {e}")
    print("Please ensure you have added your GEMINI_API_KEY to Colab Secrets (Key icon on the left).")

# The LlamaIndex Gemini integrations and the Streamlit app read the key from the
# GOOGLE_API_KEY environment variable, so export it whenever a key was retrieved.
if GEMINI_API_KEY:
    os.environ['GOOGLE_API_KEY'] = GEMINI_API_KEY
    print("✅ GOOGLE_API_KEY environment variable set.")
else:
    print("❌ GEMINI_API_KEY not found, so GOOGLE_API_KEY environment variable was not set.")


Gemini API Key configured successfully.
✅ GOOGLE_API_KEY environment variable set.


# Step 2: Load Knowledge Base (rerun when docs/data have changed)

In [None]:
# Load Documents
import os
import traceback

import faiss # For creating the FAISS index object
from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding as GeminiEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore

# Everything this cell needs is imported/defined here, so it runs on a fresh kernel
# right after the setup cells. It only relies on PROJECT_DIR (setup cell) and on the
# GOOGLE_API_KEY environment variable being set. The "Extra Section" near the end of
# the notebook defines the same names for interactive tinkering.
DATA_DIR = os.path.join(PROJECT_DIR, "data")
VECTOR_STORE_DIR = os.path.join(PROJECT_DIR, "vectorstore")

# Ensure the vector store directory exists
if not os.path.exists(VECTOR_STORE_DIR):
    os.makedirs(VECTOR_STORE_DIR)
    print(f"Created directory: {VECTOR_STORE_DIR}")

print(f"Data directory: {DATA_DIR}")
print(f"Vector store directory: {VECTOR_STORE_DIR}")

# Load Documents using SimpleDirectoryReader
print("\n--- Loading documents ---")
try:
    if not os.listdir(DATA_DIR): # Check if the data directory is empty
        print(f"⚠️ The data directory '{DATA_DIR}' is empty.")
        print("   Please upload your sample documents (PDF, DOCX, TXT) to this folder on Google Drive.")
        documents = [] # Initialize as empty list
    else:
        print(f"Reading files from: {DATA_DIR}")
        # SimpleDirectoryReader will try to read all supported files in the directory
        documents = SimpleDirectoryReader(DATA_DIR).load_data()
        if documents:
            print(f"✅ Successfully loaded {len(documents)} document(s).")
            for doc in documents:
                # Print the filename and a snippet of the text
                print(f"  - Loaded: {doc.metadata.get('file_name', 'Unknown filename')}, Snippet: '{doc.text[:100].strip()}...'")
        else:
            print("⚠️ No documents were loaded. Check file types and content in the data directory.")
except Exception as e:
    # Also covers a missing DATA_DIR (os.listdir raises) and reader failures.
    print(f"❌ Error loading documents: {e}")
    documents = [] # Ensure documents is defined even if loading fails

# Quick check
if not documents:
    print("\n🔴 No documents loaded. Further steps in Phase 2 depend on having documents.")
    print("   Please check your data directory and ensure files are uploaded correctly.")
else:
    print("\n✅ Document loading step complete.")

# Embed, Index, Persist Doc
print("--- Starting Index Construction & Persistence ---")

# `documents` is always bound by the loading step above, so an emptiness check suffices.
if not documents:
    print("🔴 No documents loaded (the 'documents' variable is empty or not defined).")
    print("   Cannot build index. Please re-run the document loading cell (Cell 2.1) successfully first.")
else:
    try:
        # 0. Select the embedding model explicitly.
        #    The index must be built with the same model app.py configures for querying
        #    (models/text-embedding-004); otherwise the stored vectors and the query
        #    vectors would not be comparable.
        Settings.embed_model = GeminiEmbedding(model_name="models/text-embedding-004")

        # 1. Determine Embedding Dimension
        #    We need this for initializing the FAISS index.
        #    This makes a quick API call to get a sample embedding's length.
        print("Determining embedding dimension...")
        sample_embedding = Settings.embed_model.get_text_embedding("test")
        d = len(sample_embedding)
        print(f"✅ Detected embedding dimension: {d}")

        # 2. Initialize FAISS Index
        #    IndexFlatL2 is a common choice for exact, brute-force similarity search.
        faiss_index = faiss.IndexFlatL2(d)
        print(f"FAISS index initialized with dimension {d}.")

        # 3. Create LlamaIndex FaissVectorStore
        #    This wraps our FAISS index for use with LlamaIndex.
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        print("FaissVectorStore created.")

        # 4. Create StorageContext
        #    This tells LlamaIndex to use our FaissVectorStore for storing embeddings.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        print("StorageContext created using our FaissVectorStore.")

        # 5. Build the VectorStoreIndex
        #    This is the main step:
        #    - Documents are chunked into nodes.
        #    - Each node's text is embedded using Settings.embed_model (Gemini).
        #    - Embeddings and nodes are stored via the StorageContext (in our FAISS store).
        #    This can take time depending on document size/count and API latency for embeddings.
        print("⏳ Building VectorStoreIndex... This might take a few minutes...")
        index = VectorStoreIndex.from_documents(
            documents, # The documents loaded earlier in this cell
            storage_context=storage_context,
            show_progress=True
        )
        print("✅ VectorStoreIndex built successfully.")

        # 6. Persist the index to disk
        #    This saves the LlamaIndex structures (docstore, index_store) and tells
        #    the FaissVectorStore to persist its own data (the FAISS index itself).
        print(f"💾 Persisting index to: {VECTOR_STORE_DIR} ...")
        index.storage_context.persist(persist_dir=VECTOR_STORE_DIR)
        print(f"✅ Index persisted successfully to {VECTOR_STORE_DIR}")

    except Exception as e_build:
        # Report and continue so the notebook keeps running; the traceback shows where it failed.
        print(f"❌ An error occurred during index construction or persistence: {e_build}")
        traceback.print_exc()

print("\n--- Phase 2: Indexing (Build & Store) Steps Attempted ---")

Data directory: /content/drive/MyDrive/Colab Notebooks/MR_Chatbot/data
Vector store directory: /content/drive/MyDrive/Colab Notebooks/MR_Chatbot/vectorstore

--- Loading documents ---
Reading files from: /content/drive/MyDrive/Colab Notebooks/MR_Chatbot/data
✅ Successfully loaded 5 document(s).
  - Loaded: ID1_PaediatricSurgeon_ShanghaiChildrensMedicalCentre_ENG_Transcript.docx, Snippet: 'Name: CJ Xie 谢晨捷

Profession: Paediatric Surgeon

Interview number: ID1

Date: 16/10/2024

City: Sha...'
  - Loaded: ID2_Haematology_ShanghaiChildrensMedicalCentre_ENG_Transcript.docx, Snippet: 'Name: CC Chen

Profession: Paediatric Haematology Oncologists 

Interview number: ID2 

Date: 18/10/...'
  - Loaded: ID3_Nurse_ShengjingHospital_ENG_Transcript.docx, Snippet: 'Name: HF Qu

Profession: Paediatric Haematology Nurse 

Interview number: ID3 

Date: 22/10/2024 

C...'
  - Loaded: ID4_Haematology_ShenjingHospital_ENG_Transcript.docx, Snippet: 'Name: LC Hao

Profession: Paediatric Haematology Oncolog

# Step 3: Set up app.py – the app.py on GitHub should contain the following code (rerun this cell whenever the code below changes, to rewrite the app.py file)

In [None]:
# The following code when ran will be written into app.py in the project folder

%%writefile '/content/drive/MyDrive/Colab Notebooks/Internal_Chatbot/app.py'
# Paste the entire Streamlit app code block below this line

import streamlit as st
import os
import faiss # For FaissVectorStore loading
from llama_index.core import Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.llms.google_genai import GoogleGenAI as GeminiLLM
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding as GeminiEmbedding
import google.generativeai as genai # For API key configuration

# --- Configuration ---
PROJECT_BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/Internal_Chatbot'
VECTOR_STORE_DIR = os.path.join(PROJECT_BASE_PATH, "vectorstore")

# --- LlamaIndex Setup (Cached by Streamlit) ---
@st.cache_resource(show_spinner="Initializing AI Advisor and loading knowledge base...")
def load_and_setup_ai_advisor():
    """Prepare the LlamaIndex objects used by the chat UI.

    In order, this:
      1. reads GOOGLE_API_KEY from the environment and passes it to google.generativeai,
      2. sets the global LlamaIndex Settings to the Gemini LLM and embedding model,
      3. loads the FAISS-backed index persisted under VECTOR_STORE_DIR and wraps it
         in a "condense_question" chat engine.

    Every failure is shown in the Streamlit page and followed by st.stop().

    Returns:
        tuple: (index, chat_engine) on success. The (None, None) returns placed
        after each st.stop() only matter if st.stop() were to return.
    """
    # --- 1. API key ---------------------------------------------------------
    # Streamlit runs as its own process, so the key has to come from the
    # environment that process was started in (the Colab session, when launched
    # via `!streamlit run`).
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if not google_api_key:
        st.error("🔴 GOOGLE_API_KEY environment variable not found! Please ensure it's set in your Colab session before running Streamlit.")
        st.stop()
        return None, None

    try:
        genai.configure(api_key=google_api_key)
    except Exception as exc:
        st.error(f"🔴 Error configuring Google GenAI with API key: {exc}")
        st.stop()
        return None, None

    # --- 2. Global LlamaIndex settings (LLM + embedding model) ---------------
    try:
        Settings.llm = GeminiLLM(model="models/gemini-1.5-flash-latest")
        Settings.embed_model = GeminiEmbedding(model_name="models/text-embedding-004")
        st.sidebar.success(f"LLM: {Settings.llm.model}\nEmbed: {Settings.embed_model.model_name}")
    except Exception as exc:
        st.error(f"🔴 Error configuring LlamaIndex Settings (LLM/Embeddings): {exc}")
        st.stop()
        return None, None

    # --- 3. Persisted index + chat engine -----------------------------------
    try:
        faiss_store = FaissVectorStore.from_persist_dir(persist_dir=VECTOR_STORE_DIR)
        persisted_context = StorageContext.from_defaults(
            vector_store=faiss_store,
            persist_dir=VECTOR_STORE_DIR,
        )
        advisor_index = load_index_from_storage(storage_context=persisted_context)
        # similarity_top_k is tunable: a higher value supplies more context but is
        # potentially slower/costlier.
        advisor_chat_engine = advisor_index.as_chat_engine(
            chat_mode="condense_question",
            verbose=True,
            similarity_top_k=10,
        )
        st.success("💡 AI Advisor initialized and knowledge base loaded!")
    except Exception as exc:
        st.error(f"🔴 Error loading persisted index: {exc}")
        st.exception(exc) # Show full traceback in Streamlit app for debugging
        st.stop()
        return None, None

    # Reached only when step 3 completed without raising.
    return advisor_index, advisor_chat_engine


# --- Streamlit App UI ---
st.set_page_config(page_title="Market AI Advisor", page_icon="💡", layout="wide")
st.title("💡 Market AI Advisor")
st.caption(f"Powered by LlamaIndex and Google Gemini. Knowledge base last updated based on files in {VECTOR_STORE_DIR}")

# Load index and chat engine (cached)
# This function will only run once unless its code changes or cache is cleared.
loaded_index, chat_engine = load_and_setup_ai_advisor()

if loaded_index and chat_engine:
    # Objects returned by st.cache_resource are shared by every browser session, and a
    # chat engine carries its own conversation memory. Create one engine per session
    # (same settings as the cached one) so one user's history is never mixed into
    # another user's follow-up questions. The cached engine above is then only used
    # as a readiness check.
    if "chat_engine" not in st.session_state:
        st.session_state.chat_engine = loaded_index.as_chat_engine(
            chat_mode="condense_question",
            verbose=True,
            similarity_top_k=10,
        )
    session_chat_engine = st.session_state.chat_engine

    # Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I help you with your market research insights today?"}]

    # Display prior chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Get new user input
    if prompt := st.chat_input("Ask your question..."):
        # Add user message to session state and display it
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Get assistant response
        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            message_placeholder.markdown("Thinking...")
            try:
                response_obj = session_chat_engine.chat(prompt)
                response_text = str(response_obj)

                # Append the retrieved source chunks (file name, score, short snippet)
                if response_obj.source_nodes:
                    source_nodes_md = "\n\n---\n**Retrieved Sources:**\n"
                    for position, node in enumerate(response_obj.source_nodes, start=1):
                        file_name = node.metadata.get('file_name', 'N/A') if node.metadata else 'N/A'
                        # A node may come back without a score; formatting None with :.2f would raise.
                        score_text = f"{node.score:.2f}" if node.score is not None else "N/A"
                        source_nodes_md += f"{position}. **File:** {file_name} (Score: {score_text})\n"
                        # chr(10) is "\n": flatten the snippet onto one line for the list item
                        source_nodes_md += f"   *Snippet:* {node.text[:150].strip().replace(chr(10), ' ')}...\n"
                    response_text += source_nodes_md
                else:
                    response_text += "\n\n(No specific source text segments retrieved for this query)"

                message_placeholder.markdown(response_text)
                st.session_state.messages.append({"role": "assistant", "content": response_text})

            except Exception as e:
                error_message = f"Sorry, an error occurred while processing your question: {e}"
                # Replace the "Thinking..." placeholder instead of leaving it on screen.
                message_placeholder.error(error_message)
                st.session_state.messages.append({"role": "assistant", "content": error_message})
else:
    st.info("AI Advisor is not ready. Please check for error messages above.")

Overwriting /content/drive/MyDrive/Colab Notebooks/MR_Chatbot/app.py


# Step 4: Launch Streamlit Application

## Web-based Chatbot

In [None]:
from pyngrok import ngrok, conf
import os
import time
from datetime import datetime
import pytz

In [None]:
# Read the ngrok auth token from Colab Secrets (stored under the name NGROK_AUTHTOKEN)
# and register it with pyngrok before any tunnel is opened.
NGROK_AUTHTOKEN = userdata.get('NGROK_AUTHTOKEN')
ngrok.set_auth_token(NGROK_AUTHTOKEN)



In [None]:
# Requirements
  # Ensure GOOGLE_API_KEY setup, i.e. it is in the environment for the Streamlit process
  # Set up ngrok configuration (optional, but can be useful)

ngrok.kill() # Terminate any existing ngrok tunnels
app_file_path = '/content/drive/MyDrive/Colab Notebooks/Internal_Chatbot/app.py'
public_url = ngrok.connect(8501) # Start ngrok tunnel to Streamlit's default port 8501

# Chatbot log will be saved in the following folder with timestamp (swiss time)
timestamp = datetime.now(pytz.timezone('Europe/Zurich') ).strftime("%Y%m%d_%H%M%S")
LOG_DIR = '/content/drive/MyDrive/Colab Notebooks/Internal_Chatbot/chatbot logs'
log_file_path = os.path.join(LOG_DIR, f"{timestamp}_streamlit.log")

# Run Streamlit
  # The `nohup` and `&` run it in the background.
  # We also pipe output to a log file for easier debugging if Streamlit has issues starting.
print(f"Starting Internal_Chatbot")
!nohup streamlit run "{app_file_path}" --server.port 8501 &> "{log_file_path}" &
print("Loading... Please wait...")
time.sleep(5)
print("\nStreamlit app process has been started (or attempted).")
print(f"Access your app at: {public_url}")


Starting MR_Chatbot
Loading... Please wait...

Streamlit app process has been started (or attempted).
Access your app at: NgrokTunnel: "https://1174-107-167-182-207.ngrok-free.app" -> "http://localhost:8501"


# Step 5: Stop and Disconnect the Chatbot

In [None]:
print("Attempting to disconnect all ngrok tunnels and kill ngrok process...")
try:
    tunnels = ngrok.get_tunnels()
    for tunnel in tunnels:
        ngrok.disconnect(tunnel.public_url)
        print(f"Disconnected tunnel: {tunnel.public_url}")
    ngrok.kill()
    print("ngrok process terminated.")
except Exception as e:
    print(f"Error stopping ngrok (it might not have been running): {e}")


!pkill -f streamlit
print("Attempted to stop any existing Streamlit process(es).")



Attempting to disconnect all ngrok tunnels and kill ngrok process...
Disconnected tunnel: https://f6c4-107-167-182-207.ngrok-free.app
ngrok process terminated.
Attempted to stop any existing Streamlit process(es).


# Extra Section: Configure LlamaIndex (For tinkering purposes in notebook, not needed for web-based chatbot)

## General setup

In [None]:
from llama_index.llms.google_genai import GoogleGenAI as GeminiLLM # Using an alias for clarity
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding as GeminiEmbedding # Using an alias
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import Settings, VectorStoreIndex, StorageContext, load_index_from_storage, SimpleDirectoryReader

# Source documents and the persisted index both live in subfolders of the project directory.
DATA_DIR, VECTOR_STORE_DIR = (
    os.path.join(PROJECT_DIR, subfolder) for subfolder in ("data", "vectorstore")
)

# 1. Point LlamaIndex's global Settings at Gemini.
#    The Gemini API key must already have been configured by the setup cell;
#    re-run that cell first if this one fails.
print("\n--- Configuring LlamaIndex with Gemini ---")
try:
    # LLM: used for response generation, summarization, etc.
    Settings.llm = GeminiLLM(model="models/gemini-1.5-flash-latest")
    print("LLM configured with Gemini.")

    # Embedding model ("models/embedding-001" is an alternative).
    Settings.embed_model = GeminiEmbedding(model_name="models/text-embedding-004")
    print("Embedding model configured with GeminiEmbedding.")
    print(f"Using embedding model: {Settings.embed_model.model_name}")

except Exception as config_error:
    print(f"❌ Error configuring LlamaIndex with Gemini: {config_error}")
    print("   Ensure your GEMINI_API_KEY and GOOGLE_API_KEY env var was set in your setup cell..")
    # Re-raise so execution stops here when configuration fails.
    raise


--- Configuring LlamaIndex with Gemini ---
LLM configured with Gemini.
Embedding model configured with GeminiEmbedding.
Using embedding model: models/text-embedding-004


## Load Persisted Index  

In [None]:
 # --- Optional: Test Loading the Persisted Index ---
print("\n---  Attempting to load the persisted index  ---")
# Note: Settings.llm and Settings.embed_model should still be configured globally (see the
# configuration cell above) for the query engine to work correctly later.

# Bind the name up front so later cells that test `if loaded_index:` see None,
# rather than an undefined name, when loading fails.
loaded_index = None
try:
    print(f"Loading index from: {VECTOR_STORE_DIR}")
    # Restore the FAISS store first, then the docstore/index store from the same folder.
    vector_store = FaissVectorStore.from_persist_dir(persist_dir=VECTOR_STORE_DIR)
    loaded_storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=VECTOR_STORE_DIR)
    loaded_index = load_index_from_storage(storage_context = loaded_storage_context)
    print("✅ Index loaded successfully from disk for testing.")

except Exception as e_load:
    print(f"❌ Error loading persisted index for testing: {e_load}")


---  Attempting to load the persisted index  ---
Loading index from: /content/drive/MyDrive/Colab Notebooks/MR_Chatbot/vectorstore
✅ Index loaded successfully from disk for testing.


## Chat engine test of the loaded persisted index

In [None]:
# Build the chat engine on the index loaded from disk. The in-memory `index` from the
# build step only exists if Step 2 was run in this session, and this section is meant
# to exercise the persisted copy.
chat_engine = loaded_index.as_chat_engine(
    chat_mode="condense_question", streaming=True
)
response_stream = chat_engine.stream_chat("Who are the moderator and interviewees")

In [None]:
# Print the streamed answer to the first question.
response_stream.print_response_stream()

The moderator is Steven.  The interviewees are ID4_Nurse and ID4_HaemOnco.


In [None]:
# Follow-up question on the same chat engine; it only makes sense together with the previous turn.
response_stream = chat_engine.stream_chat("Who else?")

In [None]:
# Print the streamed answer to the follow-up question.
response_stream.print_response_stream()

This question cannot be answered from the given source.


In [None]:
# Reset the chat engine so the next test does not build on the turns above.
chat_engine.reset()

## Query engine test of the loaded persisted index

In [None]:
from IPython.display import Markdown, display

print("\n--- Attempting to query loaded persisted index  ---")
try:
    # Run a quick test query only if the persisted index was actually loaded.
    # (If the load cell was never run, the resulting NameError is reported below.)
    if loaded_index:
        print("   Performing a quick test query on the loaded index...")
        # similarity_top_k=1: answer from the single best-matching chunk only
        query_engine = loaded_index.as_query_engine(similarity_top_k=1)
        response = query_engine.query("Who are the moderators and interviewees?")
        display(Markdown(f"<b>{response}</b>"))
    else:
        print("   Skipping test query as no persisted index was loaded.")
except Exception as e_query:
    # This cell queries the index; loading problems are reported by the load cell.
    print(f"❌ Error querying persisted index for testing: {e_query}")


--- Attempting to query loaded persisted index  ---
   Performing a quick test query on the loaded index...


<b>The interviewer is Steven.  The interviewee is a pediatric surgeon from Shanghai Children's Medical Centre.
</b>