# Expert Knowledge Worker

### A question answering agent that is an expert knowledge worker
### To be used by employees of Insurellm, an Insurance Tech company
### The agent needs to be accurate and the solution should be low cost.

This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.

This first implementation will use a simple, brute-force type of RAG.

## 1. Importing Libraries

In [1]:
#!pip install --upgrade langchain
#!pip install -U langchain-community
#!pip install --upgrade transformers
#!pip install -U langchain-huggingface
#!pip install cryptography
#!pip install colorlog
#!pip install hf_xet

In [2]:
# imports

import os
import subprocess
import requests
import time
import queue
import html
import psutil
import GPUtil
import winreg
import gradio as gr
import matplotlib.pyplot as plt
import matplotlib as mpl
from llm_chains import CustomLangChain, VectorEmbedding, Talker, Whisper, OllamaService, setup_logger
from dotenv import load_dotenv
from huggingface_hub import login
from openai import OpenAI
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
from ansi2html import Ansi2HTMLConverter
from collections import deque


# Pull settings from a local .env file (override=True is passed to load_dotenv)
load_dotenv(override=True)

# Make sure both keys exist in the environment; a placeholder is stored when one is missing
for key_name in ('OPENAI_API_KEY', 'HF_TOKEN'):
    os.environ.setdefault(key_name, 'your-key-if-not-using-env')
#login(os.environ['HF_TOKEN'], add_to_git_credential=True)

# Mirror the Hugging Face token under the second variable name set by this notebook
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.environ['HF_TOKEN']

openai_api_key = os.environ['OPENAI_API_KEY']
MODEL_GPT = "gpt-4o-mini"

In [3]:
# Set device
device = "cuda" # if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

if device == "cuda":
    !nvidia-smi

import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

Device: cuda
Fri Jul 25 22:36:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.88                 Driver Version: 576.88         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   56C    P8             11W /  200W |     907MiB /  12282MiB |      6%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                   

## 2. Creating/Loading Vectorstore

In [4]:
# Folder holding the documents that make up the knowledge database
FOLDER_PATH = r"D:\Technical_Documents\Papers\technical_documents"

# Name of the vectorstore directory and of the embedding model used with it
DB_NAME = "vector_db_1024"
EMBEDDING_MODEL = "huggingface_1024"

# Embedding helper shared by the load path and the build path
vector_embedding = VectorEmbedding(device=device, log_level='info')

if not os.path.isdir(DB_NAME):
    # No vectorstore directory yet: split the documents into chunks,
    # then generate the vectorstore (the database with the vectorized documents)
    vector_embedding.split_into_chunks(
        document_path=FOLDER_PATH,
        chunk_size=1000,
        chunk_overlap=300,
        clean_text_flag=True,
    )
    vectorstore = vector_embedding.create_vectorstore(
        vector_db_name=DB_NAME,
        embedding_model=EMBEDDING_MODEL,
    )
else:
    # The vectorstore has already been created: just load it
    vectorstore = vector_embedding.load_vectorstore(
        vector_db_name=DB_NAME,
        embedding_model=EMBEDDING_MODEL,
    )

<span style="color:forestgreen; font-weight:bold;">[INFO]</span> Set cuda device.
<span style="color:forestgreen; font-weight:bold;">[INFO]</span> Using Hugging Face model: BAAI/bge-large-en.
<span style="color:forestgreen; font-weight:bold;">[INFO]</span> Loading vectorstore...
<span style="color:forestgreen; font-weight:bold;">[INFO]</span> Vectorstore loaded with 100630 documents and 100,630 vectors with 1,024 dimensions.


* Using Hugging Face model: BAAI/bge-large-en
* Processed documents: >26,000
* Total number of chunks: 100,630 of 1,024 dimensions

## 3. Visualizing the Database

In [5]:
# This method projects the vectorstore into 2D and clusters the samples using K-means.
#vector_embedding.visualize_2d_cluster(
#        n_clusters=20,
#        pt_size=5,        
#        cmap='jet',
#        seed=42)

In [6]:
# This method also projects the vectorstore into 2D and classifies the samples according to the folder structure of the knowledge database
#vector_embedding.visualize_2d(
#    figsize=(11, 7),
#    cmap='hsv',
#    background='white',
#    pt_size=5,
#    marker='o',
#    legend_fontsize='small',
#    legend_loc='outside right center',
#    seed=42)

## 4. Creating Agents

In [7]:

# Instantiate the OpenAI client and the shared state used by the callbacks below
openai_client = OpenAI()
talker_agent = None   # built on first use by get_talker_agent()
whisper_agent = None  # built on first use by get_whisper_agent()

# Queue handed to the logger; stream_logs() drains it to fill the log panel
log_queue = queue.Queue()
logger = setup_logger(log_queue=log_queue, level="info")
conv = Ansi2HTMLConverter()

# Rolling histories plotted by get_stats(). They are pre-filled with zeros and
# bounded by maxlen, so every append discards the oldest sample.
window_size = 30  # seconds (or data points)
cpu_history = deque([0]*window_size, maxlen=window_size)
gpu_history = deque([0]*window_size, maxlen=window_size)
time_history = deque(range(-window_size + 1, 1), maxlen=window_size)

def get_talker_agent():
    """Return the module-wide Talker, creating it from ``openai_client`` on first call."""
    global talker_agent
    if talker_agent is not None:
        return talker_agent
    talker_agent = Talker(openai_client)
    return talker_agent

def get_whisper_agent():
    """Return the module-wide Whisper, creating it from ``openai_client`` on first call."""
    global whisper_agent
    if whisper_agent is not None:
        return whisper_agent
    whisper_agent = Whisper(openai_client)
    return whisper_agent

def handle_audio(audio_path):
    """Transcribe a microphone recording so it can be placed in the message box.

    Args:
        audio_path: File path of the recording, or a falsy value (``None``) when
            the microphone component has been cleared.

    Returns:
        The result of ``Whisper.transcribe`` for a real recording. When there is no
        recording, a no-op ``gr.update()`` so the text already typed is left untouched.
    """
    # The microphone's change event also fires when reset_mic_input() empties the
    # component, so there may be nothing to transcribe.
    if not audio_path:
        return gr.update()
    return get_whisper_agent().transcribe(audio_path)

def handle_speech(text, voice):
    """Read ``text`` aloud through the shared Talker using the requested ``voice``."""
    speaker = get_talker_agent()
    # The Talker instance is reused between calls, so the voice is set each time
    speaker.voice = voice
    speaker.speak(text)

def initialize_chain(model, tone, mode, lang_chain_state):
    """Return a chain state matching the requested settings, rebuilding only when needed.

    Args:
        model: Name of the language model selected in the UI.
        tone: Tone selected in the UI.
        mode: Response mode selected in the UI.
        lang_chain_state: Previous state dict with keys ``chain``, ``model``,
            ``tone`` and ``mode``, or ``None`` when no chain exists yet.

    Returns:
        ``lang_chain_state`` itself when its settings equal the requested ones;
        otherwise a new dict holding a freshly built ``CustomLangChain``.
    """
    settings_unchanged = (
        lang_chain_state is not None
        and lang_chain_state["model"] == model
        and lang_chain_state["tone"] == tone
        and lang_chain_state["mode"] == mode
    )
    if settings_unchanged:
        return lang_chain_state

    # Any changed setting (or no previous state) means a new LangChain is built
    new_chain = CustomLangChain(
        openai_client=openai_client,
        model_name=model,
        tone=tone,
        mode=mode,
        top_docs=15,
        vectorstore=vectorstore,
        log_level='info',
        logger=logger,
    )
    return dict(chain=new_chain, model=model, tone=tone, mode=mode)

# User's question
def question_fn(question, history):
    """Record the user's message in the conversation history.

    Args:
        question: Text submitted by the user.
        history: Existing list of message dicts; a falsy value starts a new list.

    Returns:
        A 3-tuple ``(history, history, "")``: the same updated list twice, followed
        by an empty string.
    """
    if not history:
        history = []

    history.append(dict(role="user", content=question))
    return history, history, ""

# This function will be triggered by the chat interface
def answer_fn(history, model, tone, mode, lang_chain_state):
    """Answer the last message in ``history`` and append the reply to it.

    Args:
        history: List of message dicts whose last entry holds the question under "content".
        model: Language model selected in the UI.
        tone: Tone selected in the UI.
        mode: Response mode selected in the UI.
        lang_chain_state: Chain state as returned by ``initialize_chain`` (or ``None``).

    Returns:
        A 3-tuple ``(history, answer, lang_chain_state)``. When ``history`` is empty,
        is not a list, or its last entry has no "content", the history is returned
        unchanged (``[]`` if falsy) together with an apology message.
    """
    has_question = history and isinstance(history, list) and "content" in history[-1]
    if not has_question:
        return history or [], "Sorry, I couldn't find your question.", lang_chain_state

    user_question = history[-1]["content"]
    lang_chain_state = initialize_chain(model, tone, mode, lang_chain_state)
    reply = lang_chain_state["chain"].answer_question(user_question, history)

    # Store the assistant's turn so the next question sees the whole conversation
    history.append(dict(role="assistant", content=reply))
    return history, reply, lang_chain_state

# Other functions

def clear_chat():
    """Return the values of an empty conversation: two empty lists, "" and ``None``."""
    empty_chat, empty_history = [], []
    # The trailing None also resets the LangChain object
    return empty_chat, empty_history, "", None

def reset_mic_input():
    """Build an empty, interactive microphone component to replace the current one."""
    return gr.Audio(
        value=None,
        label="🎙️ Speak",
        type="filepath",
        interactive=True,
        sources=['microphone'],
    )

def toggle_audio(audio_flag):
    """Flip the audio flag and produce the matching UI updates.

    Args:
        audio_flag: Current value of the audio-enabled flag.

    Returns:
        A 3-tuple: an update setting the microphone's ``interactive`` property to
        the new flag, the new button caption, and the new flag itself.
    """
    enabled = not audio_flag
    if enabled:
        caption = "🔇 Disable Audio"
    else:
        caption = "🔈 Enable Audio"
    return gr.update(interactive=enabled), caption, enabled

def detect_llms(include_openai=True):
    """List the language models that can be offered in the model dropdown.

    Local models are read from the output of ``ollama list``; the OpenAI models
    are a fixed list appended on request.

    Args:
        include_openai: When True, append the hard-coded OpenAI models.

    Returns:
        list[tuple[str, str]]: ``(label, value)`` pairs, local Ollama models first.
        If the ``ollama`` command cannot be run or fails, no local models are listed.
    """
    import re

    models = []

    # Detect local Ollama models
    ollama_service = OllamaService(logger=logger)
    ollama_service.activate()

    try:
        listing = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            check=True,
        ).stdout.splitlines()
    except (OSError, subprocess.CalledProcessError) as exc:
        # Missing executable or non-zero exit: don't parse error text as model names
        # NOTE(review): assumes `logger` exposes the standard logging API - confirm
        logger.warning(f"Could not list Ollama models: {exc}")
        listing = []

    for row in listing[1:]:  # Skip header
        # Columns are separated by runs of two or more spaces; the first is the name
        name = re.split(r'\s{2,}', row.strip())[0]
        if name:
            models.append((name, name))

    # Add OpenAI models manually
    if include_openai:
        models.extend([
            ("gpt-4o", "gpt-4o"),
            ("gpt-4o-mini", "gpt-4o-mini"),
        ])

    return models

def stream_logs():
    """Endlessly yield the accumulated log as an HTML ``<pre>`` block, once per second.

    On every pass all entries currently waiting in ``log_queue`` are appended to
    the running text, each followed by ``<br>``.
    """
    collected = ""
    while True:
        # Drain whatever is queued right now without blocking
        while True:
            try:
                entry = log_queue.get_nowait()
            except queue.Empty:
                break
            collected += entry + "<br>"
        yield f"<pre style='font-family: monospace;'>{collected}</pre>"
        time.sleep(1)

def is_dark_mode_enabled():
    """Report whether the current Windows user has the dark app theme selected.

    Returns:
        bool: True when the ``AppsUseLightTheme`` registry value equals 0; False when
        it has another value, when the key or value cannot be read, or when the
        ``winreg`` module is unavailable.
    """
    try:
        import winreg  # only shipped with Windows builds of Python
    except ImportError:
        return False

    try:
        # The with-block closes the registry handle even if the query raises
        with winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r"Software\Microsoft\Windows\CurrentVersion\Themes\Personalize",
        ) as key:
            value, _ = winreg.QueryValueEx(key, "AppsUseLightTheme")
    except OSError:
        return False  # default fallback
    return value == 0  # 0 = dark mode
    
def get_stats():
    """Sample CPU and GPU utilisation and draw the rolling history.

    The new sample is appended to the module-level ``cpu_history``,
    ``gpu_history`` and ``time_history`` deques, which are then plotted against
    each other on a 0-100 % axis, coloured to match the detected OS theme.

    Returns:
        matplotlib.figure.Figure: The rendered plot. It has already been released
        from pyplot's figure registry, so repeated calls do not accumulate figures.
    """
    fontsize = 14
    cpu = psutil.cpu_percent()
    gpus = GPUtil.getGPUs()
    gpu = gpus[0].load * 100 if gpus else 0  # only the first GPU is reported

    # FIFO update: the deques are bounded by maxlen, so the oldest sample drops out
    cpu_history.append(cpu)
    gpu_history.append(gpu)
    time_history.append(time_history[-1] + 1)

    # Detect current theme (Gradio inherits OS theme)
    is_dark = is_dark_mode_enabled()
    bg_color = "#0b0f19" if is_dark else "#FFFFFF"
    fg_color = "#f0f0f0" if is_dark else "#000000"

    # Plot
    fig, ax = plt.subplots(figsize=(8,4))
    fig.patch.set_facecolor(bg_color)
    ax.grid()
    ax.set_facecolor(bg_color)
    ax.tick_params(colors=fg_color)
    ax.yaxis.label.set_color(fg_color)
    ax.xaxis.label.set_color(fg_color)
    for spine in ax.spines.values():
        spine.set_edgecolor(fg_color)
    ax.plot(time_history, cpu_history, label="CPU Util.", color="deepskyblue", linewidth=2)
    ax.plot(time_history, gpu_history, label="GPU Util.", color="magenta", linewidth=2)
    ax.set_ylim(0, 100)
    ax.set_xticks([])
    ax.set_ylabel("Percentage [%]", fontsize=fontsize)
    ax.set_xlabel("Time", fontsize=fontsize)
    legend = ax.legend(
        loc='upper center',
        bbox_to_anchor=(0.5, 1.15),
        ncol=2,
        fontsize=fontsize,
        frameon=False
        )
    frame = legend.get_frame()
    frame.set_facecolor(bg_color)
    frame.set_edgecolor(fg_color)
    for text in legend.get_texts():
        text.set_color(fg_color)

    # This runs once per second: unregister the figure from pyplot so figures do
    # not pile up in memory. The Figure object itself stays usable by the caller.
    plt.close(fig)

    return fig

def live_plot():
    """Endlessly yield a fresh utilisation figure from ``get_stats``, pausing 1 s between frames."""
    refresh_seconds = 1
    while True:
        figure = get_stats()
        yield figure
        time.sleep(refresh_seconds)

## 5. Creating the Expert Assistant

In [8]:
import gradio as gr

# Models offered in the model dropdown, as returned by detect_llms()
MODEL_OPTIONS = detect_llms()

# Tone choices as (dropdown label, value) pairs; the tone dropdown defaults to "formal"
TONE_OPTIONS = [
    ("😀 Formal", "formal"),
    ("😎 Casual", "casual"),
    ("😊 Friendly", "friendly"),
    ("🥰 Sweet", "sweet & lovely"),
    ("😏 Sarcastic", "sarcastic"),
    ("😜 Snarky", "snarky"),
    ("😤 Impatient", "impatient"),
    ("😒 Condescending", "condescending"),
    ("😡 Disrespectful", "disrespectful"),
]

# Voice choices as (dropdown label, value) pairs; the value is what handle_speech() receives
VOICE_OPTIONS = [
    ("👨‍🦰 Onyx (Male - Deep, calm)", "onyx"),
    ("👨‍🦱 Echo (Male - Crisp, energetic)", "echo"),
    ("👨‍🦲 Alloy (Male - Friendly, upbeat)", "alloy"),
    ("👩‍🦰 Fable (Female - Warm, storytelling)", "fable"),
    ("👩‍🦳 Nova (Female - Clear, bright)", "nova"),
    ("👩‍🦱 Shimmer (Female - Smooth, soft)", "shimmer"),
]

# Response-mode choices as (dropdown label, value) pairs; the value is passed on as `mode`
TEMPERATURE_OPTIONS = [
    ("🧠 Precise, deterministic answer", "conservative"),
    ("🎨 Creative, imaginative answer", "creative")
]


# Build the Gradio interface: chat window, message box, option dropdowns,
# microphone input, and the live log / utilisation panels
with gr.Blocks(
    theme=gr.themes.Origin(),
    css="""
    #log_output {
    height: 240px;
    overflow-y: auto;
    border: 1px solid black;
    font-family: monospace;
    white-space: pre-wrap;
    }

    /* Hide the theme toggle button */
    .gr-theme-toggle,
    .gr-theme-toggle * {
        all: unset !important;
        display: none !important;
        width: 0 !important;
        height: 0 !important;
        overflow: hidden !important;
        pointer-events: none !important;
        visibility: hidden !important;
        margin: 0 !important;
        padding: 0 !important;
    }

    .gr-box > .gr-prose > label:has-text("Plot") {
        display: none !important;
    }
    """) as ui:

    gr.Markdown("# 🎥 Video/Audio Tech Expert Assistant")

    with gr.Row():

        # Chat box
        chatbot = gr.Chatbot(label=" ", height=300, type="messages")

    with gr.Row():

        # User input box
        user_input = gr.Textbox(label="📝 Write your message", placeholder="Type your message here and press Enter.")

    with gr.Row():

        # Clear-chat button and the button that toggles audio on/off
        clear_btn = gr.Button("🔄 Clear Chat")
        disable_audio_btn = gr.Button("🔈 Enable Audio")

    with gr.Row():

        # Model selector: defaults to 'qwen3:8b' when it is among the detected
        # models, otherwise to the first detected model
        model_selector = gr.Dropdown(
            choices=MODEL_OPTIONS,
            value='qwen3:8b' if 'qwen3:8b' in [name for name, _ in MODEL_OPTIONS] else MODEL_OPTIONS[0][0],
            label="🤖 Select Language model"
        )

        # Tone selector
        tone_selector = gr.Dropdown(
            choices=TONE_OPTIONS,
            value="formal",
            label="🎭Select Tone"
        )

        # Response-mode selector (its value is passed to answer_fn as `mode`)
        temperature_selector = gr.Dropdown(
            choices=TEMPERATURE_OPTIONS,
            value="conservative",
            label="🗣️ Select Response Mode"
        )

        # Voice used by handle_speech() when audio is enabled
        voice_selector = gr.Dropdown(
            choices=VOICE_OPTIONS,
            value="onyx",
            label="🎤 Assistant Voice"
        )

        # Mic input
        mic_input = gr.Audio(label="🎙️ Speak", type="filepath", interactive=True, sources=['microphone'])

    with gr.Row():

        # Logger panel: stream_logs() is started on page load and keeps yielding HTML
        log_output = gr.HTML(elem_id="log_output")
        ui.load(fn=stream_logs, inputs=[], outputs=log_output)

        # CPU and GPU utilization: live_plot() is started on page load and keeps yielding figures
        with gr.Column():
            plot_output = gr.Plot(label=" ")
        ui.load(fn=live_plot, inputs=[], outputs=plot_output)


    #aux = gr.Textbox(visible=False)
    # State for the chain: same keys as the dict returned by initialize_chain()
    lang_chain_state = gr.State({
        "chain": None,
        "model": None,
        "tone": None,
        "mode": None,
    })
    history_state = gr.State([])       # conversation as a list of message dicts
    answer_state = gr.State([])        # receives the second value returned by answer_fn (the answer text)
    audio_enabled = gr.State(False)    # flipped by toggle_audio()
    #theme_toggle = gr.Checkbox(label="Dark Theme", value=True)

    # When user submits by typing: record the question, answer it,
    # speak the answer if audio is enabled, then reset the microphone
    user_input.submit(
        fn=question_fn,
        inputs=[user_input, history_state],
        outputs=[chatbot, history_state, user_input]
    ).then(
        fn=answer_fn,
        inputs=[history_state, model_selector, tone_selector, temperature_selector, lang_chain_state],
        outputs=[chatbot, answer_state, lang_chain_state]
    ).then(
        fn=lambda answer, voice, enabled: handle_speech(answer, voice) if enabled else None,
        inputs=[answer_state, voice_selector, audio_enabled],
    ).then(
        fn=reset_mic_input,
        inputs=None,
        outputs=mic_input
    )

    # When user speaks: the transcription returned by handle_audio is written
    # to the message box (it is not submitted automatically)
    mic_input.change(
        fn=handle_audio,
        inputs=mic_input,
        outputs=user_input
    )

    # Clear button action
    clear_btn.click(
        fn=clear_chat,
        inputs=None,
        outputs=[chatbot, history_state, user_input, lang_chain_state]
    )

    # Audio toggle: updates the microphone, the button caption and the flag
    disable_audio_btn.click(
        fn=toggle_audio,
        inputs=[audio_enabled],
        outputs=[mic_input, disable_audio_btn, audio_enabled]
    )

# Start the app in a new browser tab instead of inline in the notebook
ui.launch(inbrowser=True, inline=False, height=1000)

<span style="color:forestgreen; font-weight:bold;">[INFO]</span> Starting Ollama server...
<span style="color:forestgreen; font-weight:bold;">[INFO]</span> Ollama is now running.


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [9]:
import pandas as pd
import plotly.graph_objects as go

# One row per LLM, with the two answers being compared
df = pd.DataFrame({
    '': ['deepseek-r1:8b', 'gemma3:4b', 'gpt-4o-mini', 'llama3:8b', 'mistral:7b', 'olmo2:7b', 'qwen3:8b'],
    'Expert Database (RAG)': ['VR/360° video', 'AV1', 'VVC', 'VVC', 'HEVC', 'VVC', 'VVC'],
    'Internal Knowledge': ['AI-driven coding', '8K', '8K', 'VVC', 'VVC', 'VVC', 'AI-driven coding']
})

# Display names for the columns; the first stays empty so the top-left cell is blank
df.columns = ["", "1️⃣ RAG-Based Answer", "2️⃣ Internal Knowledge Answer"]
header_values = list(df.columns)
header_values[0] = ""  # Blank top-left cell

# Colors
header_color = '#0077B5'
first_col_color = header_color
other_cells_color = 'white' #'gainsboro'

# Fill colors, one list per column
cell_fill_colors = [
    [first_col_color] * len(df),        # First column
    [other_cells_color] * len(df),      # Second column
    [other_cells_color] * len(df)       # Third column
]

# Font colors, one list per column
cell_font_colors = [
    ['white'] * len(df),
    ['black'] * len(df),
    ['black'] * len(df)
]

# Header fill colors — set first cell to white to hide it
header_fill_colors = ['white', header_color, header_color]

# Header font colors — set first cell to white to hide text
header_font_colors = ['white', 'white', 'white']

# Header line colors — set first cell to white to hide border
header_line_colors = ['white', 'black', 'black']

# Plotly Table
fig = go.Figure(data=[go.Table(
    columnwidth=[110, 195, 195],
    header=dict(
        values=header_values,
        fill_color=header_fill_colors,
        font=dict(color=header_font_colors, size=12),
        align='center',
        # Borders use the line colors defined above (the font colors were passed here before)
        line=dict(color=header_line_colors, width=1)
    ),
    cells=dict(
        values=[df[col] for col in df.columns],
        fill_color=cell_fill_colors,
        font=dict(color=cell_font_colors),
        align='center',
        line=dict(color='white', width=1)
    )
)])

# Title; update_layout returns the figure, so as the cell's last expression it is displayed
fig.update_layout(
    title_text="Predicted High-Impact Video Technologies by LLMs",
    title_x=0.5
)

#fig.write_image("high_impact_video_tech_llm_3_.png", scale=2)


In [10]:
import pandas as pd
import matplotlib.pyplot as plt
# One record per LLM: (model, answer with the expert database, answer from internal knowledge)
llm_predictions = [
    ('deepseek-r1:8b', 'VR/360° video', 'AI-driven coding'),
    ('gemma3:4b', 'AV1', '8K'),
    ('gpt-4o-mini', 'VVC', '8K'),
    ('llama3:8b', 'VVC', 'VVC'),
    ('mistral:7b', 'HEVC', 'VVC'),
    ('olmo2:7b', 'VVC', 'VVC'),
    ('qwen3:8b', 'VVC', 'AI-driven coding'),
]
df = pd.DataFrame(
    llm_predictions,
    columns=['LLM', 'Expert Database (RAG)', 'Internal Knowledge'],
)
df

Unnamed: 0,LLM,Expert Database (RAG),Internal Knowledge
0,deepseek-r1:8b,VR/360° video,AI-driven coding
1,gemma3:4b,AV1,8K
2,gpt-4o-mini,VVC,8K
3,llama3:8b,VVC,VVC
4,mistral:7b,HEVC,VVC
5,olmo2:7b,VVC,VVC
6,qwen3:8b,VVC,AI-driven coding


In [11]:
# Draw the DataFrame as a table on an otherwise empty axes
fig, ax = plt.subplots(figsize=(8, 2))  # Adjust size as needed
ax.axis('off')
table = ax.table(cellText=df.values, colLabels=df.columns, loc='center')

# Fixed 10 pt font, rows stretched vertically for better spacing
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.5)

# Write the figure to a PNG, then release it
fig.savefig("dataframe_table.png", bbox_inches='tight', dpi=300)
plt.close(fig)

In [12]:
import pandas as pd
import plotly.graph_objects as go

# Header row: white 12 pt text on blue with a black border
header_style = dict(
    values=list(df.columns),
    fill_color='#0077B5',
    align='center',
    line=dict(color='black', width=1),
    font=dict(color='white', size=12),
)

# Body cells: one list of values per column, white fill with a black border
cell_style = dict(
    values=[df[col] for col in df.columns],
    fill_color='white',
    align='center',
    line=dict(color='black', width=1),
)

fig = go.Figure(data=[go.Table(header=header_style, cells=cell_style)])

# Add a centered title to the figure
fig.update_layout(
    title_text="Predicted High-Impact Video Technologies by LLMs",
    title_x=0.5,
)

#fig.write_image("high_impact_video_tech_llm_.png", scale=2)