<a href="https://colab.research.google.com/github/seconlon99/EUProjGeo/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installations

In [1]:
!pip install --upgrade google-generativeai
!pip install chromadb
!pip install dotenv



Imports

In [2]:
import numpy as np
import pandas as pd
import random
import math
import time
import json # For handling function call arguments and results

import google.generativeai as genai
# IMPORTANT: We only need to import FunctionDeclaration
from google.generativeai.types import FunctionDeclaration
from google.generativeai.types import Tool # Available in types
from google.generativeai import protos # FunctionDeclaration and Schema are in protos

# NEW: Import the generative_models module for lower-level content construction
import google.generativeai.generative_models as glm # <<<--- ADD THIS IMPORT
import os
from dotenv import load_dotenv
import chromadb
from chromadb.utils import embedding_functions

#connecting to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# API key setup: the key is read from Colab's userdata store instead of being hardcoded here.
from google.colab import userdata

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY') # Make sure you have set GOOGLE_API_KEY in colab userdata

# Configure the Gemini API with that key for the google.generativeai module.
genai.configure(api_key=GOOGLE_API_KEY)

# --- Tunable settings used by the cells below ---
# CSV that main() loads with pd.read_csv.
CSV_FILE_PATH = "/content/drive/My Drive/MDA/merged_organization_project_data.csv" # <--- IMPORTANT: Change this to your actual CSV file path
# Directory handed to chromadb.PersistentClient in setup_chroma_db.
CHROMA_DB_PATH = "/content/drive/My Drive/MDA/chroma_db"
# Name of the ChromaDB collection that is created or retrieved.
COLLECTION_NAME = "project_data"
# Model name passed to GoogleGenerativeAiEmbeddingFunction.
EMBEDDING_MODEL_NAME = "text-embedding-004"
# Model name passed to genai.GenerativeModel in main().
GENERATION_MODEL_NAME = 'gemini-1.5-pro' # Use gemini-1.5-pro or gemini-1.5-flash
# Default `top_k` of retrieve_relevant_info.
TOP_K_RESULTS = 5 # Number of relevant chunks to retrieve from the vector DB
# NOTE(review): not referenced by any cell visible in this part of the notebook - confirm where it is applied.
MAX_HISTORY_TURNS = 5 # Keep only the last N user-assistant turns

The column list below should be adjusted to the fields we want to include in each embedded document. The label given to each column can also be reworded to describe what the value means, which may improve retrieval quality from the vector store.

In [31]:
# --- 2. Helper Functions ---

def create_text_from_row(row: pd.Series) -> str:
    """
    Creates a descriptive text string from a DataFrame row for embedding.

    Every field is rendered as a 'Label: value' pair and the pairs are joined
    with ', '. Customize the (label, column) list below to match your CSV.

    Args:
        row: One row of the project DataFrame. Columns that are absent, or whose
            value is missing (None / NaN / NaT), are rendered as 'N/A'.

    Returns:
        A single-line string of comma-separated 'Label: value' pairs.
    """
    def _is_missing(value) -> bool:
        # pd.isna returns an array for list-like values; bool() on that raises,
        # in which case the value is present and is kept as-is.
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def _text(*column_names: str):
        # First present, non-missing value among the candidate column names.
        for column_name in column_names:
            value = row.get(column_name)
            if not _is_missing(value):
                return value
        return 'N/A'

    def _number(column_name: str):
        # Numeric fields are normalised to float; anything unparsable becomes 'N/A'.
        try:
            number = float(_text(column_name))
        except (ValueError, TypeError):
            return 'N/A'
        return 'N/A' if pd.isna(number) else number

    fields = [
        ("City", _text('city')),
        ("Contact Form", _text('contactForm')),
        ("Content Update Date", _text('contentUpdateDate_x')),
        ("Country", _text('country')),
        ("EC Contribution", _number('ecContribution')),
        ("Geolocation", _text('geolocation')),
        ("Name", _text('name')),
        ("Net EC Contribution", _number('netEcContribution')),
        ("Organization URL", _text('organizationURL')),
        ("Post Code", _text('postCode')),
        ("Project ID", _text('projectID')),
        ("Role", _text('role')),
        ("Street", _text('street')),
        ("Total Cost of the project", _number('totalCost_x')),
        ("Cultural Distance", _text('cultdist')),
        ("Cultural Distance Std", _text('cultdist_std')),
        ("Country Name", _text('Countryname')),
        ("EC Max Contribution", _number('ecMaxContribution')),
        ("EC Signature Date", _text('ecSignatureDate')),
        ("End Date", _text('endDate')),
        ("Start Date", _text('startDate')),
        ("Status", _text('status')),
        ("Title", _text('title')),
        ("Goal", _text('goal')),
        ("Method", _text('method')),
        ("Generalized Goal", _text('generalized_goal')),
        # The column is spelled with an underscore in the dummy data; the
        # space-separated spelling is kept as a fallback for existing CSVs.
        ("Generalized Method", _text('generalized_method', 'generalized method')),
        ("Tech Domain", _text('tech_domain')),
        ("Strategic Method", _text('strategic_method')),
        ("Standardized Domain Area", _text('standardized_domain_area')),
        ("Standardized Method Area", _text('standardized_method_area')),
        ("Final Domain", _text('final_domain')),
        ("Final Method", _text('final_method')),
        ("Broad Domain", _text('broad_domain')),
        ("Broad Method", _text('broad_method')),
    ]
    return ", ".join(f"{label}: {value}" for label, value in fields)

In [5]:
def setup_chroma_db(documents: list, ids: list, embedding_function_instance: embedding_functions.EmbeddingFunction):
    """
    Sets up ChromaDB, creates or gets a collection, and adds missing documents.

    Args:
        documents: Texts to embed, one entry per row.
        ids: Unique ID for each entry of `documents` (same order, same length).
        embedding_function_instance: Embedding function attached to the collection.
            When None, a GoogleGenerativeAiEmbeddingFunction is built from
            GOOGLE_API_KEY and EMBEDDING_MODEL_NAME.

    Returns:
        The ChromaDB collection, or None if the inputs are inconsistent or the
        collection could not be created.
    """
    if len(documents) != len(ids):
        print(f"FATAL: Got {len(documents)} documents but {len(ids)} ids; they must match one-to-one.")
        return None

    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

    # Honour the embedding function supplied by the caller; only build a default
    # one when none was given.
    embedding_func = embedding_function_instance
    if embedding_func is None:
        embedding_func = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
            api_key=GOOGLE_API_KEY, model_name=EMBEDDING_MODEL_NAME
        )

    try:
        collection = client.get_or_create_collection(
            name=COLLECTION_NAME,
            embedding_function=embedding_func # Use the configured embedding function
        )
        print(f"Collection '{COLLECTION_NAME}' created or retrieved.")
    except Exception as e:
        print(f"Error getting/creating collection: {e}")
        # If collection exists with different embedding function, delete and recreate.
        # NOTE: this discards every embedding already stored in the collection.
        try:
            client.delete_collection(name=COLLECTION_NAME)
            collection = client.get_or_create_collection(
                name=COLLECTION_NAME,
                embedding_function=embedding_func
            )
            print(f"Collection '{COLLECTION_NAME}' deleted and recreated with new embedding function.")
        except Exception as recreate_error:
            print(f"FATAL: Could not get or create collection. Error: {recreate_error}")
            return None

    # Check if data already exists to avoid re-embedding everything on every run
    existing_count = collection.count()
    if existing_count >= len(documents):
        print(f"ChromaDB already contains {existing_count} documents. Skipping embedding.")
        return collection

    # A partially filled collection only needs the documents it does not have yet;
    # re-adding known IDs would pay for their embeddings a second time.
    pending_docs, pending_ids = list(documents), list(ids)
    if existing_count > 0:
        try:
            existing_ids = set(collection.get(include=[])["ids"])
        except Exception as lookup_error:
            print(f"Could not list existing IDs ({lookup_error}); re-adding all documents.")
            existing_ids = set()
        pending_docs = [doc for doc, doc_id in zip(documents, ids) if doc_id not in existing_ids]
        pending_ids = [doc_id for doc_id in ids if doc_id not in existing_ids]

    print(f"Adding {len(pending_docs)} documents to ChromaDB. This may take a while for 100k rows...")
    batch_size = 500 # Adjust batch size based on your system's memory and API rate limits
    total_batches = -(-len(pending_docs) // batch_size) # ceiling division
    failed_batches = 0
    for i in range(0, len(pending_docs), batch_size):
        try:
            collection.add(
                documents=pending_docs[i:i + batch_size],
                ids=pending_ids[i:i + batch_size]
            )
            print(f"Added batch {i//batch_size + 1}/{total_batches}")
        except Exception as e:
            # Best effort: keep going so one bad batch does not abort the whole load.
            failed_batches += 1
            print(f"Error adding batch starting at index {i}: {e}")
            # You might want to implement retry logic here

    if failed_batches:
        print(f"Finished adding documents to ChromaDB with {failed_batches} failed batch(es).")
    else:
        print("Finished adding documents to ChromaDB.")

    return collection

In [6]:
def retrieve_relevant_info(query: str, collection, top_k: int = TOP_K_RESULTS) -> list:
    """
    Look up the documents in `collection` that best match `query`.

    Args:
        query: Free-text query passed to the collection as its only query text.
        collection: Object exposing `query(query_texts=..., n_results=...)`.
        top_k: Number of results requested from the collection.

    Returns:
        The document texts returned for the query, or an empty list when the
        collection returned nothing.
    """
    hits = collection.query(query_texts=[query], n_results=top_k)
    if not hits:
        return []

    documents_per_query = hits['documents']
    if not documents_per_query:
        return []

    # Only one query text was sent, so its matches are the first entry.
    return documents_per_query[0]

In [62]:
def build_rag_prompt(user_query: str, retrieved_context_current: list, conversation_history_genai_format: list = None) -> str:
    """
    Constructs the prompt for the Gemini model, incorporating retrieved context and conversation history.

    Args:
        user_query: The question asked in the current turn.
        retrieved_context_current: Document texts retrieved for the current query.
            Empty or None produces a "nothing found" placeholder.
        conversation_history_genai_format: Optional list of dicts shaped like
            {"role": ..., "parts": [{"text": ...}, ...]} (the shape produced by
            format_history_for_chat). Parts without a 'text' key are ignored.

    Returns:
        The full prompt text. The history and the retrieved context each appear
        exactly once, inside their own delimited section.
    """
    context_str_current = "\n".join(f"- {doc}" for doc in (retrieved_context_current or []))
    if not context_str_current:
        context_str_current = "No specific relevant information found for the current query."

    # One "Role: text" line per turn; the section delimiters live in the template
    # below so they are not emitted twice.
    history_lines = []
    for turn in conversation_history_genai_format or []:
        role = turn.get('role', 'unknown').capitalize()
        texts = [part['text'] for part in turn.get('parts', []) if 'text' in part]
        history_lines.append(f"{role}: {' '.join(texts).strip()}")
    history_str = "\n".join(history_lines) if history_lines else "No previous conversation."

    prompt = f"""
    You are an expert assistant designed to answer questions about projects funded by the European Commission (EC).
    Your knowledge base includes details on institutions, countries, cultural distances, and specific project data.
    All project information is structured as 'Column Name: Value' pairs.

    --- Conversation History ---
    {history_str}
    --- End Conversation History ---

    --- Relevant Project Data (from RAG search for current query) ---
    {context_str_current}
    --- End Relevant Project Data ---

    Instructions:
    1.  **Prioritize Tools:** If a user's question requires a specific lookup, count, or calculation (e.g., "highest contribution," "projects in X city," "info on project ID Y"), you **must** call the appropriate tool.
    2.  **Use Provided Data (RAG Context):** For general questions or those not covered by tools, use the "Relevant Project Data" provided above.
    3.  **State Limitations:** If the answer is not available in the provided data or through any available tool, clearly state that you do not have sufficient information to answer the question.
    4.  **Be Concise and Factual:** Provide direct answers without fabricating information.

    Question: {user_query}
    Answer:
    """
    return prompt

The following query functions operate on the project DataFrame and are exposed to Gemini as callable tools.

In [8]:
# These functions will operate on your DataFrame (df)
# We will pass df into these functions in the main loop
# Ensure columns used in these functions exist and are clean in your actual CSV data.

def get_projects_by_city(df: pd.DataFrame, city_name: str) -> str:
    """
    Retrieves a list of project IDs and organization names for projects located in a specified city.

    Matching is a case-insensitive, literal substring match on the 'city'
    column; the query is never interpreted as a regular expression.

    Args:
        df: The DataFrame containing project data (must have a 'city' column).
        city_name: The name of the city to filter by.
    Returns:
        A string summarizing projects found (at most 10 are listed) or a message if none.
    """
    city_name = str(city_name or "").strip() # Clean input
    if not city_name:
        # An empty needle would match every row.
        return "No city name provided."

    # fillna('') keeps missing cities from turning into the literal text "nan".
    city_matches = df['city'].fillna('').astype(str).str.contains(
        city_name, case=False, na=False, regex=False
    )
    filtered_df = df[city_matches]

    if filtered_df.empty:
        return f"No projects found in {city_name}."

    preview_limit = 10 # Limit to first 10 for brevity
    project_info = [
        f"Project ID: {row.get('projectID', 'N/A')}, Org: {row.get('name', 'N/A')}"
        for _, row in filtered_df.head(preview_limit).iterrows()
    ]

    total_count = len(filtered_df)
    summary = f"Found {total_count} projects in {city_name}. Here are some of them:\n" + "\n".join(project_info)
    if total_count > preview_limit:
        summary += f"\n...and {total_count - preview_limit} more."
    return summary

In [9]:
def count_projects_by_city(df: pd.DataFrame, city_name: str) -> str:
    """
    Counts the rows of `df` whose 'city' contains the given city name.

    Matching is a case-insensitive, literal substring match; the query is never
    interpreted as a regular expression.
    NOTE(review): every matching row is counted; if the CSV holds one row per
    participating organization, a project can be counted more than once - confirm.

    Args:
        df: The DataFrame containing project data (must have a 'city' column).
        city_name: The name of the city to count projects for.
    Returns:
        A string stating the count of projects.
    """
    city_name = str(city_name or "").strip()
    if not city_name:
        # An empty needle would match every row.
        return "No city name provided."

    # fillna('') keeps missing cities from turning into the literal text "nan".
    city_matches = df['city'].fillna('').astype(str).str.contains(
        city_name, case=False, na=False, regex=False
    )
    count = int(city_matches.sum())
    return f"There are {count} projects in {city_name}."

In [32]:
def get_highest_contribution_project(df: pd.DataFrame, year: int = None) -> str:
    """
    Finds the project with the highest EC Contribution, optionally filtered by a specific year.
    """
    temp_df = df.copy()

    temp_df['ecContribution_numeric'] = pd.to_numeric(temp_df['ecContribution'], errors='coerce')
    temp_df.dropna(subset=['ecContribution_numeric'], inplace=True)

    if 'endDate' in temp_df.columns:
        temp_df['end_year'] = pd.to_datetime(temp_df['endDate'], errors='coerce').dt.year
        temp_df.dropna(subset=['end_year'], inplace=True)
    else:
        temp_df['end_year'] = None

    if year is not None: # Check if year was provided
        try:
            year_int = int(year) # <--- Explicitly cast to integer here
        except (ValueError, TypeError):
            return "Invalid year provided. Please provide a valid integer year."

        if 'end_year' in temp_df.columns:
            temp_df = temp_df[temp_df['end_year'] == year_int] # Compare with integer year
        else:
            return f"Cannot filter by year {year_int}: 'endPeriod' column not found or invalid in data."

    if temp_df.empty:
        return f"No projects found {'in ' + str(year_int) + ' with valid EC Contribution' if year else 'with valid EC Contribution'}." # Adjusted message for year_int

    highest_contrib_project = temp_df.loc[temp_df['ecContribution_numeric'].idxmax()]

    project_id = highest_contrib_project.get('projectID', 'N/A')
    org_name = highest_contrib_project.get('name', 'N/A')
    ec_contribution = highest_contrib_project.get('ecContribution_numeric', 'N/A')
    city = highest_contrib_project.get('city', 'N/A')
    country = highest_contrib_project.get('country', 'N/A')
    end_year = int(highest_contrib_project.get('end_year', 'N/A')) if pd.notna(highest_contrib_project.get('end_year')) else 'N/A'

    return (
        f"The project with the highest EC Contribution {'in ' + str(year_int) if year is not None else 'overall'} is:\n"
        f"Project ID: {project_id}\n"
        f"Organization: {org_name}\n"
        f"EC Contribution: {ec_contribution}\n"
        f"City: {city}\n"
        f"Country: {country}\n"
        f"End Year: {end_year}" # CORRECTED: Reference End Year
    )

In [61]:
def get_highest_contribution_by_country(df: pd.DataFrame, country_name: str) -> str:
    """
    Finds the project with the highest EC Contribution in a specified country.

    The query is compared case-insensitively against the 'country' and
    'Countryname' columns. Exact matches win; only when there is none, and the
    query is longer than a country code, is a literal substring match on
    'Countryname' tried (so "DE" does not match "Sweden").

    Args:
        df: The DataFrame containing project data. 'ecContribution' is coerced to
            numeric; rows where that fails are ignored.
        country_name: Country name or code to look for.
    Returns:
        A multi-line description of the winning row, or a message if nothing matches.
    """
    country_name_lower = str(country_name or "").strip().lower()
    if not country_name_lower:
        return "No country name provided."
    if 'ecContribution' not in df.columns:
        return "Cannot determine the highest EC Contribution: 'ecContribution' column not found in data."

    candidates = df.assign(ecContribution_numeric=pd.to_numeric(df['ecContribution'], errors='coerce'))
    candidates = candidates[candidates['ecContribution_numeric'].notna()]

    def _normalized(column_name: str) -> pd.Series:
        # Lower-cased, stripped text of a column; all-empty when the column is absent.
        if column_name not in candidates.columns:
            return pd.Series('', index=candidates.index, dtype=object)
        return candidates[column_name].fillna('').astype(str).str.strip().str.lower()

    country_codes = _normalized('country')
    country_names = _normalized('Countryname')

    matches = (country_codes == country_name_lower) | (country_names == country_name_lower)
    if not matches.any() and len(country_name_lower) > 3:
        # Fallback for partial names such as "korea"; short queries are treated as codes.
        matches = country_names.str.contains(country_name_lower, regex=False)

    filtered_df = candidates[matches]
    if filtered_df.empty:
        return f"No projects found with valid EC Contribution in {country_name}."

    # Positional lookup stays correct even if the index has duplicate labels.
    top_position = int(filtered_df['ecContribution_numeric'].to_numpy().argmax())
    highest_contrib_project = filtered_df.iloc[top_position]

    project_id = highest_contrib_project.get('projectID', 'N/A')
    project_name = highest_contrib_project.get('name', 'N/A')
    ec_contribution = highest_contrib_project.get('ecContribution_numeric', 'N/A')
    city = highest_contrib_project.get('city', 'N/A')
    country_found = highest_contrib_project.get('country', highest_contrib_project.get('Countryname', 'N/A'))

    return (
        f"The project with the highest EC Contribution in {country_found} is:\n"
        f"Project ID: {project_id}\n"
        f"Institution Name: {project_name}\n"
        f"EC Contribution: {ec_contribution}\n"
        f"City: {city}"
    )

In [52]:
def get_project_info_by_id(df: pd.DataFrame, project_id: str) -> str:
    """
    Build a multi-line summary of the first row whose project ID matches.

    Args:
        df: The DataFrame containing project data (must have a 'projectID' column).
        project_id: ID to look up; it is stringified and stripped before comparing.
    Returns:
        A header line followed by one indented 'Label: value' line per field
        ('N/A' for absent columns), or a not-found message.
    """
    wanted_id = str(project_id).strip()

    # Compare IDs as text so numeric and string IDs match consistently.
    projects = df.assign(projectID=df['projectID'].astype(str))
    matching_rows = projects[projects['projectID'] == wanted_id]

    if matching_rows.empty:
        return f"No project found with ID: {project_id}."

    record = matching_rows.iloc[0] # First (and hopefully only) matching row

    # (label, value) pairs in display order; extend this list to show more fields.
    details = [
        ("Name", record.get('name', 'N/A')),
        ("Title", record.get('title', 'N/A')),
        ("Status", record.get('status', 'N/A')),
        ("EC Contribution", record.get('ecContribution', 'N/A')),
        ("Net EC Contribution", record.get('netEcContribution', 'N/A')),
        ("Total Cost", record.get('totalCost_x', 'N/A')),
        ("Max EC Contribution", record.get('ecMaxContribution', 'N/A')),
        ("Country", record.get('Countryname', record.get('country', 'N/A'))),
        ("City", record.get('city', 'N/A')),
        ("Start Date", record.get('startDate', 'N/A')),
        ("End Date", record.get('endDate', 'N/A')),
        ("Role", record.get('role', 'N/A')),
        ("Goal", record.get('goal', 'N/A')),
        ("Method", record.get('method', 'N/A')),
        ("Tech Domain", record.get('tech_domain', 'N/A')),
        ("Organization URL", record.get('organizationURL', 'N/A')),
        ("Geolocation", record.get('geolocation', 'N/A')),
    ]

    header = f"Project Details for ID: {record.get('projectID', 'N/A')}\n"
    return header + "".join(f"  {label}: {value}\n" for label, value in details)

In [54]:
#  --- NEW: Define tools for Gemini ---
# These are the declarations that Gemini will use to understand when to call your functions.
# Ensure the names, descriptions, and parameters match your Python functions.
def _param(param_type, description):
    """Schema for one tool argument of the given proto type."""
    return protos.Schema(type=param_type, description=description)


def _single_function_tool(name, description, properties, required):
    """Wrap one function declaration (object-typed parameters) in its own Tool."""
    declaration = protos.FunctionDeclaration(
        name=name,
        description=description,
        parameters=protos.Schema(
            type=protos.Type.OBJECT,
            properties=properties,
            required=required,
        ),
    )
    return Tool(function_declarations=[declaration])


# One Tool per Python query function; each declared name matches the function
# defined earlier in the notebook with the same name.
tools = [
    _single_function_tool(
        'get_projects_by_city',
        'Retrieves a list of project IDs and project names for projects in a specified city.',
        {'city_name': _param(protos.Type.STRING, 'The name of the city (e.g., "London", "Paris").')},
        ['city_name'],
    ),
    _single_function_tool(
        'count_projects_by_city',
        'Counts the total number of projects located in a specified city.',
        {'city_name': _param(protos.Type.STRING, 'The name of the city to count projects for (e.g., "London", "Paris").')},
        ['city_name'],
    ),
    _single_function_tool(
        'get_highest_contribution_project',
        'Finds the project with the highest European Commission (EC) Contribution. Can optionally filter by end year.',
        {'year': _param(protos.Type.INTEGER, 'Optional: The specific year to filter projects by. For example, 2021.')},
        [],  # `year` is optional
    ),
    _single_function_tool(
        'get_highest_contribution_by_country',
        'Finds the project with the highest European Commission (EC) Contribution in a specified country.',
        {'country_name': _param(protos.Type.STRING, 'The name or code of the country (e.g., "Germany", "DE", "France").')},
        ['country_name'],
    ),
    _single_function_tool(
        'get_project_info_by_id',
        'Retrieves detailed information about a project given its unique project ID.',
        {'project_id': _param(protos.Type.STRING, 'The unique ID of the project (e.g., "101113245").')},
        ['project_id'],
    ),
]



In [55]:
def format_history_for_chat(history_list: list):
    """
    Expand stored turns into alternating user/model chat messages.

    Args:
        history_list: Turns as dicts with 'user_query' and 'assistant_response' keys.
    Returns:
        A flat list with two {"role": ..., "parts": [{"text": ...}]} dicts per
        turn: the user message first, then the model message.
    """
    def _message(role, text):
        return {"role": role, "parts": [{"text": text}]}

    return [
        message
        for turn in history_list
        for message in (
            _message("user", turn["user_query"]),
            _message("model", turn["assistant_response"]),
        )
    ]

In [65]:
def main():
    # --- Load Data ---
    print(f"Loading data from {CSV_FILE_PATH}...")
    try:
        # Using nrows=400 as indicated by the user for testing
        df = pd.read_csv(CSV_FILE_PATH, sep=',', nrows=400)
        print(f"Loaded {len(df)} rows.")
        if 'projectID' not in df.columns:
            df['projectID'] = df.index # Use row index as ID if 'projectID' column is missing
            print("Warning: 'projectID' column not found, using row index as ID.")

        # Ensure numeric columns are numeric, converting errors to NaN
        df['ecContribution'] = pd.to_numeric(df['ecContribution'], errors='coerce')
        df['netEcContribution'] = pd.to_numeric(df['netEcContribution'], errors='coerce')
        df['totalCost_x'] = pd.to_numeric(df['totalCost_x'], errors='coerce')
        df['ecMaxContribution'] = pd.to_numeric(df['ecMaxContribution'], errors='coerce')

        # Ensure date columns are datetime objects
        df['endDate'] = pd.to_datetime(df['endDate'], errors='coerce')
        df['startDate'] = pd.to_datetime(df['startDate'], errors='coerce')
        df['contentUpdateDate_x'] = pd.to_datetime(df['contentUpdateDate_x'], errors='coerce')
        df['ecSignatureDate'] = pd.to_datetime(df['ecSignatureDate'], errors='coerce')

        # Fill N/A for string columns that are used in filtering if they might be missing
        for col in ['city', 'country', 'Countryname', 'name', 'title', 'projectID']: # Added projectID here for safety
            if col in df.columns:
                df[col] = df[col].fillna('').astype(str) # Fill NaN with empty string and ensure string type
            else:
                df[col] = '' # Create empty column if not present for consistency

    except FileNotFoundError:
        print(f"Error: {CSV_FILE_PATH} not found.")
        print("Please create a dummy CSV or provide the correct path to your 100k row CSV.")
        print("Creating a small dummy CSV for demonstration purposes...")

        dummy_data = {
            'projectID': [f'P{i:05d}' for i in range(100)],
            'name': [f'Project Name {i}' for i in range(100)],
            'title': [f'Project Title {i} (A Study on {i%5})' for i in range(100)],
            'city': ['London' if i % 5 == 0 else 'Paris' if i % 5 == 1 else 'Berlin' if i % 5 == 2 else 'Rome' if i % 5 == 3 else 'Madrid' for i in range(100)],
            'country': ['UK' if i % 5 == 0 else 'FR' if i % 5 == 1 else 'DE' if i % 5 == 2 else 'IT' if i % 5 == 3 else 'ES' for i in range(100)],
            'Countryname': ['United Kingdom' if i % 5 == 0 else 'France' if i % 5 == 1 else 'Germany' if i % 5 == 2 else 'Italy' if i % 5 == 3 else 'Spain' for i in range(100)],
            'ecContribution': [str(100000 + i * 1000) for i in range(100)],
            'netEcContribution': [str(90000 + i * 900) for i in range(100)],
            'totalCost_x': [str(200000 + i * 1500) for i in range(100)],
            'ecMaxContribution': [str(120000 + i * 1100) for i in range(100)],
            'endDate': [f'202{(i%5)+1}-12-31' for i in range(100)],
            'startDate': [f'202{(i%5)}-01-01' for i in range(100)],
            'status': ['Closed' if i % 2 == 0 else 'Open' for i in range(100)],
            'goal': [f'Goal {i}' for i in range(100)],
            'method': [f'Method {i}' for i in range(100)],
            'geolocation': [f'{50 + i/100},{10 + i/100}' for i in range(100)],
            'postCode': [f'PC{i}' for i in range(100)],
            'street': [f'Street {i}' for i in range(100)],
            'contactForm': [f'Form{i}' for i in range(100)],
            'organizationURL': [f'http://org{i}.com' for i in range(100)],
            'contentUpdateDate_x': [f'202{(i%5)+1}-01-01' for i in range(100)],
            'ecSignatureDate': [f'202{(i%5)+1}-03-15' for i in range(100)],
            'cultdist': [str(0.5 + i/1000) for i in range(100)],
            'cultdist_std': [str(0.1 + i/2000) for i in range(100)],
            'role': ['Coordinator' if i % 4 == 0 else 'Partner' for i in range(100)],
            'generalized_goal': [f'Gen Goal {i%3}' for i in range(100)],
            'generalized_method': [f'Gen Method {i%3}' for i in range(100)],
            'tech_domain': [f'Tech Domain {i%4}' for i in range(100)],
            'strategic_method': [f'Strat Method {i%2}' for i in range(100)],
            'standardized_domain_area': [f'Std Domain {i%5}' for i in range(100)],
            'standardized_method_area': [f'Std Method {i%5}' for i in range(100)],
            'final_domain': [f'Final Domain {i%2}' for i in range(100)],
            'final_method': [f'Final Method {i%2}' for i in range(100)],
            'broad_domain': [f'Broad Domain {i%3}' for i in range(100)],
            'broad_method': [f'Broad Method {i%3}' for i in range(100)],
        }
        df = pd.DataFrame(dummy_data)
        df.to_csv(CSV_FILE_PATH, index=False)
        print(f"Created dummy CSV file: {CSV_FILE_PATH} with {len(df)} rows.")

        # Re-apply type conversions for dummy data
        df['ecContribution'] = pd.to_numeric(df['ecContribution'], errors='coerce')
        df['netEcContribution'] = pd.to_numeric(df['netEcContribution'], errors='coerce')
        df['totalCost_x'] = pd.to_numeric(df['totalCost_x'], errors='coerce')
        df['ecMaxContribution'] = pd.to_numeric(df['ecMaxContribution'], errors='coerce')

        df['endDate'] = pd.to_datetime(df['endDate'], errors='coerce')
        df['startDate'] = pd.to_datetime(df['startDate'], errors='coerce')
        df['contentUpdateDate_x'] = pd.to_datetime(df['contentUpdateDate_x'], errors='coerce')
        df['ecSignatureDate'] = pd.to_datetime(df['ecSignatureDate'], errors='coerce')

        # Fill N/A for string columns for dummy data as well
        for col in ['city', 'country', 'Countryname', 'name', 'title', 'projectID']:
            if col in df.columns:
                df[col] = df[col].fillna('').astype(str) # Fill NaN with empty string and ensure string type


    # Prepare documents for ChromaDB
    documents = [create_text_from_row(row) for index, row in df.iterrows()]
    ids = [f"{str(row['projectID'])}_{index}" for index, row in df.iterrows()]

    # --- Setup ChromaDB ---
    print("Setting up ChromaDB...")
    google_embedding_function = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
        api_key=GOOGLE_API_KEY, model_name=EMBEDDING_MODEL_NAME
    )
    collection = setup_chroma_db(documents, ids, google_embedding_function)

    if collection is None:
        print("Failed to set up ChromaDB. Exiting.")
        return

    # --- Initialize Gemini for Generation with Tools ---
    generation_model = genai.GenerativeModel(GENERATION_MODEL_NAME, tools=tools)
    print(f"Initialized Gemini model: {GENERATION_MODEL_NAME} with tools.")

    # This 'conversation_history_for_rag' will be for *our* manual tracking and building RAG prompt history,
    # NOT directly for `start_chat` unless formatted.
    conversation_history_for_rag = []

    print("\nChatbot is ready! Ask questions about your project data.")
    print("Try: 'How many projects are in London?', 'What is the highest paid project?', 'What is the highest paid project in 2024?'")
    print("NEW: 'What is the highest paid project in Germany?', 'Tell me about project P00001'")
    print("Type 'exit' or 'quit' to end the session.")

    while True:
        user_query = input("\nYour question: ").strip()
        if user_query.lower() in ['exit', 'quit']:
            print("Goodbye!")
            break

        # Format history for start_chat at the beginning of each turn
        # This will convert our simple dicts to the required role/parts format
        formatted_chat_history = format_history_for_chat(conversation_history_for_rag)
        chat_session = generation_model.start_chat(history=formatted_chat_history)

        assistant_response = "I encountered an error." # Default in case of issues
        retrieved_context_current = [] # Initialize for current turn

        try:
            # Send the user's message to the chat session
            response = chat_session.send_message(user_query)

            # --- Tool Call Handling Logic ---
            # This handles a single tool call and its response, or a text response.

            if response.candidates and response.candidates[0].content.parts[0].function_call:
                tool_call = response.candidates[0].content.parts[0].function_call
                tool_name = tool_call.name
                tool_args = {k: v for k, v in tool_call.args.items()}

                print(f"Tool Call detected: {tool_name} with args {tool_args}")

                tool_output = None
                try:
                    if tool_name == 'get_projects_by_city':
                        tool_output = get_projects_by_city(df, **tool_args)
                    elif tool_name == 'count_projects_by_city':
                        tool_output = count_projects_by_city(df, **tool_args)
                    elif tool_name == 'get_highest_contribution_project':
                        tool_output = get_highest_contribution_project(df, **tool_args)
                    elif tool_name == 'get_highest_contribution_by_country':
                        tool_output = get_highest_contribution_by_country(df, **tool_args)
                    elif tool_name == 'get_project_info_by_id': # Handle new tool
                        tool_output = get_project_info_by_id(df, **tool_args)
                    else:
                        tool_output = f"Error: Unknown tool_code function: {tool_name}"

                    print(f"Tool Output: {tool_output}")

                    # Send the tool output back to the *same chat session*
                    tool_response_payload = {
                        "parts": [
                            {
                                "function_response": {
                                    "name": tool_name,
                                    "response": {"result": tool_output}
                                }
                            }
                        ]
                    }
                    # Get the model's response AFTER receiving the tool output
                    final_model_response_after_tool = chat_session.send_message(tool_response_payload)

                    # --- Handle model's response after tool execution ---
                    if final_model_response_after_tool.candidates and final_model_response_after_tool.candidates[0].content:
                        first_part = final_model_response_after_tool.candidates[0].content.parts[0]

                        if hasattr(first_part, 'text'): # It's a text response
                            assistant_response = first_part.text
                        elif hasattr(first_part, 'function_call'): # It's another tool call (chained call)
                            chained_tool_call = first_part.function_call
                            assistant_response = (f"Model requested another tool call: {chained_tool_call.name} "
                                                  f"with args {chained_tool_call.args}. "
                                                  f"The system is not yet set up for recursive tool calls in this scenario. "
                                                  f"Please rephrase your question or consider extending the tool handling logic.")
                        else:
                            assistant_response = "Model returned an unrecognized content type after tool execution."
                    else:
                        assistant_response = "Model did not return valid content after tool execution."

                except Exception as tool_e:
                    assistant_response = f"An error occurred while executing the tool ({tool_name}): {tool_e}"
                    print(assistant_response)

            else:
                # No tool call from the model, so proceed with RAG
                print(f"Searching for relevant information for: '{user_query}'...")
                retrieved_context_current = retrieve_relevant_info(user_query, collection, TOP_K_RESULTS)
                print(retrieved_context_current)

                if not retrieved_context_current:
                    print("No relevant context found for current query. Answering based on history or general knowledge.")

                final_rag_prompt = build_rag_prompt(user_query, retrieved_context_current, conversation_history_for_rag)

                # Send the RAG prompt *to the chat session*
                rag_response = chat_session.send_message(final_rag_prompt)
                assistant_response = rag_response.text

            print("\nChatbot:", assistant_response)

        except Exception as e:
            print(f"Error generating response from Gemini: {e}")
            print("Please check your API key, model limits, or prompt content.")
            assistant_response = "I encountered an error trying to generate a response."

        # --- Update conversation_history_for_rag for next loop iteration ---
        # Truncate history (both user and model turns) if it exceeds MAX_HISTORY_TURNS
        if len(conversation_history_for_rag) >= MAX_HISTORY_TURNS:
            conversation_history_for_rag.pop(0)

        # Append the current user query and assistant response for future RAG prompt building
        conversation_history_for_rag.append({
            "user_query": user_query,
            "assistant_response": assistant_response,
            "retrieved_context": retrieved_context_current
        })

# Entry point: start the interactive chatbot session implemented in main().
# The guard keeps the input() loop from starting if this code is imported as a
# module; when the cell/script is executed directly, main() runs immediately
# and blocks until the user types 'exit' or 'quit'.
if __name__ == "__main__":
    main()

Loading data from /content/drive/My Drive/MDA/merged_organization_project_data.csv...
Loaded 400 rows.
Setting up ChromaDB...
Collection 'project_data' created or retrieved.
ChromaDB already contains 400 documents. Skipping embedding.
Initialized Gemini model: gemini-1.5-pro with tools.

Chatbot is ready! Ask questions about your project data.
Try: 'How many projects are in London?', 'What is the highest paid project?', 'What is the highest paid project in 2024?'
NEW: 'What is the highest paid project in Germany?', 'Tell me about project P00001'
Type 'exit' or 'quit' to end the session.

Your question: What is the cultural distince between Germany and Austria?
Searching for relevant information for: 'What is the cultural distince between Germany and Austria?'...
['City: Wien, Contact Form: https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/contact-form/project/901064817/101130898, Content Update Date: 2023-10-13 14:41:04, Country: AT, EC Contribution: 258312.5, Geolo