In [None]:
# %pip install chromadb google-genai sentence-transformers pandas python-dotenv

In [None]:
import os
import chromadb
import pandas as pd
from google import genai
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

In [None]:
# Read the log file and keep only the first 10 rows as a small working sample.
df = pd.read_csv("cybersecurity_threat_detection_logs.csv")
df = df.head(10)
# Convert ALL columns to string type to handle mixed data (numbers, dates, text).
# Missing values are blanked BEFORE the cast: casting first can turn NaN into the
# literal text "nan", which fillna('') then no longer recognises as missing, so
# "nan" tokens would end up in the indexed documents.
df = df.fillna('').astype(str)

def chunk_logs(df: pd.DataFrame, chunk_size: int = 5) -> list:
    """Group consecutive rows of ``df`` into newline-separated text blocks.

    Each row is rendered as its cell values joined with ' | '. Every
    ``chunk_size`` consecutive rows are then joined with newlines to form one
    chunk; the final chunk may contain fewer rows.

    Args:
        df: DataFrame whose cells are strings (each row is joined through the
            pandas ``.str`` accessor).
        chunk_size: Number of rows per chunk. Must be positive.

    Returns:
        A list with one text block per chunk, in row order. Empty when ``df``
        has no rows.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # Fail loudly: a zero step makes range() raise an unrelated-looking error,
    # and a negative step would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    for start in range(0, len(df), chunk_size):
        subset = df.iloc[start:start + chunk_size]
        # One "a | b | c" line per row of the slice.
        row_texts = subset.apply(lambda row: row.str.cat(sep=' | '), axis=1)
        chunks.append("\n".join(row_texts))
    return chunks

# Text chunks (several log rows each) that get indexed below; uses the
# function's default chunk size.
sentences = chunk_logs(df=df)

In [None]:
# chromadb client (default settings)
chroma_client = chromadb.Client()

# chromadb collection
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when "logs_collection" already exists, which happens as soon as the
# cell is executed a second time in the same kernel.
collection = chroma_client.get_or_create_collection(name="logs_collection")

In [None]:
# Add or update one entry per text chunk in the collection.
# Ids are "id<position>"; the "row_index" metadata value is that same chunk
# position (the index of the chunk in `sentences`, not a DataFrame row number).
# No embeddings are supplied here, so the collection computes them itself.
collection.upsert(
    documents=sentences,
    ids=[f"id{position}" for position, _ in enumerate(sentences)],
    metadatas=[{"row_index": position} for position, _ in enumerate(sentences)],
)

In [None]:
# Gemini client: load variables from a .env file (if any) into the process
# environment, then hand GEMINI_API_KEY to the client (None when it is unset).
load_dotenv()
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

In [None]:
# Sentence-embedding model ("thenlper/gte-small") for encoding text into vectors.
# NOTE(review): vectors produced by this model are only comparable with the
# vectors stored in the collection if the documents were embedded with the same
# model - confirm before passing them to the collection as query embeddings.
model = SentenceTransformer("thenlper/gte-small")

In [None]:
# rag function
def rag_query(question: str, n_results: int = 3) -> str:
    """Answer ``question`` with Gemini, grounded in log chunks retrieved from Chroma.

    Uses the module-level ``collection`` (Chroma) for retrieval and the
    module-level ``client`` (Gemini) for generation.

    Args:
        question: The user's question, passed to the retriever and embedded
            verbatim in the prompt.
        n_results: Maximum number of chunks to retrieve as context.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when the response carries no text.
    """
    # Step 1 + 2: embed the query and retrieve top-k docs.
    # The question is passed as text so the collection embeds it with the same
    # embedding function it used for the stored documents. Supplying vectors
    # from a different model would compare points from two unrelated vector
    # spaces and make the ranking meaningless.
    results = collection.query(
        query_texts=[question],
        n_results=n_results,
        include=['documents','metadatas']
    )

    # One result list per query; there is exactly one query here.
    retrieved_docs = results["documents"][0]
    # build context
    context = "\n\n".join(retrieved_docs)

    # Step 3: build prompt
    prompt = f"""
    You are a cybersecurity threat analyst. You are given structured security log excerpts below.
    Analyze them carefully to answer the user query with precise, factual evidence only.
    Avoid assumptions; if uncertain, say "insufficient evidence".

    Context:
    {context}

    Question: {question}

    Instructions:
    1. Identify timestamps and event types relevant to the question.
    2. Use only context to determine threat severity or pattern.
    3. Summarize clearly and concisely.

    Answer:
    """

    # Step 4: call Gemini API
    response = client.models.generate_content(
        model = "gemini-2.5-flash",
        contents = prompt
    )
    # response.text can be None when the response contains no text part;
    # fall back to an empty string instead of failing on None.strip().
    return (response.text or "").strip()


In [None]:
# Read one question from the user and print the generated answer.
query = input("Ask a question: ")
print(rag_query(question=query))