# RAG Architecture Test

## Preparation

### Import Libraries and Credentials

In [8]:
import os
import config  # Import the config file
import requests
import duckdb
import google.generativeai as genai
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Set environment variables using values from config.py
os.environ["gemini_api_key"] = config.gemini_api
# Read the Gemini API key back from the environment; it is passed to genai.configure later
gemini_api_key = os.getenv("gemini_api_key")

### Load Prepared Data

In [6]:
# Open the existing DuckDB file read-only: nothing is written here, and a wrong
# path then fails immediately instead of silently creating an empty database.
con = duckdb.connect("combined_emission_data.duckdb", read_only=True)
try:
    # Load the prepared table (rag_summary text plus precomputed embeddings) into pandas
    df_prepared_data = con.execute("SELECT * FROM combined_data").fetchdf()
finally:
    # Always release the database file, even if the query raises
    con.close()

df_prepared_data

Unnamed: 0,rag_summary,embeddings
0,[TYPE: CARBON DATA]\nDocument: Full Carbon Dat...,"[-0.022816572338342667, 0.0530795119702816, -0..."
1,[TYPE: CARBON DATA]\nDocument: Full Carbon Dat...,"[-0.02335883118212223, 0.03286804258823395, -0..."
2,[TYPE: CARBON DATA]\nDocument: Full Carbon Dat...,"[-0.023150034248828888, 0.03248152881860733, -..."
3,[TYPE: CARBON DATA]\nDocument: Full Carbon Dat...,"[-0.018603665754199028, 0.04711500555276871, -..."
4,[TYPE: CARBON DATA]\nDocument: Full Carbon Dat...,"[-0.02122417651116848, 0.041178856045007706, -..."
...,...,...
5870,[TYPE: NEWS ARTICLE]\nDocument: Climate & Emis...,"[0.04372638091444969, 0.07870461791753769, 0.0..."
5871,[TYPE: NEWS ARTICLE]\nDocument: Climate & Emis...,"[-0.016487013548612595, -0.07614518702030182, ..."
5872,[TYPE: NEWS ARTICLE]\nDocument: Climate & Emis...,"[0.01761428453028202, 0.03581300377845764, 0.0..."
5873,[TYPE: NEWS ARTICLE]\nDocument: Climate & Emis...,"[0.012168467044830322, -0.0007799360901117325,..."


## Run the RAG

### Define the LLM Model

In [14]:
# Configure the Gemini client with the API key loaded in the import cell
genai.configure(api_key=gemini_api_key)

# Model handle used by ask_rag_with_gemini to generate answers
model = genai.GenerativeModel("gemini-1.5-flash")


### Define the RAG Functions

In [15]:
# Sentence-embedding model used to encode incoming queries
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Stack the per-row embedding vectors into one 2-D array (one row per document)
# NOTE(review): assumes the stored embeddings were produced by the same model as
# `embedder`, so query and document vectors are comparable — confirm
document_embeddings = np.vstack(df_prepared_data["embeddings"].to_numpy())

def retrieve_context(query, top_k=3):
    """Return the `rag_summary` texts most similar to `query`.

    The query is encoded with `embedder` and compared against every row of
    `document_embeddings` by cosine similarity.

    Args:
        query: Free-text question to search for.
        top_k: Maximum number of documents to return. Values larger than the
            number of documents are clamped; zero or negative values yield
            an empty list.

    Returns:
        A list of summary strings ordered from most to least similar.
    """
    # Clamp first: a raw slice with top_k == 0 ([-0:]) would select *every*
    # document, and a negative top_k would select an arbitrary tail.
    top_k = min(top_k, len(document_embeddings))
    if top_k <= 0:
        return []

    query_embedding = embedder.encode([query])
    similarities = cosine_similarity(query_embedding, document_embeddings)[0]
    # argsort is ascending, so reverse it and keep the first top_k positions
    top_indices = similarities.argsort()[::-1][:top_k]
    return df_prepared_data['rag_summary'].iloc[top_indices].tolist()

def build_prompt(contexts, query):
    """Assemble the LLM prompt from retrieved context snippets and the question.

    Args:
        contexts: Iterable of context strings; each is rendered as a "- " bullet.
        query: The user's question.

    Returns:
        The full prompt text, ending with "Answer:" for the model to complete.
    """
    bullet_lines = [f"- {ctx}" for ctx in contexts]
    sections = [
        "You are a helpful assistant. Use the following context to answer the question.",
        "",
        "Context:",
        "\n".join(bullet_lines),
        "",
        f"Question: {query}",
        "Answer:",
    ]
    return "\n".join(sections)

def ask_rag_with_gemini(query):
    """Answer `query` with Gemini, grounded in retrieved context.

    Retrieves the most relevant summaries via `retrieve_context`, wraps them
    with the question using `build_prompt`, and sends the prompt to `model`.

    Args:
        query: The user's question.

    Returns:
        The model's response text with surrounding whitespace removed.
    """
    grounded_prompt = build_prompt(retrieve_context(query), query)
    reply = model.generate_content(grounded_prompt)
    return reply.text.strip()


### Prompt the RAG

In [16]:
# Example query: retrieve matching context, build the prompt, and print Gemini's answer
question = "What emission-related news was published about germany? give me the link as well and tell me when it is published"
answer = ask_rag_with_gemini(question)
print(answer)



There is no emission-related news specifically about Germany in the provided context.  The news from Germany concerns the automotive e-drive market, but doesn't focus on emissions.

The link is: https://www.openpr.com/news/3944732/e-drive-for-automotive-market-exclusive-report-on-the-latest

It was published at: 2025-03-28 10:15:37
