This notebook contains the code to ingest and update a RAG corpus and to implement the confidence-score functionality.

In [None]:
# Install required dependencies
%pip install --upgrade --quiet google-cloud-aiplatform google-genai

# Set up authentication
from google.cloud import aiplatform
from google.oauth2 import service_account
import os

# Initialize Vertex AI
# Replace "<>" with your Google Cloud project ID. While the placeholder is left
# untouched (or blank), the GOOGLE_CLOUD_PROJECT environment variable is used
# when it is set, so the notebook can run without editing this cell.
PROJECT_ID = "<>"
if PROJECT_ID in ("", "<>"):
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", PROJECT_ID)
REGION = "us-central1"

# Register the project and location with the SDK before any RAG / model call.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Locations whose files are ingested into the corpus. Entries can be a GCS
# folder (e.g. "gs://my_bucket/news_2025_07/") and/or a Google Drive folder URL
# (e.g. "https://drive.google.com/drive/folders/1abcDEFgHIj").
# Leave the list empty to skip the import step.
RAW_PATHS = ["https://drive.google.com/drive/folders/1NHNSkg_TVA99IpqfyOlHdxzCQtXg-ag9?usp=sharing"]



In [None]:
from google.genai.types import GenerateContentConfig, Retrieval, Tool, VertexRagStore
from vertexai import rag

In [None]:
# Embedding model used to index the corpus (Google first-party models only for now).
EMBEDDING_MODEL = "publishers/google/models/text-embedding-005"  # @param {type:"string", isTemplate: true}

# Assemble the backend configuration from the inside out, then create the corpus.
# NOTE(review): re-running this cell presumably creates another corpus with the
# same display name rather than reusing the existing one — confirm before re-running.
embedding_endpoint = rag.VertexPredictionEndpoint(publisher_model=EMBEDDING_MODEL)
embedding_model_config = rag.RagEmbeddingModelConfig(
    vertex_prediction_endpoint=embedding_endpoint
)
vector_db_config = rag.RagVectorDbConfig(
    rag_embedding_model_config=embedding_model_config
)
rag_corpus = rag.create_corpus(
    display_name="my-trufeed-corpus",
    backend_config=vector_db_config,
)


In [None]:
if RAW_PATHS:
    # Ingest every source listed in RAW_PATHS into the corpus, chunking documents
    # with chunk_size=512 and chunk_overlap=100.
    resp = rag.import_files(
        rag_corpus.name,                  # corpus created earlier in the notebook
        RAW_PATHS,
        transformation_config=rag.TransformationConfig(
            chunking_config=rag.ChunkingConfig(
                chunk_size=512,
                chunk_overlap=100,
            )
        ),
        max_embedding_requests_per_min=1_000,  # throttle if very large
    )
    op = resp  # alias retained so any code that still refers to `op` keeps working

    # Only claim success when the service reports no failed files; the counts
    # are read defensively in case the response object lacks them.
    imported_count = getattr(resp, "imported_rag_files_count", None)
    failed_count = getattr(resp, "failed_rag_files_count", 0)
    if failed_count:
        print(f"⚠️  Import finished with failures – imported: {imported_count}, failed: {failed_count}")
    else:
        print("📚 Import complete!  ✅")
        if imported_count is not None:
            print(f"Imported files: {imported_count}")
else:
    print("➡️  RAW_PATHS empty – skipping import")

📚 Import complete!  ✅


Testing RAG Submission

In [None]:
# Smoke-test retrieval against the corpus with a sample question.
corpus_resource = rag.RagResource(
    rag_corpus=rag_corpus.name,
    # Optionally restrict to specific files using IDs from rag.list_files():
    # rag_file_ids=["rag-file-1", "rag-file-2", ...],
)

# Both settings are optional: the number of contexts requested and the
# vector-distance threshold used to filter them.
distance_filter = rag.Filter(vector_distance_threshold=0.5)
query_config = rag.RagRetrievalConfig(top_k=10, filter=distance_filter)

response = rag.retrieval_query(
    text="What is principles and why it is helpful?",
    rag_resources=[corpus_resource],
    rag_retrieval_config=query_config,
)
print(response)

contexts {
  contexts {
    source_uri: "https://drive.google.com/file/d/110f6_vAaXZeionxWeoTTMYb4Py1Mwdqv/view?usp=drivesdk"
    text: "IFCN CODE OF PRINCIPLES  (extract)\r\n\r\n1. Non-partisanship & Fairness  \r\n2. Transparency of Sources  \r\n3. Transparency of Funding & Organisation  \r\n4. Transparency of Methodology (publish the fact-check process)  \r\n5. Open & Honest Corrections"
    source_display_name: "ifcn_code.txt"
    score: 0.46018200972185153
    chunk {
      text: "IFCN CODE OF PRINCIPLES  (extract)\r\n\r\n1. Non-partisanship & Fairness  \r\n2. Transparency of Sources  \r\n3. Transparency of Funding & Organisation  \r\n4. Transparency of Methodology (publish the fact-check process)  \r\n5. Open & Honest Corrections"
    }
  }
  contexts {
    source_uri: "https://drive.google.com/file/d/1TjWFtIDKoPsgZ0LxV8D2CWNimewULSaq/view?usp=drivesdk"
    text: "REUTERS TRUST PRINCIPLES  (abridged)\r\n\r\n1. Accuracy • Every story is fact-checked; errors corrected promptly and t

In [None]:
from vertexai.generative_models import GenerativeModel, Tool

# Retrieval settings for the grounding tool. Defined in this cell so it runs on
# a fresh kernel; the values match the retrieval test query (top_k=10,
# vector_distance_threshold=0.5).
retrieval_cfg = rag.RagRetrievalConfig(
    top_k=10,
    filter=rag.Filter(vector_distance_threshold=0.5),
)

# Wrap the RAG corpus as a retrieval tool that the model can use for grounding.
retrieval_tool = Tool.from_retrieval(
    retrieval=rag.Retrieval(
        source=rag.VertexRagStore(
            rag_resources=[rag.RagResource(rag_corpus=rag_corpus.name)],
            rag_retrieval_config=retrieval_cfg,
        )
    )
)

gemini = GenerativeModel(model_name="gemini-2.0-flash", tools=[retrieval_tool])

# Prompt asking the model for a 0-100 confidence score plus reasoning, as JSON.
# NOTE(review): this is a plain (non-f) string, so the `{cluster_data}` and
# `{retrieved_documents}` placeholders below are sent to the model verbatim.
# Substitute real cluster data before relying on the score; use str.replace
# rather than str.format, because the JSON example contains literal braces.
confidence_prompt = """
> You are an AI assistant trained in fact verification and media integrity evaluation. You will receive a **cluster of user-submitted event data** — including images, text summaries, locations, and metadata — that has been aggregated from social media or public sources.
>
> Your job is to:
>
> 1. **Compare this data** with the knowledge provided from authoritative sources (retrieved below).
> 2. **Apply core journalistic principles** such as: corroboration, source triangulation, factual consistency, bias detection, and plausibility.
> 3. **Assign a confidence score (0 to 100)** that reflects how likely it is that the reported event is **true and factual**.
>
> ### Use these scoring tiers:
>
> * **90–100**: Strong alignment with multiple sources, highly factual.
> * **70–89**: Partial alignment, but missing full confirmation or lacking some detail.
> * **50–69**: Possible but unconfirmed; unclear or speculative.
> * **30–49**: Conflicts with verified sources or contains questionable claims.
> * **0–29**: Very likely to be false, manipulated, or entirely unsubstantiated.

---

###  Evaluate Based on:

* Does the location/time match verified records?
* Are similar reports found in news, civic data, or government feeds?
* Does the tone resemble typical real-world incident reporting?
* Are there red flags like exaggerated sentiment, inconsistency, or clickbait language?

---

###  Input:

**Clustered Event Data**:

```
{cluster_data}
```

**Retrieved Source Data (RAG)**:

```
{retrieved_documents}
```

---

###  Output Format:

```json
{
  "confidence_score": 87,
  "reasoning": "The event aligns with 3 independent sources including local news reports and official alerts. The tone is consistent with civic incident reporting, and the location matches verified maps."
}
```

"""

response = gemini.generate_content(confidence_prompt)
print(response.text)

In [None]:
def rag_ask(question: str, top_k: int = 4, model: str = "gemini-2.0-flash") -> str:
    """Answer ``question`` with a Gemini model that uses the RAG corpus as a tool.

    A retrieval tool is built over the notebook-level ``rag_corpus`` and attached
    to a new ``GenerativeModel``; the question is then sent as the only content.

    Args:
        question: Prompt text to send to the model.
        top_k: Value passed to ``rag.RagRetrievalConfig(top_k=...)``. ``None`` is
            forwarded unchanged.
        model: Model name passed to ``GenerativeModel``.

    Returns:
        The ``text`` of the model response with surrounding whitespace stripped.

    Raises:
        ValueError: If ``question`` is an empty or whitespace-only string, or if
            ``top_k`` is an integer smaller than 1.
        TypeError: If ``top_k`` is neither ``None`` nor an integer (for example a
            set or a string passed by mistake as the second positional argument).
    """
    import numbers  # local import: keeps this cell self-contained

    # Validate before building any SDK object, so mistakes surface with a clear
    # message. Non-string content is passed through untouched.
    if isinstance(question, str) and not question.strip():
        raise ValueError("question must not be empty")
    if top_k is not None:
        if isinstance(top_k, bool) or not isinstance(top_k, numbers.Integral):
            raise TypeError(
                f"top_k must be an integer, got {type(top_k).__name__}"
            )
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")

    cfg = rag.RagRetrievalConfig(top_k=top_k)
    tool = Tool.from_retrieval(
        retrieval=rag.Retrieval(
            source=rag.VertexRagStore(
                rag_resources=[rag.RagResource(rag_corpus=rag_corpus.name)],
                rag_retrieval_config=cfg,
            )
        )
    )
    llm = GenerativeModel(model_name=model, tools=[tool])
    return llm.generate_content(question).text.strip()

# Ask the grounded model about the cluster under review. The second positional
# parameter of rag_ask is `top_k`, so the cluster content is sent as the question
# text rather than as a second argument.
# NOTE(review): `cluster_content` is not assigned in any cell visible here —
# define it before running this cell.
print(rag_ask(str(cluster_content)))