# basic function

In [10]:
!pip install python-dotenv google-genai

Collecting google-genai
  Downloading google_genai-1.57.0-py3-none-any.whl.metadata (53 kB)
Collecting tenacity<9.2.0,>=8.2.3 (from google-genai)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting websockets<15.1.0,>=13.0.0 (from google-genai)
  Downloading websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting distro<2,>=1.7.0 (from google-genai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Downloading google_genai-1.57.0-py3-none-any.whl (713 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m713.3/713.3 kB[0m [31m14.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading tenacity-9.1.2-py3-none-any.whl (28 kB)
Downloading websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (182 kB)
Installing collected packages: websockets, tenacity, 

In [7]:
import os
from dotenv import load_dotenv
# Load settings via python-dotenv, then read the Gemini key from the environment.
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
# Truthiness (rather than `is not None`) also rejects an empty
# `GEMINI_API_KEY=` entry, which `is not None` would let through.
assert api_key, "GEMINI_API_KEY is not set; add it to .env or export it before running this notebook"

In [13]:
from google import genai
# Pass the key validated in the previous cell explicitly (as the Iteration 2
# setup does) instead of relying on the SDK's implicit environment lookup.
client = genai.Client(api_key=api_key)

In [14]:
# Demo input for extract_references(). The passage mixes several citation
# styles: a journal article with a DOI, a tweet, a news URL, a Reddit thread,
# an arXiv ID, a documentation site, a Substack post and a government page.
# NOTE(review): the references look like hand-written sample data - confirm
# before treating any of them as real sources.
SAMPLE_TEXT = """
The impact of artificial intelligence on scientific research has been profound. According to a study 
published in Nature (Smith et al., 2023, "AI-Driven Discovery in Materials Science", 
DOI: 10.1038/s41586-023-06221-2), machine learning models have accelerated materials discovery by 50x.

This trend is also reflected in social media discussions. A viral Twitter thread by @AIResearcher 
(https://twitter.com/AIResearcher/status/1234567890) garnered over 50,000 likes discussing how 
GPT-4 is being used in drug discovery pipelines.

The New York Times reported on this phenomenon in their article "The AI Revolution in Labs" 
(https://www.nytimes.com/2023/08/15/science/ai-laboratory-research.html), noting that major 
pharmaceutical companies are investing billions.

A Reddit discussion on r/MachineLearning (https://reddit.com/r/MachineLearning/comments/abc123) 
with 2,400 upvotes debated the reproducibility concerns raised in the original Nature paper.

For a technical deep-dive, the arXiv preprint "Transformer Architectures for Scientific Computing" 
(arXiv:2401.12345) provides implementation details. The official documentation at 
https://docs.example-ai-tool.com/guide has step-by-step tutorials.

Personal blog post by Dr. Jane Chen (https://janechenml.substack.com/p/ai-in-science-2024) 
offers a practitioner's perspective with 15,000 views.

The CDC guidelines (https://www.cdc.gov/ai-health/guidelines.html) now include recommendations 
for AI-assisted diagnostics.
"""

In [15]:
# Prompt template for the first extraction pass. `{text}` is the only
# placeholder; extract_references() fills it with str.format(). The template
# asks the model for a bare JSON array with six fields per reference.
EXTRACTION_PROMPT = """
You are a reference extraction system. Analyze the following text and extract ALL references, citations, and sources mentioned.

For each reference found, provide:
1. raw_text: The exact text/citation as it appears
2. type: One of: academic_journal, academic_preprint, news_media, government_official, social_media, personal_blog, documentation, unknown
3. url: The URL if present (null if not)
4. doi: The DOI if present (null if not)
5. platform: The platform/source (e.g., "Nature", "Twitter", "arXiv", "Reddit", "NYTimes", etc.)
6. engagement_signals: Any mentioned metrics like views, likes, upvotes (null if not mentioned)

Return the results as a JSON array. Be thorough - extract every reference, even implicit ones.

TEXT TO ANALYZE:
---
{text}
---

Return ONLY valid JSON, no markdown formatting or explanation.
"""

In [16]:
import json

def extract_references(text: str) -> list[dict]:
    """Extract references from ``text`` using Gemini.

    Fills ``EXTRACTION_PROMPT`` with ``text``, sends it to the
    ``gemini-2.0-flash-exp`` model through the module-level ``client`` and
    parses the reply as JSON.

    Args:
        text: The passage to scan for references.

    Returns:
        The parsed JSON payload (the prompt asks for an array of objects).
        An empty list when the model returns no text or the reply is not
        valid JSON; the problem and the raw reply are printed in that case.
    """
    prompt = EXTRACTION_PROMPT.format(text=text)

    response = client.models.generate_content(
        model="gemini-2.0-flash-exp",
        contents=prompt
    )

    # ``response.text`` is None when the reply carries no text part. Treat
    # that as "nothing extracted" instead of crashing on ``None.strip()``.
    raw_text = response.text
    if raw_text is None:
        print("Error parsing JSON: model returned no text")
        return []

    payload = raw_text.strip()
    # Remove a surrounding markdown code fence if the model added one despite
    # the prompt. The language tag is matched case-insensitively so that
    # "```JSON" is handled the same way as "```json".
    if payload.startswith("```"):
        payload = payload[3:]
        if payload[:4].lower() == "json":
            payload = payload[4:]
    if payload.endswith("```"):
        payload = payload[:-3]

    try:
        return json.loads(payload.strip())
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print(f"Raw response: {raw_text}")
        return []

In [17]:
# Run the first-pass extractor on the sample passage and show what came back.
references = extract_references(SAMPLE_TEXT)
print(
    f"Found {len(references)} references:\n",
    json.dumps(references, indent=2),
    sep="\n",
)

Found 8 references:

[
  {
    "raw_text": "Nature (Smith et al., 2023, \"AI-Driven Discovery in Materials Science\", DOI: 10.1038/s41586-023-06221-2)",
    "type": "academic_journal",
    "url": null,
    "doi": "10.1038/s41586-023-06221-2",
    "platform": "Nature",
    "engagement_signals": null
  },
  {
    "raw_text": "@AIResearcher (https://twitter.com/AIResearcher/status/1234567890)",
    "type": "social_media",
    "url": "https://twitter.com/AIResearcher/status/1234567890",
    "doi": null,
    "platform": "Twitter",
    "engagement_signals": "50,000 likes"
  },
  {
    "raw_text": "The New York Times reported on this phenomenon in their article \"The AI Revolution in Labs\" (https://www.nytimes.com/2023/08/15/science/ai-laboratory-research.html)",
    "type": "news_media",
    "url": "https://www.nytimes.com/2023/08/15/science/ai-laboratory-research.html",
    "doi": null,
    "platform": "NYTimes",
    "engagement_signals": null
  },
  {
    "raw_text": "Reddit discussion on

reflections:
- data structure encourages hallucination - have not defined web search as a tool, so engagement_signals can be hallucinated.
- may want to start with a digital heuristic - E-E-A-T (experience, expertise, authoritativeness, trustworthiness)
- will want to log tokens in, tokens out, latency, to enable evaluation.
- may want to start an examples folder with real outputs we've evaluated (to enable quality ratings) - or find another example elsewhere.

# Iteration 2
Get logging, improve prompt, provide outputs for UX elements

In [25]:
import os
import json
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from dotenv import load_dotenv
from google import genai

load_dotenv()

# Check the key before building the client so that a missing or empty value
# fails in this cell with an explicit message, like the check in the first
# section of the notebook.
api_key = os.getenv("GEMINI_API_KEY")
assert api_key, "GEMINI_API_KEY is not set; add it to .env or export it before running this notebook"

client = genai.Client(api_key=api_key)

@dataclass
class LLMLog:
    """Usage and timing record for a single LLM request."""

    # Timestamp string; the callers in this notebook pass datetime.now().isoformat().
    timestamp: str
    # Model identifier the request was sent to.
    model: str
    # Token counts copied from the response's usage metadata.
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    # Request duration in whole milliseconds.
    latency_ms: int

    def to_dict(self):
        """Return the record as a plain dict keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping

# Session-wide record of LLM calls. generate_with_logging() and
# verify_reference_with_search() each append one LLMLog per request.
# Re-running this cell rebinds the name to a fresh empty list.
llm_logs: list[LLMLog] = []

def generate_with_logging(prompt: str, model: str = "gemini-2.0-flash-exp") -> tuple[str, LLMLog]:
    """Generate content and log token usage + latency.

    Sends ``prompt`` to ``model`` through the module-level ``client``, times
    the call, and appends an ``LLMLog`` entry to ``llm_logs``.

    Args:
        prompt: The full prompt text to send.
        model: Model identifier passed to ``generate_content``.

    Returns:
        ``(text, log)``. ``text`` is the response text, or ``""`` when the
        response carries no text, so the declared ``str`` return type always
        holds. ``log`` is the entry that was appended to ``llm_logs``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by a system clock adjustment during the request (time.time() can be).
    start_time = time.perf_counter()

    response = client.models.generate_content(
        model=model,
        contents=prompt
    )

    latency_ms = int((time.perf_counter() - start_time) * 1000)

    # The usage metadata and each of its counts are optional in the SDK's
    # response model. Record 0 for anything missing so the log keeps integer
    # fields instead of None.
    usage = response.usage_metadata
    log = LLMLog(
        timestamp=datetime.now().isoformat(),
        model=model,
        prompt_tokens=getattr(usage, "prompt_token_count", None) or 0,
        completion_tokens=getattr(usage, "candidates_token_count", None) or 0,
        total_tokens=getattr(usage, "total_token_count", None) or 0,
        latency_ms=latency_ms
    )

    llm_logs.append(log)

    return response.text or "", log

# Cell-completion markers, one per line.
print("✅ generate_with_logging defined", "✅ Setup complete", sep="\n")

✅ generate_with_logging defined
✅ Setup complete


In [20]:

# Read the example question/answer pair used as the Iteration 2 test input.
# The encoding is given explicitly so the text decodes the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes both files are UTF-8 - confirm.
with open('example_question.txt', mode='r', encoding='utf-8') as file:
    EXAMPLE_QUESTION = file.read()

with open('example_answer.txt', mode='r', encoding='utf-8') as file:
    EXAMPLE_ANSWER = file.read()

In [21]:
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum

class SourceType(str, Enum):
    """Category labels for an extracted reference.

    The values are the same strings that EXTRACTION_PROMPT_V2 lists for its
    ``source_type`` field. The ``str`` mixin makes every member a ``str``
    instance as well as an enum member.
    """
    ACADEMIC_JOURNAL = "academic_journal"
    ACADEMIC_PREPRINT = "academic_preprint"
    NEWS_MEDIA = "news_media"
    GOVERNMENT = "government"
    SOCIAL_MEDIA = "social_media"
    PERSONAL_BLOG = "personal_blog"
    DOCUMENTATION = "documentation"
    COMMERCIAL = "commercial"
    # Default category used by ExtractedReference when none is given.
    UNKNOWN = "unknown"

@dataclass
class ExtractedReference:
    """Stage 1 output - extracted from text only, no inference.

    The field names match the keys requested by EXTRACTION_PROMPT_V2.
    NOTE(review): extract_references_v2() currently returns plain dicts, not
    instances of this class.
    """
    # The citation exactly as it appears in the analysed text (required).
    raw_text: str
    # Identifiers; each stays None unless present in the text.
    url: Optional[str] = None
    doi: Optional[str] = None
    arxiv_id: Optional[str] = None
    # Category of the source; defaults to UNKNOWN.
    source_type: SourceType = SourceType.UNKNOWN
    # Specific platform or publication name.
    platform_name: Optional[str] = None
    # Only what's explicitly in text
    stated_author: Optional[str] = None
    stated_date: Optional[str] = None
    stated_metrics: Optional[str] = None  # e.g., "2,400 upvotes" as string

@dataclass 
class CRAAPScore:
    """Stage 2 output - requires verification.

    One dimension of a CRAAP evaluation: a score plus its justification.
    The 1-5 range is a convention only; nothing in this class enforces it.
    """
    score: int  # 1-5
    evidence: str  # Why this score
    
@dataclass
class VerifiedReference:
    """Stage 2 output - after search/verification.

    Pairs an extracted reference with one optional CRAAPScore per dimension.
    The fields after ``reference`` use the same names as the keys requested
    by CRAAP_VERIFICATION_PROMPT.
    NOTE(review): verify_reference_with_search() currently returns a plain
    dict, not an instance of this class.
    """
    # The Stage 1 reference that was evaluated (required).
    reference: ExtractedReference
    # Whether the URL could be reached; None by default.
    url_accessible: Optional[bool] = None
    # One score per CRAAP dimension; None by default.
    currency: Optional[CRAAPScore] = None
    relevance: Optional[CRAAPScore] = None
    authority: Optional[CRAAPScore] = None
    accuracy: Optional[CRAAPScore] = None
    purpose: Optional[CRAAPScore] = None
    # Aggregate score; the verification prompt asks for a value in 1.0-5.0.
    overall_score: Optional[float] = None
    # Free-text findings from verification.
    verification_notes: str = ""

@dataclass
class AnalysisInput:
    """Full context for analysis.

    Bundles the user's question, the answer to analyse, and optional
    context. extract_references_v2() reads all three fields.
    """
    # The user's original question.
    question: str
    # The answer text from which references are extracted.
    answer: str
    # default_factory gives every instance its own dict rather than a shared one.
    context: dict = field(default_factory=dict)  # e.g., {"user_location": "AU", "domain": "healthcare"}

# Progress marker: shows that the definitions in this cell were executed.
print("✅ Data structures defined")

✅ Data structures defined


In [22]:
# Second-iteration extraction prompt. The placeholders {question}, {context}
# and {answer} are filled with str.format() in extract_references_v2().
# Unlike EXTRACTION_PROMPT, it passes the user's question and context, asks
# for arxiv_id / stated_author / stated_date fields, and tells the model to
# use null for anything not explicitly written.
EXTRACTION_PROMPT_V2 = """
You are a reference extraction system. Extract ALL references from an AI-generated answer.

CONTEXT:
- User's question: {question}
- User's context: {context}

INSTRUCTIONS:
Extract every reference, citation, URL, or source mentioned in the answer.
Only extract information EXPLICITLY stated. Use null for anything not directly written.

For each reference provide:
- raw_text: Exact citation as written
- url: URL if present, null otherwise
- doi: DOI if present, null otherwise  
- arxiv_id: arXiv ID if present, null otherwise
- source_type: One of: academic_journal, academic_preprint, news_media, government, social_media, personal_blog, documentation, commercial, unknown
- platform_name: Specific platform/publication name
- stated_author: Author name if explicitly written, null otherwise
- stated_date: Publication date/year if written, null otherwise
- stated_metrics: Any engagement metrics as written (e.g., "50k views"), null otherwise

ANSWER TO ANALYZE:
---
{answer}
---

Return ONLY a JSON array. Do not hallucinate - null for anything not explicit.
"""

def extract_references_v2(input_data: AnalysisInput) -> tuple[list[dict], LLMLog]:
    """Extract references with question context.

    Fills ``EXTRACTION_PROMPT_V2`` from ``input_data`` (the context dict is
    serialised with ``json.dumps``), calls ``generate_with_logging`` and
    parses the reply as JSON.

    Args:
        input_data: Question, answer and context to analyse.

    Returns:
        ``(references, log)``. ``references`` is the parsed JSON payload (the
        prompt asks for an array of objects), or ``[]`` when the reply is
        empty or not valid JSON; the error and raw reply are printed in that
        case. ``log`` is the usage record for the call.
    """
    prompt = EXTRACTION_PROMPT_V2.format(
        question=input_data.question,
        context=json.dumps(input_data.context),
        answer=input_data.answer
    )

    response_text, log = generate_with_logging(prompt)

    try:
        # A reply without text can arrive as None. Normalising it to "" routes
        # that case through the JSONDecodeError handler below instead of
        # raising AttributeError on ``None.strip()``.
        clean_text = (response_text or "").strip()
        # Remove a surrounding markdown code fence if present. The language
        # tag is matched case-insensitively ("```json" and "```JSON").
        if clean_text.startswith("```"):
            clean_text = clean_text[3:]
            if clean_text[:4].lower() == "json":
                clean_text = clean_text[4:]
        if clean_text.endswith("```"):
            clean_text = clean_text[:-3]

        return json.loads(clean_text.strip()), log
    except json.JSONDecodeError as e:
        print(f"Error: {e}\nRaw: {response_text}")
        return [], log

# Progress marker: shows that the definitions in this cell were executed.
print("✅ extract_references_v2 defined")

✅ extract_references_v2 defined


In [23]:
# Prompt template for scoring one reference on the five CRAAP dimensions
# (currency, relevance, authority, accuracy, purpose).
# Placeholders filled by verify_reference_with_search() via str.format():
# {question}, {context}, {reference_json}, {user_location}.
# The braces of the JSON skeleton are doubled ({{ }}) so that str.format()
# emits them as literal single braces.
CRAAP_VERIFICATION_PROMPT = """
You are a source verification system evaluating references for a specific user question.

USER QUESTION: {question}
USER CONTEXT: {context}

REFERENCE TO VERIFY:
{reference_json}

Using web search, evaluate this reference on the CRAAP framework. For each dimension, provide:
- score: 1-5 (1=poor, 5=excellent)
- evidence: Specific reasons for the score

EVALUATION CRITERIA:

**Currency (1-5):**
- When was this published/updated?
- Is the information current enough for the topic?
- Are there more recent sources available?

**Relevance (1-5):**
- Does this source address the user's actual question?
- Is it geographically relevant? (User is in {user_location})
- Is it the right depth/audience level?

**Authority (1-5):**
- Who is the author? What are their credentials?
- Is the publisher/platform reputable for this topic?
- Is there institutional backing?

**Accuracy (1-5):**
- Can the claims be verified elsewhere?
- Does the source cite its own references?
- Is it peer-reviewed or edited?

**Purpose (1-5):**
- Why does this source exist? (inform/persuade/sell/entertain)
- Is there obvious bias or conflict of interest?
- Is it trying to sell something?

Also check:
- url_accessible: Can the URL be reached? (true/false/null if no URL)

Return JSON:
{{
  "url_accessible": true/false/null,
  "currency": {{"score": 1-5, "evidence": "..."}},
  "relevance": {{"score": 1-5, "evidence": "..."}},
  "authority": {{"score": 1-5, "evidence": "..."}},
  "accuracy": {{"score": 1-5, "evidence": "..."}},
  "purpose": {{"score": 1-5, "evidence": "..."}},
  "overall_score": 1.0-5.0,
  "verification_notes": "Key findings from search"
}}

Return ONLY valid JSON.
"""

def verify_reference_with_search(
    reference: dict,
    question: str,
    context: dict,
    model: str = "gemini-2.0-flash-exp"
) -> tuple[dict, LLMLog]:
    """Verify a single reference using Gemini with Google Search.

    Fills ``CRAAP_VERIFICATION_PROMPT``, sends it with the Google Search tool
    enabled, appends an ``LLMLog`` entry to ``llm_logs`` and parses the reply
    as JSON.

    Args:
        reference: One extracted reference; serialised into the prompt.
        question: The user's original question.
        context: User context; ``context["user_location"]`` is inserted into
            the prompt, with "unknown" used when the key is absent.
        model: Model identifier passed to ``generate_content``.

    Returns:
        ``(verification, log)``. ``verification`` is the parsed JSON object,
        or ``{}`` when the reply has no text, is not valid JSON, or is valid
        JSON but not an object. The problem and the raw reply are printed in
        those cases. ``log`` is the usage record for the call.
    """
    prompt = CRAAP_VERIFICATION_PROMPT.format(
        question=question,
        context=json.dumps(context),
        reference_json=json.dumps(reference, indent=2),
        user_location=context.get("user_location", "unknown")
    )

    # Enable Google Search grounding
    from google.genai import types

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by a system clock adjustment during the request.
    start_time = time.perf_counter()

    response = client.models.generate_content(
        model=model,
        contents=prompt,
        config=types.GenerateContentConfig(
            tools=[types.Tool(google_search=types.GoogleSearch())]
        )
    )

    latency_ms = int((time.perf_counter() - start_time) * 1000)

    # The usage metadata and its counts are optional in the SDK's response
    # model. Record 0 for anything missing so the log keeps integer fields.
    usage = response.usage_metadata
    log = LLMLog(
        timestamp=datetime.now().isoformat(),
        model=model,
        prompt_tokens=getattr(usage, "prompt_token_count", None) or 0,
        completion_tokens=getattr(usage, "candidates_token_count", None) or 0,
        total_tokens=getattr(usage, "total_token_count", None) or 0,
        latency_ms=latency_ms
    )
    llm_logs.append(log)

    raw_text = response.text
    try:
        # A reply without text arrives as None. Normalising it to "" routes
        # that case through the JSONDecodeError handler instead of raising
        # AttributeError on ``None.strip()``.
        clean_text = (raw_text or "").strip()
        # Remove a surrounding markdown code fence if present. The language
        # tag is matched case-insensitively.
        if clean_text.startswith("```"):
            clean_text = clean_text[3:]
            if clean_text[:4].lower() == "json":
                clean_text = clean_text[4:]
        if clean_text.endswith("```"):
            clean_text = clean_text[:-3]

        parsed = json.loads(clean_text.strip())
    except json.JSONDecodeError as e:
        print(f"Error: {e}\nRaw: {raw_text}")
        return {}, log

    # Callers treat the result as a mapping (they call .get() on it), so a
    # JSON array or scalar is reported as a failure rather than returned.
    if not isinstance(parsed, dict):
        print(f"Error: expected a JSON object, got {type(parsed).__name__}\nRaw: {raw_text}")
        return {}, log

    return parsed, log

# Progress marker: shows that the definitions in this cell were executed.
print("✅ verify_reference_with_search defined")

✅ verify_reference_with_search defined


In [26]:
# Assemble the analysis input from the example files loaded earlier.
# Swap in your own question/answer/context to analyse a different case;
# "domain" could equally be e.g. "healthcare", "finance" or "technology".
test_input = AnalysisInput(
    EXAMPLE_QUESTION,
    EXAMPLE_ANSWER,
    dict(user_location="AU", domain="AI consulting"),
)

# Stage 1: extraction pass, followed by a usage summary and the raw result.
print("=== STAGE 1: EXTRACTION ===")
refs, extract_log = extract_references_v2(test_input)
print(
    f"Tokens: {extract_log.total_tokens} | Latency: {extract_log.latency_ms}ms",
    f"Found {len(refs)} references",
    json.dumps(refs, indent=2),
    sep="\n",
)

=== STAGE 1: EXTRACTION ===
Tokens: 5678 | Latency: 10661ms
Found 16 references
[
  {
    "raw_text": "iTnews",
    "url": "https://www.itnews.com.au/feature/how-australian-businesses-can-overcome-the-struggle-to-move-ai-from-hype-to-reality-618510",
    "doi": null,
    "arxiv_id": null,
    "source_type": "news_media",
    "platform_name": "iTnews",
    "stated_author": null,
    "stated_date": null,
    "stated_metrics": null
  },
  {
    "raw_text": "Substack",
    "url": "https://futureofprospecting.substack.com/p/the-ai-threat-that-could-break-salesforce",
    "doi": null,
    "arxiv_id": null,
    "source_type": "personal_blog",
    "platform_name": "Substack",
    "stated_author": null,
    "stated_date": null,
    "stated_metrics": null
  },
  {
    "raw_text": "The Conversation",
    "url": "https://theconversation.com/australian-businesses-have-actually-been-slow-to-adopt-ai-survey-finds-269812",
    "doi": null,
    "arxiv_id": null,
    "source_type": "news_media",
    "pl

In [28]:
print("=== STAGE 2: VERIFICATION ===")

# Cap on how many references are verified while testing; each one costs a
# separate search-enabled model call. Defined once so the slice and the
# progress counter cannot drift apart.
MAX_REFS_TO_VERIFY = 3
refs_to_verify = refs[:MAX_REFS_TO_VERIFY]

verified_refs = []
for i, ref in enumerate(refs_to_verify, start=1):
    # The extraction prompt tells the model to emit null for missing fields,
    # so the key can be present with a None value. `or` covers both the
    # missing-key and the null case.
    platform = ref.get('platform_name') or 'unknown'
    print(f"\nVerifying {i}/{len(refs_to_verify)}: {platform}")

    verification, verify_log = verify_reference_with_search(
        ref,
        test_input.question,
        test_input.context
    )

    print(f"  Tokens: {verify_log.total_tokens} | Latency: {verify_log.latency_ms}ms")

    # An empty dict means verification failed to parse; skip the summary.
    if verification:
        overall = verification.get('overall_score', 'N/A')
        print(f"  Overall CRAAP Score: {overall}/5")
        # Notes may come back as null or as a non-string value; coerce before
        # slicing so the preview line cannot raise.
        notes = str(verification.get('verification_notes') or '')
        print(f"  Notes: {notes[:100]}...")

    verified_refs.append({
        "reference": ref,
        "verification": verification
    })

print("\n=== FULL RESULTS ===")
print(json.dumps(verified_refs, indent=2))

=== STAGE 2: VERIFICATION ===

Verifying 1/3: iTnews
  Tokens: 1109 | Latency: 4178ms
  Overall CRAAP Score: 3.6/5
  Notes: The iTnews article seems generally relevant and authoritative, but the lack of a specific publicatio...

Verifying 2/3: Substack
  Tokens: 1124 | Latency: 4113ms
  Overall CRAAP Score: 2.2/5
  Notes: This Substack article discussing the AI threat to Salesforce is of limited value for researching the...

Verifying 3/3: The Conversation
  Tokens: 1018 | Latency: 3556ms
  Overall CRAAP Score: 4.0/5
  Notes: Key findings from search indicate that Australian businesses have been slow to adopt AI. This articl...

=== FULL RESULTS ===
[
  {
    "reference": {
      "raw_text": "iTnews",
      "url": "https://www.itnews.com.au/feature/how-australian-businesses-can-overcome-the-struggle-to-move-ai-from-hype-to-reality-618510",
      "doi": null,
      "arxiv_id": null,
      "source_type": "news_media",
      "platform_name": "iTnews",
      "stated_author": null,
      "s

quick check of logs

In [29]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m49.1 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m
[?25hDownloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m57.1 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Installing collected packages: pytz, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m

In [30]:
import pandas as pd
# Tabulate this session's usage records: one row per logged LLM call.
df_logs = pd.DataFrame(list(map(LLMLog.to_dict, llm_logs)))
df_logs

Unnamed: 0,timestamp,model,prompt_tokens,completion_tokens,total_tokens,latency_ms
0,2026-01-08T01:04:51.496310,gemini-2.0-flash-exp,3746,1932,5678,10661
1,2026-01-08T01:05:39.939271,gemini-2.0-flash-exp,651,458,1109,4178
2,2026-01-08T01:05:44.053101,gemini-2.0-flash-exp,627,497,1124,4113
3,2026-01-08T01:05:47.609839,gemini-2.0-flash-exp,636,382,1018,3556
