# Scoring with RAG

In [1]:
import os
import json
import ocrmypdf
import re
import json
import chromadb  # Vector Database
from tqdm import tqdm
import logging
import requests
import time
import pandas as pd
import numpy as np
import torch 
from torch import nn
from torch.optim import AdamW  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd 
from io import BytesIO
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, pipeline, BertTokenizer, BertModel, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy import displacy
torch.set_default_device("cpu")
import random
import json
# from google import genai
# from openai import OpenAI
import openai
from fuzzywuzzy import fuzz 
google_api_key = "AIzaSyCutzQsZEOJUQgHwcvjtPNiLFbgyxOfmko"
API_KEY = "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"

  from .autonotebook import tqdm as notebook_tqdm


# RAG testing 

### Vector database
When you store documents in ChromaDB using collection.add(), it:

1. Generates vector embeddings for your text (if you haven't provided them).
2. Stores the document along with its embedding in the vector database.
3. Matches queries based on similarity search (cosine similarity by default).


In [2]:
df = pd.read_csv("../files/labeled_pdfs_1003.csv")

In [3]:
client = chromadb.PersistentClient(path="./chromadb_1003")  # Stores DB in ./chroma_db
collection = client.get_or_create_collection(name="dsa3101")
logging.basicConfig(level=logging.WARNING)

for index, row in tqdm(df.iterrows(), total=len(df), desc="Adding documents", unit="document", leave=True, ncols=100):
    doc_text = row["esg_text"]  
    doc_company = row["company"]  
    doc_year = row["year"]  
    doc_industry = row["industry"]
    doc_id = f"doc_{index}"  

    collection.add(
        ids=[doc_id], 
        documents=[doc_text],  
        metadatas=[{"company": doc_company, "year": doc_year}] 
    )

Adding documents: 100%|███████████████████████████████| 63903/63903 [2:04:47<00:00,  8.53document/s]


In [41]:
print(collection.count())

63903


## Testing the Retrieval
Method: Semantic similarity to compare embeddings of the query to the sentences, and retrieve the top sentences with the highest similarity scores.
Limitations: Different words may have different meanings under different contexts. 

In [5]:
client = chromadb.PersistentClient(path="./chromadb_1003")
collection = client.get_or_create_collection(name="dsa3101")

In [7]:
query = "Retrieve percentage of reduction in Greenhouse gas emissions during the reporting year in the company. This can be in a) Total reduction, b) Scope 1 reduction and c) Scope 2 reduction"
results = collection.query(
    query_texts=[query],
    where={"company": "Apple"},
    n_results=5
)

In [8]:
results

{'ids': [['doc_51558', 'doc_51413', 'doc_51407', 'doc_51406', 'doc_51420']],
 'embeddings': None,
 'documents': [['—> Continue reading on page 13  Reduced overall  emissions by 40%  In fiscal year 2021, our environmental  initiatives avoided over 23 million metric  tons of emissions across all scopes, and  we reduced our carbon footprint by  40 percent compared with fiscal year  2015.',
   'Without the methodology  change, these emissions would have increased by 14 percent, which reflects  the growth in our business.',
   'In fiscal year 2017, we started calculating scope 3 emissions not listed in  this table.',
   "Beginning in FY2021, we're accounting for scope 2 emissions from the  purchase of district heating, chilled water, and steam.",
   'When using the  same level of data granularity and model as 2021, our product use carbon  emissions in 2021 would have been about 2.5 percent lower.']],
 'uris': None,
 'data': None,
 'metadatas': [[{'company': 'Apple', 'year': 2022.0},
   {'co

## Testing the Generator
Use DeepSeek API 

In [10]:
# client = OpenAI(
#     base_url="https://openrouter.ai/api/v1",
#     api_key=API_KEY
# )

client = openai.ChatCompletion 
reranker_model_name = "BAAI/bge-reranker-base"
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)
reranker_model = AutoModelForSequenceClassification.from_pretrained(reranker_model_name)
reranker_model.eval()  

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [11]:
def generate_response(query, reranked_docs):
    """Retrieve context from ChromaDB and generate an answer using DeepSeek."""
    
    context = "\n\n".join(reranked_docs)

    prompt = f"""You are an expert in ESG analysis. Please reason through step by step and then provide the final answer to the query. 
    Please verify your answer against the context provided, and rewrite the answer if inconsistent. Below is a question and relevant retrieved documents.
    
    Question: {query}

    Context:
    {context}

    Please provide a factually accurate response. If a fact is used from a document, include '(ChunkID)' next to it.
    """

    retries = 3
    while retries > 0:
        try:
            completion = client.chat.completions.create(
                model="deepseek/deepseek-r1-zero:free",
                messages=[
                    {"role": "system", "content": "You are an expert in ESG analysis. Answer factually and ensure consistency with the provided context, especially focusing on environemntal, sustainability and governance principles."},
                    {"role": "user", "content": prompt}
                ]
            )

            if completion and completion.choices and completion.choices[0].message:
                return completion.choices[0].message.content  # Return model's response

            print("Warning: Empty response. Retrying...")
            retries -= 1
            time.sleep(5)

        except Exception as e:
            print(f"Error: {e}. Retrying...")
            retries -= 1
            time.sleep(5)

    return "Error: Unable to generate a response."

def rerank_documents(query, retrieved_docs):
    """
    Reranks the retrieved documents based on relevance scores using the BAAI/bge-reranker-base model.

    Args:
        query (str): The search query.
        retrieved_docs (list): A list of retrieved document texts.

    Returns:
        list: The reranked documents sorted by relevance.
    """
    if not retrieved_docs:
        return []

    # Tokenize inputs
    inputs = reranker_tokenizer(
        [query] + retrieved_docs,  
        padding=True, truncation=True, return_tensors="pt"
    )

    # Compute relevance scores
    with torch.no_grad():
        scores = reranker_model(**inputs).logits.squeeze().tolist()

    # Sort retrieved docs by relevance score (descending order)
    reranked_docs = [doc for _, doc in sorted(zip(scores[1:], retrieved_docs), reverse=True)]

    return reranked_docs

# Start of full RAG code with json

In [None]:
with open("esg_metrics.json", "r") as file:
    esg_metrics = json.load(file)

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chromadb_1003")
collection = client.get_or_create_collection(name="dsa3101")

# Initialize empty DataFrame
df_columns = ["Company"] + list(esg_metrics.keys())  # One column per ESG metric
df_metrics = pd.DataFrame(columns=df_columns)

# Function to retrieve relevant ESG text using ChromaDB
def retrieve_esg_text(company, query):
    collection = chroma_client.get_collection(name="esgtext")
    results = collection.query(query_texts=[query], n_results=5)
    return results

# Function to rerank retrieved documents
def get_reranked_docs(query, results):
    retrieved_docs = [doc for doc in results["documents"][0]]
    reranked_docs = rerank_documents(query, retrieved_docs)
    return reranked_docs
    
# Function to extract metric values using DeepSeek  
def extract_values(query, retrieved_text):
    reranked_docs = get_reranked_docs(query, results)
    response = generate_response(query, reranked_docs)
    return response["text"]  ######## Adjust based on DeepSeek output format

# Function to compute the score based on thresholds
def compute_linear_score(extracted_values, thresholds):
    ####
    pass

# List of companies
companies = ["Pfizer", "Apple", "Datadog"]  # Replace with actual company list

# Process each company
for company in companies:
    row_data = {"Company": company}

    for metric, details in esg_metrics.items():
        query = details["query"]
        scoring_thresholds = details["scoring_thresholds"]

        # Retrieve ESG text using ChromaDB
        retrieved_text = retrieve_esg_text(company, query)

        # Extract values using DeepSeek
        extracted_values = extract_values(query, retrieved_text)

        # Compute score
        score = compute_score(extracted_values, scoring_thresholds)

        # Store results
        row_data[metric] = {"extracted_values": extracted_values, "score": score}

    # Append to DataFrame
    df_metrics = df_metricsdf.append(row_data, ignore_index=True)

# Save DataFrame to CSV
df_metrics.to_csv("company_esg_scores.csv", index=False)


# Post processing

Checking for hallucination, irrelevance, bias 
In this assignment, I felt that biasness wasn't really a metric required, I think it would be good to add biasness if i extracted data from third party sources grading the company esg scores. I can then compare the third-party metrics and scoring to each company's esg reports, and check if there is biasness in terms of their ratings, towards a particular, company or industry, etc. Therefore, I just added the metric for future reference, but it is not required in this assignment.

### Hallucination detection (Faithfullness)

In [None]:
def normalize_text(text):
    """Normalize text by converting to lowercase and removing punctuation."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

def fuzzy_match(sentence, doc, threshold=80):
    """Check if sentence has a fuzzy match in the document."""
    return fuzz.partial_ratio(normalize_text(sentence), normalize_text(doc)) >= threshold

def verify_facts(response, reranked_docs, fuzzy_threshold=80):
    """Detect hallucinations by checking if sentences exist in retrieved docs using fuzzy matching."""
    missing_facts = []
    
    # Split response into sentences and check if they appear in any of the documents
    for sent in response.split(". "):
        found = any(fuzzy_match(sent, doc, fuzzy_threshold) for doc in reranked_docs)
        if not found:
            missing_facts.append(sent)

    if missing_facts:
        print("Warning: Some statements are not found in the retrieved context:")
        for fact in missing_facts:
            print(f"- {fact}")
    
    return 1 - len(missing_facts) / len(response.split(". "))  # Faithfulness Score

faithfulness_score = verify_facts(response, reranked_documents)
print(f"Faithfulness Score: {faithfulness_score}")

## Irrelevance Check

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def check_relevance(query, response, threshold=0.6):
    """
    Check the relevance of the response to the query using semantic similarity.
    """
    query_embedding = model.encode([query])
    response_embedding = model.encode([response])

    similarity = cosine_similarity(query_embedding, response_embedding)[0][0]

    if similarity >= threshold:
        return True, similarity  
    else:
        return False, similarity  

In [None]:
is_relevant = check_relevance(query, response)
print(f"Is the response relevant? {is_relevant}")

In [None]:
model_name = "BAAI/bge-reranker-base" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def check_bias(text):
    """
    Check for potential bias in the text using a pretrained model.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Assuming binary classification (0 = no bias, 1 = biased)
    predicted_class = torch.argmax(logits, dim=1).item()
    
    return predicted_class == 1  # 1 indicates bias (this depends on the model's labeling)

# Example Usage
response = "Pfizer has been focusing on improving diversity in their clinical trials and sharing their insights with others as part of their diversity and inclusion initiatives in 2022."

is_biased = check_bias(response)
print(f"Is the response biased? {is_biased}")

# Evaluation

## Retriever Evaluation
Typical metrics: RecalL@k, Precision @k, Mean Reciprocal Rank, Mean Average Precision

### Cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_retrieval_relevance(query, reranked_docs):
    """Compute semantic similarity between query and retrieved docs."""
    corpus = [query] + reranked_docs
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    similarity_scores = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])
    return similarity_scores.mean()  # Average similarity

In [None]:
compute_retrieval_relevance(query, reranked_documents)

## Generator Evaluation 
Typical metrics: ROUGE, BLEU, BERTScore, domain-specific or task-specific metrics

### BLEU Score (Text similarity)

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def compute_bleu_score(reference, generated_response):
    """Compare generated response against reference text using BLEU score."""
    reference_tokens = reference.lower().split()
    generated_tokens = generated_response.lower().split()
    return sentence_bleu([reference_tokens], generated_tokens)

In [None]:
compute_bleu_score(query, response)

### Retrieval score (relevance)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_retrieval_relevance(reranked_docs, response):
    """Calculate how relevant the response is to the retrieved documents."""
    corpus = reranked_docs + [response]  # Combine all docs and response
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    similarity_matrix = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
    return similarity_matrix.mean()  # Average similarity score


In [None]:
compute_retrieval_relevance(reranked_docs, response)

### Judge LM 

In [None]:
client = genai.Client(api_key=google_api_key)
reranked_docs_str = "\n".join(reranked_documents)

gemini_eval = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=f"""
                Evaluate how well the response answers the query, giving an explanation of how it answers the question, and whether the response is factually correct based on the context provided.
                I have added the query, response and retrieved context below.
                
                Query: 
                {query}
                
                Response:
                {response}
                
                Retrieved Context:
                {reranked_docs_str}
                
                Give a score from 0 to 10, and a detailed explanation on the score, where:
                - 10 = Perfectly accurate
                - 0 = Completely incorrect
                """
)

print(gemini_eval.text)

## LLM Hallucination Detection 1 (Log Probability + Fuzzy Matching)

### What is Log Probability?
Log probability (log-prob) measures how likely the model thinks a given word or phrase should appear in a sentence.  
A higher log probability means the model is more confident in its response.  
A lower log probability indicates the response might be hallucinated, meaning it is unlikely based on what the model has learned.  

### How does Log Probability improve Accuracy?
Fuzzy matching only checks if words exist in retrieved ESG documents.  
It cannot tell if the sentence is logically correct or if the model is confident about its answer.  
Log probability helps by measuring how confident the model is in generating the response.  

### Fixes:
1. generate_response function to include Log Probability  
2. Include a function (detect_hallucinations) which detects hallucinations by combining log probability and fuzzy matching  
3. Edited a function (extract_value) to include variables avg_log_prob and hallucination flag  
4. FIX: Use `pd.concat` instead of `.append()`, as `.append()` is deprecated when processing companies

In [None]:
with open("../files/Scoring_revised_120325.json", "r") as file:
    esg_metrics = json.load(file)

########## Modified Completion Function to Include Log Probability ##########
def generate_response(query, reranked_docs):
    """Retrieve context from ChromaDB and generate an answer using DeepSeek, also computing log probability."""
    
    context = "\n\n".join(reranked_docs)

    prompt = f"""You are an expert in ESG analysis. Please reason through step by step and then provide the final answer to the query. 
    Please verify your answer against the context provided, and rewrite the answer if inconsistent. Below is a question and relevant retrieved documents.
    
    Question: {query}

    Context:
    {context}

    Please provide a factually accurate response. If a fact is used from a document, include '(ChunkID)' next to it.
    """

    retries = 3
    while retries > 0:
        try:
            completion = client.ChatCompletion.create(
                model="deepseek/deepseek-r1-zero:free",
                messages=[
                    {"role": "system", "content": "You are an expert in ESG analysis. Answer factually and ensure consistency with the provided context, especially focusing on environmental, sustainability, and governance principles."},
                    {"role": "user", "content": prompt}
                ],
                logprobs=True  # Enables log probability computation
            )

            if completion and completion.choices and completion.choices[0].message:
                response_text = completion.choices[0].message.content  
                log_probs = completion.choices[0].logprobs.token_logprobs  # Extract log probabilities

                avg_log_prob = sum(log_probs) / len(log_probs) if log_probs else -float('inf')  # Compute average log probability
                return response_text, avg_log_prob  # Return both response and log probability

            print("Warning: Empty response. Retrying...")
            retries -= 1
            time.sleep(5)

        except Exception as e:
            print(f"Error: {e}. Retrying...")
            retries -= 1
            time.sleep(5)

    return "Error: Unable to generate a response.", -float('inf')  # Return a default low probability if failed

def rerank_documents(query, retrieved_docs):
    """
    Reranks the retrieved documents based on relevance scores using the BAAI/bge-reranker-base model.

    Args:
        query (str): The search query.
        retrieved_docs (list): A list of retrieved document texts.

    Returns:
        list: The reranked documents sorted by relevance.
    """
    if not retrieved_docs:
        return []

    # Tokenize inputs
    inputs = reranker_tokenizer(
        [query] + retrieved_docs,  
        padding=True, truncation=True, return_tensors="pt"
    )

    # Compute relevance scores
    with torch.no_grad():
        scores = reranker_model(**inputs).logits.squeeze().tolist()

    # Sort retrieved docs by relevance score (descending order)
    reranked_docs = [doc for _, doc in sorted(zip(scores[1:], retrieved_docs), reverse=True)]

    return reranked_docs

########## Improved Hallucination Detection ##########
def detect_hallucination(response, reranked_docs, avg_log_prob, log_prob_threshold=-2.0, fuzzy_threshold=80):
    """
    Detect hallucinations by combining log probability and fuzzy matching.
    
    Args:
        response (str): The generated response from the LLM.
        reranked_docs (list): The list of relevant ESG documents.
        avg_log_prob (float): The average log probability of the response.
        log_prob_threshold (float): Minimum log probability threshold for high-confidence outputs.
        fuzzy_threshold (int): Minimum similarity percentage for fuzzy matching.
    
    Returns:
        (bool, float): Hallucination flag (True if hallucinated), and Faithfulness Score.
    """

    # Compute Faithfulness Score using Fuzzy Matching
    faithfulness_score = verify_facts(response, reranked_docs, fuzzy_threshold)

    # Apply Log Probability Threshold
    log_prob_flag = avg_log_prob < log_prob_threshold  # True if log probability is too low

    # Determine if the response is hallucinated
    hallucination_flag = log_prob_flag or (faithfulness_score < 0.8)  # Hallucinated if either check fails

    if hallucination_flag:
        print(f"Warning: Possible hallucination detected! Log-Prob: {avg_log_prob:.2f}, Faithfulness Score: {faithfulness_score:.2f}")

    return hallucination_flag, faithfulness_score

########## Initialize ChromaDB Client Properly ##########
chroma_client = chromadb.PersistentClient(path="./chromadb_1003")
collection = chroma_client.get_or_create_collection(name="dsa3101")  # FIX: Use chroma_client instead of client

# Convert the list of dictionaries into a single dictionary
esg_metrics_dict = {list(entry.keys())[0]: entry[list(entry.keys())[0]] for entry in esg_metrics}

########## Initialize DataFrame ##########
df_columns = ["Company"] + list(esg_metrics_dict.keys())  
df_metrics = pd.DataFrame(columns = df_columns)


########## ESG Data Retrieval ##########
def retrieve_esg_text(company, query):
    collection = chroma_client.get_or_create_collection(name="dsa3101")
    results = collection.query(query_texts=[query], n_results=5)
    return results

def get_reranked_docs(query, results):
    retrieved_docs = [doc for doc in results["documents"][0]]
    reranked_docs = rerank_documents(query, retrieved_docs)
    return reranked_docs

########## Edited the function to include variables avg_log_prob and hallucination flag ##########
def extract_values(query, retrieved_text):
    reranked_docs = get_reranked_docs(query, retrieved_text)
    response_text, avg_log_prob = generate_response(query, reranked_docs)

    # Check for hallucinations using log probability + fuzzy matching
    hallucination_flag, faithfulness_score = detect_hallucination(response_text, reranked_docs, avg_log_prob)

    return {
        "text": response_text, 
        "log_prob": avg_log_prob, 
        "hallucination_flag": hallucination_flag,
        "faithfulness_score": faithfulness_score
    }

# Function to compute the score based on thresholds
def compute_score(extracted_values, thresholds):
    ####
    pass

def normalize_text(text):
    """Normalize text by converting to lowercase and removing punctuation."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

def fuzzy_match(sentence, doc, threshold=80):
    """Check if sentence has a fuzzy match in the document."""
    return fuzz.partial_ratio(normalize_text(sentence), normalize_text(doc)) >= threshold

def verify_facts(response, reranked_docs, fuzzy_threshold=80):
    """Detect hallucinations by checking if sentences exist in retrieved docs using fuzzy matching."""
    missing_facts = []
    
    # Split response into sentences and check if they appear in any of the documents
    for sent in response.split(". "):
        found = any(fuzzy_match(sent, doc, fuzzy_threshold) for doc in reranked_docs)
        if not found:
            missing_facts.append(sent)

    if missing_facts:
        print("Warning: Some statements are not found in the retrieved context:")
        for fact in missing_facts:
            print(f"- {fact}")
    
    return 1 - len(missing_facts) / len(response.split(". "))  # Faithfulness Score

# List of companies
companies = ["Pfizer", "Apple", "Datadog"]  # Replace with actual company list

########## Process each company (Added in additional variables) ##########
for company in companies:
    row_data = {"Company": company}

    for metric, details in esg_metrics_dict.items():
        query = details["query"]
        scoring_thresholds = details["scoring_query"]

        # Retrieve ESG text using ChromaDB
        retrieved_text = retrieve_esg_text(company, query)

        # Extract values using DeepSeek, now with hallucination detection
        extracted_data = extract_values(query, retrieved_text)

        # Compute score
        score = compute_score(extracted_data["text"], scoring_thresholds)

        # Store results
        row_data[metric] = {
            "extracted_values": extracted_data["text"], 
            "score": score, 
            "hallucination_flag": extracted_data["hallucination_flag"],
            "log_prob": extracted_data["log_prob"],
            "faithfulness_score": extracted_data["faithfulness_score"]
        }

    # FIX: Use `pd.concat` instead of `.append()`, as `.append()` is deprecated
    df_metrics = pd.concat([df_metrics, pd.DataFrame([row_data])], ignore_index=True)

# Save DataFrame to CSV
df_metrics.to_csv("company_esg_scores.csv", index=False)

## LLM Hallucination Detection 2 (Self-Check GPT Hallucination Detection)
The Self-Check GPT Hallucination Detection method is an advanced approach to identifying hallucinations in LLM-generated responses.  
This method uses another LLM (GPT) to verify the factual accuracy of the generated response against retrieved documents.

In [47]:
########## Function to Check for Hallucination Using GPT ##########
def check_hallucination(query, response_text, reranked_docs):
    """Uses GPT to verify the accuracy of the generated response against retrieved documents."""

    context = "\n\n".join(reranked_docs)

    validation_prompt = f"""
    You are an ESG data validation expert. Your task is to critically analyze the AI-generated response and determine whether it is **factually consistent** with the provided reference context.

    ### **Evaluation Criteria**:
    - The AI-generated response should only contain facts present in the provided context.
    - If the response introduces new information **not found in the context, flag it as a hallucination.
    - If the response distorts or misinterprets facts, flag it as misleading.
    - If the response lacks key details but remains mostly correct, classify it as **partially accurate**.
    - Assign a **confidence score** based on the accuracy of the response.

    ### Query (User's Question):
    {query}

    ### Context (Extracted ESG Documents):
    {context}

    ### AI-Generated Response:
    {response_text}

    ### Validation Instructions:
    1. Fact Check: Compare each claim in the AI-generated response with the provided context.
    2. Correct Errors: If any information is incorrect or missing, provide a revised version.
    3. Score Accuracy:
       - 1.0 → Fully accurate and supported by context.
       - 0.5 - 1.0 → Partially accurate (contains minor errors, omissions, or ambiguities).
       - 0.0 - 0.5 → Likely hallucinated (contains information not found in the context or misleading claims).

    ### Expected JSON Output Format:
    {{
        "corrected_response": "<Your revised response ensuring full factual accuracy>",
        "confidence_score": <A float value between 0.0 and 1.0 indicating confidence in factual correctness>
    }}
    """

    try:
        validation_completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an AI designed to validate factual accuracy in ESG reports."},
                {"role": "user", "content": validation_prompt}
            ]
        )

        validation_result = json.loads(validation_completion["choices"][0].message.content)
        return validation_result["corrected_response"], validation_result["confidence_score"]

    except Exception as e:
        print(f"Error during self-check GPT validation: {e}")
        return response_text, 0.0  # Assume hallucinated if an error occurs

## LLM Hallucination Detection Method 3 (RAGAS - Retrieval Augmented Generation Assessment Suite)

### What is RAGAS?
RAGAS is an LLM-powered evaluation framework designed for RAG (Retrieval-Augmented Generation) pipelines.  
It provides quantitative scores to assess whether an LLM-generated answer is:  
1. Faithful – How well the generated response aligns with retrieved context.  
2. Relevant – How well the response answers the user’s query based on semantic similarity.  

### How does it detect hallucinations?
It helps detect hallucinations by measuring:  
- If the response is grounded in retrieved documents (Faithfulness)  
- If the response answers the actual query (answer relevancy)

In [None]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy

def detect_hallucination_ragas(response, query, reranked_docs):
    """
    Detect hallucinations using RAGAS by evaluating:
    - Faithfulness: How well the response aligns with retrieved documents.
    - Answer Relevancy: How well the response answers the user query.

    Args:
        response (str): The generated response from LLM.
        query (str): The original user query.
        reranked_docs (list): Retrieved documents from ChromaDB.

    Returns:
        dict: A dictionary containing faithfulness score, relevancy score, and hallucination flag.
    """
    # Compute RAGAS Faithfulness Score
    faithfulness_score = faithfulness.score(response, reranked_docs)
    
    # Compute RAGAS Answer Relevancy Score
    relevancy_score = answer_relevancy.score(response, query)

    # Determine Hallucination Flag
    hallucination_flag = faithfulness_score < 0.8  # Adjust threshold as needed

    if hallucination_flag:
        print(f"⚠️ Warning: Possible hallucination detected! Faithfulness Score: {faithfulness_score:.2f}, Relevancy Score: {relevancy_score:.2f}")

    return {
        "faithfulness_score": faithfulness_score,
        "relevancy_score": relevancy_score,
        "hallucination_flag": hallucination_flag
    }

########## Modified extract_values() to use RAGAS ##########
def extract_values(query, retrieved_text):
    reranked_docs = get_reranked_docs(query, retrieved_text)
    response = generate_response(query, reranked_docs)

    # Detect hallucination using RAGAS instead of log probability & fuzzy matching
    hallucination_results = detect_hallucination_ragas(response, query, reranked_docs)

    return {
        "text": response,
        "faithfulness_score": hallucination_results["faithfulness_score"],
        "relevancy_score": hallucination_results["relevancy_score"],
        "hallucination_flag": hallucination_results["hallucination_flag"]
    }

for company in companies:
    row_data = {"Company": company}

    for metric, details in esg_metrics.items():
        query = details["query"]
        scoring_thresholds = details["scoring_thresholds"]

        # Retrieve ESG text using ChromaDB
        retrieved_text = retrieve_esg_text(company, query)

        # Extract values using DeepSeek & detect hallucination using RAGAS
        extracted_data = extract_values(query, retrieved_text)

        # Compute score
        score = compute_score(extracted_data["text"], scoring_thresholds)

        # Store results including RAGAS scores
        row_data[metric] = {
            "extracted_values": extracted_data["text"], 
            "score": score, 
            "hallucination_flag": extracted_data["hallucination_flag"],
            "faithfulness_score": extracted_data["faithfulness_score"],
            "relevancy_score": extracted_data["relevancy_score"]
        }

    # Append to DataFrame
    df_metrics = pd.concat([df_metrics, pd.DataFrame([row_data])], ignore_index=True)

# Save DataFrame to CSV
df_metrics.to_csv("company_esg_scores.csv", index=False)


# TEST


In [None]:
import json
import openai
import chromadb
import pandas as pd
import time
import re
from fuzzywuzzy import fuzz

with open("../files/Scoring_revised_120325.json", "r") as file:
    esg_metrics = json.load(file)


########## Function to Generate Response ##########
def generate_response(query, reranked_docs):
    """Retrieve context from ChromaDB and generate an answer using DeepSeek."""
    
    context = "\n\n".join(reranked_docs)

    prompt = f"""You are an expert in ESG analysis. Please reason through step by step and then provide the final answer to the query. 
    Please verify your answer against the context provided, and rewrite the answer if inconsistent. Below is a question and relevant retrieved documents.
    
    Question: {query}

    Context:
    {context}

    Please provide a factually accurate response. If a fact is used from a document, include '(ChunkID)' next to it.
    """

    retries = 3
    while retries > 0:
        try:
            completion = client.ChatCompletion.create(
                model="deepseek/deepseek-r1-zero:free",
                messages=[
                    {"role": "system", "content": "You are an expert in ESG analysis. Answer factually and ensure consistency with the provided context, especially focusing on environmental, sustainability, and governance principles."},
                    {"role": "user", "content": prompt}
                ]
            )

            if completion and completion.choices and completion.choices[0].message:
                response_text = completion.choices[0].message.content
                return response_text  

            print("Warning: Empty response. Retrying...")
            retries -= 1
            time.sleep(5)

        except Exception as e:
            print(f"Error: {e}. Retrying...")
            retries -= 1
            time.sleep(5)

    return "Error: Unable to generate a response."

########## Function to Check for Hallucination Using GPT ##########
def check_hallucination(query, response_text, reranked_docs):
    """Uses GPT to verify the accuracy of the generated response against retrieved documents."""

    context = "\n\n".join(reranked_docs)

    validation_prompt = f"""You are an ESG data validation expert. Your task is to verify the accuracy of the following AI-generated response against the provided context.

    - If the response is factually correct based on the context, return a confidence score of 1.0.
    - If the response contains minor inaccuracies, return a confidence score between 0.5 and 1.0.
    - If the response contains hallucinations (information not found in the context), return a confidence score between 0.0 and 0.5.

    ### Query:
    {query}

    ### Context (Extracted from ESG Documents):
    {context}

    ### AI-Generated Response:
    {response_text}

    ### Instruction:
    - Analyze whether the response is consistent with the context.
    - If the response is incorrect, provide a corrected version.
    - Provide a confidence score from 0.0 to 1.0 indicating how well the response aligns with the provided context.

    **Output format (JSON):**
    {{
        "corrected_response": "<your corrected response>",
        "confidence_score": <confidence_score>
    }}
    """
    retries = 3
    while retries > 0:
        try:
            validation_completion = client.ChatCompletion.create(
                model="deepseek/deepseek-r1-zero:free",
                messages=[
                    {"role": "system", "content": "You are an AI designed to validate factual accuracy in ESG reports."},
                    {"role": "user", "content": validation_prompt}
                ]
            )

            validation_result = json.loads(validation_completion["choices"][0]["message"]["content"])
            return validation_result["corrected_response"], validation_result["confidence_score"]

        except Exception as e:
            print(f"Error during self-check GPT validation: {e}")
            retries -= 1
            return response_text, 0.0  # Assume hallucinated if an error occurs

########## Initialize ChromaDB Client ##########
chroma_client = chromadb.PersistentClient(path="./chromadb_1003")


########## ESG Data Retrieval ##########
def retrieve_esg_text(company, query):
    collection = chroma_client.get_collection(name="dsa3101")  
    results = collection.query(query_texts=[query], n_results=5)
    return results

########## Extract ESG Metric Values ##########
def extract_values(query, retrieved_text):
    reranked_docs = [doc for doc in retrieved_text["documents"][0]]
    response_text = generate_response(query, reranked_docs)

    # Self-Check GPT Hallucination Detection
    corrected_response, confidence_score = check_hallucination(query, response_text, reranked_docs)

    return {
        "original_text": response_text,
        "corrected_text": corrected_response,
        "confidence_score": confidence_score
    }

########## Compute ESG Score ##########
def compute_score(extracted_values, thresholds):
    """Compute ESG Score based on extracted values and thresholds."""
    return extracted_values["confidence_score"] * 10  # Example scoring logic

########## Process ESG Data ##########
df_columns = ["Company"] + list(esg_metrics[0].keys())
df_metrics = pd.DataFrame(columns=df_columns)

companies = ["Pfizer", "Apple", "Datadog"]

for company in companies:
    row_data = {"Company": company}

    for metric_entry in esg_metrics:
        metric_name = list(metric_entry.keys())[0]
        details = metric_entry[metric_name]

        query = details["query"]
        scoring_thresholds = details["scoring_query"]

        # Retrieve ESG text using ChromaDB
        retrieved_text = retrieve_esg_text(company, query)

        # Extract values
        extracted_data = extract_values(query, retrieved_text)

        # Compute score
        score = compute_score(extracted_data, scoring_thresholds)

        # Store results
        row_data[metric_name] = {
            "original_text": extracted_data["original_text"],
            "corrected_text": extracted_data["corrected_text"],
            "confidence_score": extracted_data["confidence_score"],
            "score": score
        }

    # Use `pd.concat()` Instead of `.append()`
    df_metrics = pd.concat([df_metrics, pd.DataFrame([row_data])], ignore_index=True)

# Save DataFrame to CSV
df_metrics.to_csv("company_esg_scores.csv", index=False)
