# Scoring with RAG

In [None]:
import os
import ocrmypdf
from PyPDF2 import PdfReader
import re
import json
import fitz  # PyMuPDF for PDF extraction
import chromadb  # Vector Database
from tqdm import tqdm
import logging
import requests
import time
import pandas as pd
import numpy as np
import torch 
from torch import nn
from torch.optim import AdamW  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd 
from io import BytesIO
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, pipeline, BertTokenizer, BertModel, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy import displacy
### if u got strong gpu w cuda, should change to gpu, average laptop cpu takes too long *cough* mac book users
torch.set_default_device("cpu")

if torch.cuda.is_available():
    torch.set_default_device("cuda")
    print("running on cuda")
import random
import json
import google.generativeai as genai
google_api_key = ""
from openai import OpenAI
API_KEY = "" 
import openai
deepseek_api_key = ""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from googlesearch import search
from fuzzywuzzy import fuzz 
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# RAG testing 

In [46]:
df = pd.read_csv("../files/labeled_pdfs_1003.csv")

In [47]:
client = chromadb.PersistentClient(path="./chromadb_1003")  # Stores DB in ./chroma_db
collection = client.get_or_create_collection(name="dsa3101")
logging.basicConfig(level=logging.WARNING)

for index, row in tqdm(df.iterrows(), total=len(df), desc="Adding documents", unit="document", leave=True, ncols=100):
    doc_text = row["esg_text"]  
    doc_company = row["company"]  
    doc_year = row["year"]  
    doc_industry = row["industry"]
    doc_id = f"doc_{index}"  

    collection.add(
        ids=[doc_id], 
        documents=[doc_text],  
        metadatas=[{"company": doc_company, "year": doc_year}] 
    )

Adding documents:   0%|                                     | 8/63903 [00:00<58:53, 18.08document/s]


KeyboardInterrupt: 

In [None]:
print(collection.count())  

63903


In [None]:
results

{'ids': [['doc_51558', 'doc_51413', 'doc_51407', 'doc_51406', 'doc_51420']],
 'embeddings': None,
 'documents': [['—> Continue reading on page 13  Reduced overall  emissions by 40%  In fiscal year 2021, our environmental  initiatives avoided over 23 million metric  tons of emissions across all scopes, and  we reduced our carbon footprint by  40 percent compared with fiscal year  2015.',
   'Without the methodology  change, these emissions would have increased by 14 percent, which reflects  the growth in our business.',
   'In fiscal year 2017, we started calculating scope 3 emissions not listed in  this table.',
   "Beginning in FY2021, we're accounting for scope 2 emissions from the  purchase of district heating, chilled water, and steam.",
   'When using the  same level of data granularity and model as 2021, our product use carbon  emissions in 2021 would have been about 2.5 percent lower.']],
 'uris': None,
 'data': None,
 'metadatas': [[{'company': 'Apple', 'year': 2022.0},
   {'co

## Testing the Generator
Setting up LLM API 

In [127]:
import httpx
genai.configure(api_key=google_api_key)
llm_genai = genai.GenerativeModel('gemini-2.0-flash')

llm_openai = OpenAI(
    # This is the default and can be omitted
    api_key=API_KEY,
    http_client= httpx.Client()
)
#llm_deepseek = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")


reranker_model_name = "BAAI/bge-reranker-base"
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)
reranker_model = AutoModelForSequenceClassification.from_pretrained(reranker_model_name)
reranker_model.eval()  

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [128]:
def generate_genai_response(query, reranked_docs):

    """Retrieve context from ChromaDB and generate an answer using DeepSeek."""
    
    context = "\n\n".join(reranked_docs)

    prompt = f"""You are an expert in ESG analysis. Please reason through step by step and then provide the final answer to the query. 
    Please verify your answer against the context provided, and rewrite the answer if inconsistent. Below is a question and relevant retrieved documents.
    
    Question: {query}

    Context:
    {context + "End of Context"}

    Please provide a factually accurate response. If a fact is used from a document, include 'ids' next to it, like this. *information* ("doc_xxxxx")
    """
    retries = 3
    while retries > 0:
        try:
          completion = llm_genai.generate_content(prompt)

          if completion and completion.text:
                return {"text": completion.text.strip()}
          else:
                print("Empty completion received. Retrying...")
                retries -= 1
                time.sleep(2)
        except Exception as e:
            print(f"API Error encountered: {e}. Retrying after delay...")
            retries -= 1
            time.sleep(5)

    return "API Error: Unable to generate response after retries."

def generate_openai_response(query, reranked_docs):
    print("Chatgpttt")

    """Retrieve context from ChromaDB and generate an answer using DeepSeek."""
    
    context = "\n\n".join(reranked_docs)

    prompt = f"""You are an expert in ESG analysis. Please reason through step by step and then provide the final answer to the query. 
    Please verify your answer against the context provided, and rewrite the answer if inconsistent. Below is a question and relevant retrieved documents.
    
    Question: {query}

    Context:
    {context + "End of Context"}

    """
    retries = 3
    while retries > 0:
        try:
            response = llm_openai.chat.completions.create(
                model="gpt-3.5-turbo",  # or your preferred model
                messages=[
                    {"role": "assistant", "content": "You are an expert in ESG analysis looking through several documents"},
                    { "role": "user", "content": prompt},
                ],
            )
            if response and response.choices:
                return {"text": response.choices[0].message.content.strip()}
            else:
                print("Empty completion received. Retrying...")
                retries -= 1
                time.sleep(2)
        except Exception as e:
            print(f"API Error encountered: {e}. Retrying after delay...")
            retries -= 1
            time.sleep(5)
    return "API Error: Unable to generate response after retries."

'''def generate_deepseek_response(query, reranked_docs):
    print("deepseeking")

    """Retrieve context from ChromaDB and generate an answer using DeepSeek."""
    
    context = "\n\n".join(reranked_docs)

    prompt = f"""You are an expert in ESG analysis. Please reason through step by step and then provide the final answer to the query. 
    Please verify your answer against the context provided, and rewrite the answer if inconsistent. Below is a question and relevant retrieved documents.
    
    Question: {query}

    Context:
    {context + "End of Context"}

    Please provide a factually accurate response. If a fact is used from a document, include 'ids' next to it, like this. *information* ("doc_xxxxx")
    """
    retries = 3
    while retries > 0:
        try:
            response = llm_deepseek.completions.create(
                engine="deepseek/deepseek-r1-zero:free",  # or your preferred model
                messages=[
                    { "role": "user", "content": prompt},
                ],
                temperature=0.7,
            )
            if response['choices'][0] and response['choices'][0]['text']:
                return {"text": response['choices'][0]['text']}
            else:
                print("Empty completion received. Retrying...")
                retries -= 1
                time.sleep(2)
        except Exception as e:
            print(f"API Error encountered: {e}. Retrying after delay...")
            retries -= 1
            time.sleep(5)
    return "API Error: Unable to generate response after retries."'''

def rerank_documents(query, retrieved_docs):
    """
    Reranks the retrieved documents based on relevance scores using the BAAI/bge-reranker-base model.

    Args:
        query (str): The search query.
        retrieved_docs (list): A list of retrieved document texts.

    Returns:
        list: The reranked documents sorted by relevance.
    """
    if not retrieved_docs:
        return []

    # tokenise the query? = solve the ranking
    query_inputs = reranker_tokenizer(query, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        query_outputs = reranker_model(**query_inputs,output_hidden_states=True)
    # For simplicity, use the [CLS] token (first token) as the query embedding.
    query_embedding = query_outputs.hidden_states[-1][:, 0]
    
    doc_inputs = reranker_tokenizer(retrieved_docs, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        doc_outputs = reranker_model(**doc_inputs,output_hidden_states=True)
    # Use the [CLS] token embedding for each document.
    doc_embeddings = doc_outputs.hidden_states[-1][:, 0]

    # Compute relevance scores
    similarities = F.cosine_similarity(query_embedding, doc_embeddings, dim=-1)  # shape: (num_docs,)

    # Sort retrieved docs by relevance score (descending order)
    sorted_indices = similarities.argsort(descending=True)
    reranked_docs = [retrieved_docs[i] for i in sorted_indices.tolist()]
    print( reranked_docs)
    return reranked_docs


# Start of full RAG code with json

In [142]:
with open("../files/Scoring_revised.json", "r") as file:
    esg_metrics = json.load(file)

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chromadb_1003")
collection = chroma_client.get_or_create_collection(name="dsa3101")

# Initialize empty DataFrame
headers = [list(item.keys())[0] for item in esg_metrics]
df_columns = ["Company"] + headers  # One column per ESG metric
df_metrics = pd.DataFrame(columns=df_columns)

# Function to retrieve relevant ESG text using ChromaDB
def retrieve_esg_text(company, query):
    collection = chroma_client.get_collection(name="dsa3101")
    results = collection.query(query_texts=[query], n_results=15,where={"company": company})
    return results

# Function to rerank retrieved documents
def get_reranked_docs(query, results):
    retrieved_docs = [doc for doc in results["documents"][0]]
    reranked_docs = rerank_documents(query, retrieved_docs)
    return reranked_docs
    
# Function to extract metric values using DeepSeek  
def extract_values(query, results):
    reranked_docs = get_reranked_docs(query, results)
    print(rerank_documents)
    response = generate_openai_response(query, reranked_docs)
    return response  
# Function to compute the score based on thresholds
def compute_linear_score(extracted_values, scoring_query):
    score = generate_openai_response(scoring_query, str(extracted_values))
    return score

csv_file = "company_esg_scores.csv"

# Load existing CSV if it exists, otherwise initialize an empty DataFrame.
if os.path.exists(csv_file):
    df_existing = pd.read_csv(csv_file)
    existing_companies = set(df_existing["Company"].tolist())
else:
    df_existing = pd.DataFrame(columns=df_columns)
    existing_companies = set()    

# List of companies
companies = ["Apple","Morgan_Stanley","Pfizer","JohnsonControl","Origin"]  #  Fit in companies u want to do it for 
new_rows = []


# Process each company
for company in companies:
    if company in existing_companies:
        print(f"Company '{company}' already processed; skipping.")
        continue

    row_data = {"Company": company}

    #for metric_dict in esg_metrics:
    for metric_item in esg_metrics:
        for metric, details in metric_item.items():
            metric_name = metric  # Get the metric name
            query = details["query"]
            scoring_thresholds = details["scoring_query"]

            # Retrieve ESG text using ChromaDB
            retrieved_text = retrieve_esg_text(company, query)

            # Extract values using DeepSeek
            extracted_values = extract_values(query, retrieved_text)

            # Compute score
            score = compute_linear_score(extracted_values, scoring_thresholds)

        # Store results
        row_data[metric_name] = {"extracted_values": extracted_values, "score": score}
    print("row_data: " + str(row_data))
    new_rows.append(row_data)
print('new_rows:  ' + str(new_rows))



# Save DataFrame to CSV
if len(new_rows) != 0:
    df_new = pd.DataFrame(new_rows)
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)
    df_combined.to_csv(csv_file, index=False)
    print(f"Added {len(new_rows)} new companies to {csv_file}.")
else:
    print("No new companies to process.")



Company 'Apple' already processed; skipping.
['Emission source activity and the related GHG emissions calculations that take place external to the third-party software have their results loaded into the software to maintain a single record of GHG emissions for all scopes.', 'For an in-depth explanation of this metric, please see the Percent reduction in lending intensity metric compared to the 2019 base year SECTOR 2030 REDUCTION TARGET Auto Manufacturing Energy Power 35 The 2022 Absolute Financed Emissions information was subject to Deloitte & Touche LLP’s review.', 'However, we continue to see an overall lower level of reported scope 3 emissions data from companies compared to scopes 1 and 2.', 'On an annual basis, Morgan Stanley calculates the firm’s GHG emissions inventory: CO, (Carbon Dioxide), CH, (Methane), N,O (Nitrous Oxide), HFCs (Hydrofluorocarbons).', '- Climate Policy - Trade Associations - Industry Memberships - Climate Metrics and Targets - Our Climate Ambitions - Sustai

### Vector database
When you store documents in ChromaDB using collection.add(), it:

1. Generates vector embeddings for your text (if you haven't provided them).
2. Stores the document along with its embedding in the vector database.
3. Matches queries based on similarity search (cosine similarity by default).


# Post processing

Checking for hallucination, irrelevance, bias 
In this assignment, I felt that biasness wasn't really a metric required, I think it would be good to add biasness if i extracted data from third party sources grading the company esg scores. I can then compare the third-party metrics and scoring to each company's esg reports, and check if there is biasness in terms of their ratings, towards a particular, company or industry, etc. Therefore, I just added the metric for future reference, but it is not required in this assignment.

### Hallucination detection (Faithfullness)

In [None]:
def normalize_text(text):
    """Normalize text by converting to lowercase and removing punctuation."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

def fuzzy_match(sentence, doc, threshold=80):
    """Check if sentence has a fuzzy match in the document."""
    return fuzz.partial_ratio(normalize_text(sentence), normalize_text(doc)) >= threshold

def verify_facts(response, reranked_docs, fuzzy_threshold=80):
    """Detect hallucinations by checking if sentences exist in retrieved docs using fuzzy matching."""
    missing_facts = []
    
    # Split response into sentences and check if they appear in any of the documents
    for sent in response.split(". "):
        found = any(fuzzy_match(sent, doc, fuzzy_threshold) for doc in reranked_docs)
        if not found:
            missing_facts.append(sent)

    if missing_facts:
        print("Warning: Some statements are not found in the retrieved context:")
        for fact in missing_facts:
            print(f"- {fact}")
    
    return 1 - len(missing_facts) / len(response.split(". "))  # Faithfulness Score

faithfulness_score = verify_facts(response, reranked_documents)
print(f"Faithfulness Score: {faithfulness_score}")

## Irrelevance Check

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def check_relevance(query, response, threshold=0.6):
    """
    Check the relevance of the response to the query using semantic similarity.
    """
    query_embedding = model.encode([query])
    response_embedding = model.encode([response])

    similarity = cosine_similarity(query_embedding, response_embedding)[0][0]

    if similarity >= threshold:
        return True, similarity  
    else:
        return False, similarity  

In [None]:
is_relevant = check_relevance(query, response)
print(f"Is the response relevant? {is_relevant}")

In [None]:
model_name = "BAAI/bge-reranker-base" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def check_bias(text):
    """
    Check for potential bias in the text using a pretrained model.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Assuming binary classification (0 = no bias, 1 = biased)
    predicted_class = torch.argmax(logits, dim=1).item()
    
    return predicted_class == 1  # 1 indicates bias (this depends on the model's labeling)

# Example Usage
response = "Pfizer has been focusing on improving diversity in their clinical trials and sharing their insights with others as part of their diversity and inclusion initiatives in 2022."

is_biased = check_bias(response)
print(f"Is the response biased? {is_biased}")

# Evaluation

## Retriever Evaluation
Typical metrics: RecalL@k, Precision @k, Mean Reciprocal Rank, Mean Average Precision

### Cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_retrieval_relevance(query, reranked_docs):
    """Compute semantic similarity between query and retrieved docs."""
    corpus = [query] + reranked_docs
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    similarity_scores = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])
    return similarity_scores.mean()  # Average similarity

In [None]:
compute_retrieval_relevance(query, reranked_documents)

## Generator Evaluation 
Typical metrics: ROUGE, BLEU, BERTScore, domain-specific or task-specific metrics

### BLEU Score (Text similarity)

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def compute_bleu_score(reference, generated_response):
    """Compare generated response against reference text using BLEU score."""
    reference_tokens = reference.lower().split()
    generated_tokens = generated_response.lower().split()
    return sentence_bleu([reference_tokens], generated_tokens)

In [None]:
compute_bleu_score(query, response)

### Retrieval score (relevance)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_retrieval_relevance(reranked_docs, response):
    """Calculate how relevant the response is to the retrieved documents."""
    corpus = reranked_docs + [response]  # Combine all docs and response
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    similarity_matrix = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
    return similarity_matrix.mean()  # Average similarity score


In [None]:
compute_retrieval_relevance(reranked_docs, response)

### Judge LM 

In [None]:
client = genai.Client(api_key=google_api_key)
reranked_docs_str = "\n".join(reranked_documents)

gemini_eval = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=f"""
                Evaluate how well the response answers the query, giving an explanation of how it answers the question, and whether the response is factually correct based on the context provided.
                I have added the query, response and retrieved context below.
                
                Query: 
                {query}
                
                Response:
                {response}
                
                Retrieved Context:
                {reranked_docs_str}
                
                Give a score from 0 to 10, and a detailed explanation on the score, where:
                - 10 = Perfectly accurate
                - 0 = Completely incorrect
                """
)

print(gemini_eval.text)