In [2]:
import pandas as pd
import sys
sys.path.append('../pipeline_v2/')
import main

* 'fields' has been removed


In [3]:
import os
def load_data(seed, model, num_samples=50):
    if os.path.exists(f'results_v2_{model}.pkl'):
        df = pd.read_pickle(f'results_v2_{model}.pkl')
    elif os.path.exists(f'./benchmark/results_v2_{model}.pkl'):
        df = pd.read_pickle(f'./benchmark/results_v2_{model}.pkl')
    else:
        raise ValueError(f"results_v2_{model}.pkl not found")

    sample_df = df.sample(num_samples, random_state=seed).reset_index(drop=True)
    return sample_df

In [4]:
def generate_text(model = 'gemini'):
    texts = []
    sample_df = load_data(42, model)
    for i in range(len(sample_df)):
        text_printed = """
    # Instructions
    ## 📝 For Each Statement
    ### Step 1: Review the Analysis
    - ✅ Check the LLM's verdict
    - 💭 Examine reasoning provided
    - 📑 If needed, expand individual claims below to examine evidence

    ### Step 2: Rate Agreement Level
    Choose one:
    - 🌟 **STRONGLY AGREE**: Perfect verdict & reasoning
    - ✅ **AGREE**: Mostly correct analysis
    - ⚠️ **DISAGREE**: Significant issues found
    - ❌ **STRONGLY DISAGREE**: Completely incorrect

    ### Step 3: If Disagreeing, Select Why
    - 🔍 **IRRELEVANT/INCORRECT EVIDENCE**: Wrong evidence retrieved
    - 🤔 **INCORRECT ANALYSIS**: Evidence interpreted incorrectly

    ### RESULTS:
        """
        row = sample_df.iloc[i]
        results = row[f'{model}_pipeline_results']
        result = results[0]
        claims = result['claims']
        statement_text = f"On {row['statement_date']}, {row['statement_originator']} claimed: {row['statement']}"
        reasoning = result['reasoning'].replace('\n', ' ')
        
        text_printed += f"Statement Evaluation {i + 1}\n"
        text_printed += f"Statement: {statement_text}\n"
        text_printed += f"Overall Verdict: {row['pipeline_pass3_verdict']}\n"
        text_printed += f"Overall Confidence: {result['confidence']}\n"
        text_printed += f"Overall Reasoning: {reasoning}\n\n"

        text_printed += f"Claims Extracted & Independently Verified:\n"

        for i, claim in enumerate(claims):
            text_printed += f"Claim {i + 1}/{len(claims)}: {claim.text}\n"
            text_printed += f"Verdict: {claim.verdict}\n"
            text_printed += f"Confidence: {claim.confidence}\n"
            text_printed += f"Reasoning: {claim.reasoning}\n"

            for component in claim.components:
                answer_text = component.answer.text
                answer_text = answer_text.replace('\n', ' ')
                text_printed += f"Question: {component.question}\n"
                text_printed += f"Answer: {component.answer.text}\n"

                if component.answer.citations:
                    text_printed += f"Explicit citations by the model:\n"
                    for j, citation in enumerate(component.answer.citations, 1):
                        if citation: 
                            text_printed += f"[{j}] {citation.snippet}\n"
                            text_printed += f"{citation.source_title}  - {citation.source_url}\n"
                
                text_printed += '\n'
                if component.answer.retrieved_docs:
                    text_printed += f"Documents used to synthesize answer (implicit citations):\n"
                    for k, doc in enumerate(component.answer.retrieved_docs, 1):
                        text_printed += f"[{k}] {doc.content if doc.content else ''}\n"
                        text_printed += f"{doc.metadata.get('title', '') if doc.metadata else ''} - {doc.metadata.get('url', '') if doc.metadata else ''}\n"
                text_printed += '\n'
        text_printed += """Return a Python list containing three things and NOTHING Else: 
        (1) agreement level (one of the following: STRONGLY AGREE, AGREE, DISAGREE, STRONGLY DISAGREE), 
        (2) if disagreeing, select why (IRRELEVANT/INCORRECT EVIDENCE, INCORRECT ANALYSIS, or None)
        (3) other elaboration, reasoning, or comments"""
        texts.append(text_printed)
    return texts

In [5]:
from langchain_community.llms.ollama import Ollama
from langchain.prompts import ChatPromptTemplate

In [6]:
# prompt_template = ChatPromptTemplate.from_template(template)
# prompt = prompt_template.format(context=None, question=query_text)
model = Ollama(model="llama3.2")

  model = Ollama(model="llama3.2")


In [7]:
from tqdm.notebook import tqdm
import pickle
gemini_texts = generate_text(model = 'gemini')
gemini_evals = []
for i in tqdm(range(len(gemini_texts))):
    response_text = model.invoke(gemini_texts[i])
    gemini_evals.append(response_text)
    with open("llama_eval_gemini.pkl", 'wb') as f:
        pickle.dump(gemini_evals, f)

  0%|          | 0/50 [00:00<?, ?it/s]

In [9]:
from tqdm.notebook import tqdm
import pickle
mistral_texts = generate_text(model = 'mistral')
mistral_evals = []
for i in tqdm(range(len(mistral_texts))):
    response_text = model.invoke(mistral_texts[i])
    mistral_evals.append(response_text)
    with open("llama_eval_mistral.pkl", 'wb') as f:
        pickle.dump(mistral_evals, f)

  0%|          | 0/50 [00:00<?, ?it/s]

In [63]:
import pickle
import ast
with open('llama_eval_gemini.pkl', 'rb') as f:
    gemini_copy = pickle.load(f)
with open('llama_eval_mistral.pkl', 'rb') as f:
    mistral_copy = pickle.load(f)

In [22]:
with open('llama_eval_gemini.pkl', 'wb') as f:
    pickle.dump(gemini_copy, f)
with open('llama_eval_mistral.pkl', 'wb') as f:
    pickle.dump(mistral_copy, f)

In [12]:
import ast
mistral_copy = mistral_evals[:]
for i, m in enumerate(mistral_copy):
    m = mistral_copy[i]
    if type(m) == list:
        continue
    start = m.find('[')
    end = m.find(']')+1
    m = m[start:end]
    # m = m.replace("['",'["').replace("']",'"]').replace(", '", ', "').replace("',", '",')
    # m = m.replace("[\n",'["').replace("\n]",'"]').replace(",\n", '",').replace(', ', ', "')
    try:
        m = ast.literal_eval(m)
    except Exception as e:
        print(f"Error at idx {i}, {e}")
    mistral_copy[i] = m

Error at idx 2, unterminated string literal (detected at line 1) (<unknown>, line 1)
Error at idx 3, unterminated string literal (detected at line 1) (<unknown>, line 1)
Error at idx 5, invalid syntax. Perhaps you forgot a comma? (<unknown>, line 2)
Error at idx 10, unterminated string literal (detected at line 4) (<unknown>, line 4)
Error at idx 16, invalid syntax (<unknown>, line 0)
Error at idx 29, invalid syntax (<unknown>, line 0)
Error at idx 30, invalid syntax (<unknown>, line 0)
Error at idx 33, unterminated string literal (detected at line 2) (<unknown>, line 2)
Error at idx 39, invalid syntax (<unknown>, line 0)
Error at idx 41, invalid syntax. Perhaps you forgot a comma? (<unknown>, line 1)
Error at idx 44, invalid syntax (<unknown>, line 0)


In [13]:
gemini_copy = gemini_evals[:]
for i, m in enumerate(gemini_copy):
    m = gemini_copy[i]
    start = m.find('[')
    end = m.find(']')+1
    m = m[start:end]
    m = m.replace("['",'["').replace("']",'"]').replace(", '", ', "').replace("',", '",')
    try:
        m = ast.literal_eval(m)
    except Exception as e:
        print(f"Error at idx {i}, {e}")
    gemini_copy[i] = m

Error at idx 3, unterminated string literal (detected at line 1) (<unknown>, line 1)
Error at idx 5, unterminated string literal (detected at line 2) (<unknown>, line 2)
Error at idx 6, unterminated string literal (detected at line 2) (<unknown>, line 2)
Error at idx 7, unterminated string literal (detected at line 1) (<unknown>, line 1)
Error at idx 14, unterminated string literal (detected at line 1) (<unknown>, line 1)
Error at idx 16, unterminated string literal (detected at line 2) (<unknown>, line 2)
Error at idx 22, unterminated string literal (detected at line 1) (<unknown>, line 1)
Error at idx 24, malformed node or string on line 2: <ast.Name object at 0x16a592fb0>
Error at idx 28, unterminated string literal (detected at line 1) (<unknown>, line 1)
Error at idx 32, invalid syntax. Perhaps you forgot a comma? (<unknown>, line 2)
Error at idx 44, invalid syntax (<unknown>, line 0)
Error at idx 47, unterminated string literal (detected at line 1) (<unknown>, line 1)


In [None]:
gemini_copy = gemini_evals[:]
for i, m in enumerate(gemini_copy):
    corrected = m
    if m[0] != None:
        continue
    while type(corrected) != list or m[0] == None: 
        print(m)
        print(f'index {i}: enter correction:')
        x = input()
        corrected = ast.literal_eval(x)
    gemini_copy[i] = corrected
    print("="*30)