## Preparation of R4C Data
### This notebook extracts and processes data from the raw R4C files.

## Imports and Setup

In [1]:
import json
import random
from pathlib import Path
from typing import List, Dict, Any

from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [2]:
# Define paths (relative, so they resolve against the current working directory).
# RAW_DIR is where read_json looks for input files; CLEAN_DIR is where save_json writes.
RAW_DIR = Path('./raw')
CLEAN_DIR = Path('./clean')

In [3]:
# Ensure the clean directory exists. `parents=True` also creates any missing
# intermediate directories, so this keeps working if CLEAN_DIR is later pointed
# at a nested location; `exist_ok=True` makes the cell safe to re-run.
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
import os
import json
import getpass


import os
import json
import getpass

def set_api_key(file_path: str, api_name: str):
    """
    Load one API key from a JSON key file and export it as an environment variable.

    The key file is iterated as a sequence of objects carrying a "name" and a
    "key" field. The first entry whose name matches `api_name`
    (case-insensitively) is exported as `<API_NAME>_API_KEY`. If the stored key
    is missing or empty, the user is prompted for it via `getpass`. When no
    entry matches, a message is printed and the environment is left untouched.

    Args:
    file_path (str): Path to the JSON key file. An empty string selects the
        default location hard-coded below.
    api_name (str): Name of the API to look up, e.g. "OpenAI", "Anthropic",
        "LangChain" or "Tavily".

    Returns:
    None

    Raises:
    OSError: If the key file cannot be opened.
    json.JSONDecodeError: If the key file is not valid JSON.
    """
    if file_path == "":
        # NOTE(review): machine-specific absolute path kept for backward
        # compatibility; pass `file_path` explicitly on any other machine.
        file_path = "/Users/radoslawizak/Desktop/MSc Data Science/Research project/keys.json"
    # Current APIs: "OpenAI", "Anthropic", "LangChain", "Tavily"
    with open(file_path, 'r', encoding='utf-8') as file:
        api_keys = json.load(file)

    wanted_name = api_name.lower()
    env_var_name = f"{api_name.upper()}_API_KEY"

    for api in api_keys:
        # Skip malformed entries (not an object, or no "name") instead of
        # crashing before the requested entry is reached.
        if not isinstance(api, dict):
            continue
        if str(api.get("name", "")).lower() != wanted_name:
            continue

        # A missing/null key becomes "" so that the getpass fallback below is
        # reached instead of os.environ rejecting a non-string value.
        stored_key = api.get("key")
        os.environ[env_var_name] = "" if stored_key is None else str(stored_key)

        if wanted_name == "langchain":
            os.environ["LANGCHAIN_TRACING_V2"] = "true"

        # Fall back to an interactive prompt when the file held no usable key.
        if not os.environ.get(env_var_name):
            os.environ[env_var_name] = getpass.getpass(f"{env_var_name}: ")
        break
    else:
        print(f"API name '{api_name}' not found in the provided file.")
file_path = "/Users/radoslawizak/Desktop/MSc Data Science/Research project/keys.json"

# Export credentials in a fixed order: the two LLM providers first, then
# LangChain (whose entry also switches tracing on). The empty path argument
# makes set_api_key fall back to its default key-file location.
for provider in ("OpenAI", "Anthropic", "LangChain"):
    set_api_key("", api_name=provider)

In [5]:
# Initialize the two chat models used for generation.
# Assumes the API keys have already been exported to the environment
# (done by the set_api_key calls earlier in this notebook).
# NOTE(review): with temperature=0.7 the generated text is presumably not
# repeatable run-to-run, independent of the random seed - confirm if that matters.
openai_llm = ChatOpenAI(model="gpt-4o", temperature=0.7)
anthropic_llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0.7)

In [6]:
# Set random seed for reproducibility.
# This seeds Python's `random` module, which this notebook uses for
# `random.sample` (choosing item IDs) and `random.choice` (choosing the model).
random.seed(42)

## Helper Functions

In [7]:
def read_json(filename: str) -> Dict[str, Any]:
    """
    Read a JSON file from the raw directory and return its contents.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding, so non-ASCII text is read identically on every machine.

    Args:
    filename (str): Name of the file to read, relative to RAW_DIR.

    Returns:
    Dict[str, Any]: Parsed contents of the JSON file (whatever top-level
    value the file holds).

    Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(RAW_DIR / filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(data: Any, filename: str):
    """
    Serialise `data` to JSON and write it into the clean directory.

    Args:
    data (Any): JSON-serialisable object to write.
    filename (str): Target file name, relative to CLEAN_DIR. An existing
        file with that name is overwritten.
    """
    target = CLEAN_DIR / filename
    # Two-space indentation keeps the output readable when inspected by hand.
    with target.open('w') as handle:
        json.dump(data, handle, indent=2)

def process_dev_csf(dev_csf: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Process the dev_csf file to extract unique evidence sentences.

    For every ID, each entry's third element (a list of strings) is joined
    with single spaces into one sentence. Duplicate sentences are dropped and
    the remaining ones are returned in first-seen order, so the result is
    identical on every run (a plain `set` of strings iterates in an order
    that changes between interpreter sessions).

    Args:
    dev_csf (Dict[str, Any]): The contents of the dev_csf file. Each value is
        iterated as a list of lists of entries, and `entry[2]` must be a
        sequence of strings.

    Returns:
    Dict[str, List[str]]: A dictionary with IDs as keys and lists of unique
    evidence sentences as values.
    """
    processed_data = {}
    for key, value in dev_csf.items():
        # dict keys de-duplicate while preserving insertion order.
        unique_sentences = dict.fromkeys(
            " ".join(entry[2])
            for item in value
            for entry in item
        )
        processed_data[key] = list(unique_sentences)
    return processed_data

def generate_claim(question: str, answer: str, llm: Any) -> str:
    """
    Turn a question/answer pair into a one-sentence claim via the given LLM.

    Args:
    question (str): The original question.
    answer (str): The original answer.
    llm (Any): The language model handed to the chain.

    Returns:
    str: The chain's output with surrounding whitespace removed.
    """
    template_text = (
        "Convert the following question and answer into a single and concise sentence claim. "
        "Do NOT introduce any new information, just paraphrase in simple language. "
        "Do NOT return ANY other additional text, just the claim:\n"
        "Question: {question}\n"
        "Answer: {answer}\n"
        "Claim:"
    )
    claim_prompt = PromptTemplate(
        input_variables=["question", "answer"],
        template=template_text,
    )
    claim_chain = LLMChain(llm=llm, prompt=claim_prompt)
    raw_claim = claim_chain.run(question=question, answer=answer)
    return raw_claim.strip()

def generate_explanation(evidence: List[str], claim: str, llm: Any) -> str:
    """
    Ask the given LLM for a short explanation of a claim based on evidence.

    Args:
    evidence (List[str]): Evidence sentences; they are passed to the prompt
        joined by newlines.
    claim (str): The claim to explain.
    llm (Any): The language model handed to the chain.

    Returns:
    str: The chain's output with surrounding whitespace removed.
    """
    template_text = (
        "Using the following evidence, create a short and concise explanation (1-3 sentences) for the claim. "
        "Use simple and plain language. "
        "Do NOT introduce any new information, just combine what is given. "
        "Keep paraphrasing to a minimum. "
        "Do NOT return ANY other additional text, just the explanation:\n"
        "Evidence: {evidence}\n"
        "Claim: {claim}\n"
        "Explanation:"
    )
    explanation_prompt = PromptTemplate(
        input_variables=["evidence", "claim"],
        template=template_text,
    )
    explanation_chain = LLMChain(llm=llm, prompt=explanation_prompt)
    evidence_text = "\n".join(evidence)
    raw_explanation = explanation_chain.run(evidence=evidence_text, claim=claim)
    return raw_explanation.strip()

def process_evidence(evidence: List[List[str]]) -> List[str]:
    """
    Flatten each evidence entry into a single string, dropping its headline.

    Args:
    evidence (List[List[str]]): The original evidence list. Only the second
        element of each entry is used; it is joined with single spaces.

    Returns:
    List[str]: One combined string per entry, in the original order.
    """
    combined = []
    for entry in evidence:
        sentences = entry[1]
        combined.append(' '.join(sentences))
    return combined

def remove_duplicate_info(evidence: List[str], llm: Any) -> List[str]:
    """
    Remove informationally duplicate entries from the evidence list using an LLM.

    The evidence is sent to the model as newline-separated lines. The reply is
    split on newlines; blank lines are dropped and leading dashes/spaces are
    stripped from each remaining line.

    Args:
    evidence (List[str]): The original evidence list.
    llm (Any): The language model to use.

    Returns:
    List[str]: Evidence list with informationally duplicate entries removed.
    An empty input yields an empty list without calling the model.
    """
    if not evidence:
        # Nothing to de-duplicate: skip the LLM call, which would otherwise
        # receive an empty prompt and could return invented content.
        return []

    prompt = PromptTemplate(
        input_variables=["evidence"],
        template="Remove any informationally duplicate entries from the following list, including paraphrases or similar information. Do NOT change the content under any circumstances, only remove. Return only the unique informational content without any leading dashes or bullet points. Do NOT return ANY other additional text, just the evidence:\n{evidence}\nUnique information:"
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    unique_info = chain.run(evidence="\n".join(evidence)).strip().split("\n")
    # lstrip('- ') removes any run of leading dashes and spaces (bullet markers).
    return [info.strip().lstrip('- ') for info in unique_info if info.strip()]


## Data Processing

In [8]:
# Read input files (both are looked up inside RAW_DIR by read_json).
# `dev_csf` is consumed by process_dev_csf; `hotpot` is iterated as a list of
# items that carry '_id', 'question', 'answer' and 'context' fields.
dev_csf = read_json('dev_csf.json')
hotpot = read_json('hotpot_dev_fullwiki_v1.json')

In [9]:
# Process dev_csf
processed_dev_csf = process_dev_csf(dev_csf)

# Create full data
full_data = {}

# Get the intersection of IDs from both datasets.
# The IDs are sorted before sampling: a set of strings iterates in an order
# that changes between interpreter sessions (string hash randomisation), so
# sampling from an unsorted list would pick different IDs after every kernel
# restart even though the random seed is fixed.
common_ids = sorted(set(processed_dev_csf.keys()) & {item['_id'] for item in hotpot})

# Randomly sample 200 IDs (or all of them if fewer than 200 are available)
sample_size = min(200, len(common_ids))
sampled_ids = random.sample(common_ids, sample_size)

# Initialize evidence knowledge bases
evidence_kb = []
evidence_golden_kb = []
# Evidence IDs are 1-based and shared across both knowledge bases.
evidence_id_counter = 1

In [10]:
# Start from empty accumulators so that re-running this cell on its own does
# not append duplicate evidence or continue the ID counter from an earlier run.
full_data = {}
evidence_kb = []
evidence_golden_kb = []
evidence_id_counter = 1

# Constant-time membership test; `sampled_ids` itself is left unchanged.
sampled_id_lookup = set(sampled_ids)

for item in hotpot:
    item_id = item['_id']
    if item_id not in sampled_id_lookup:
        continue

    # Randomly decide which model to use; the same model then produces the
    # claim, the de-duplicated golden evidence and the explanation for this item.
    model_used = random.choice(["OpenAI", "Anthropic"])
    llm = openai_llm if model_used == "OpenAI" else anthropic_llm

    # Generate claim using the selected model
    claim = generate_claim(item['question'], item['answer'], llm)

    # Process and remove informationally duplicate evidence using the same model
    evidence_golden = remove_duplicate_info(processed_dev_csf[item_id], llm)
    evidence = process_evidence(item['context'])

    # Generate explanation using the same model
    explanation = generate_explanation(evidence_golden, claim, llm)

    # Create evidence structures with IDs. Each record is stored as a separate
    # dict in the per-item list and in the global knowledge base, so mutating
    # one never affects the other.
    evidence_with_ids = []
    for ev in evidence:
        record = {
            "evidence_id": evidence_id_counter,
            "description": ev,
        }
        evidence_with_ids.append(record)
        evidence_kb.append(dict(record))
        evidence_id_counter += 1

    # Golden evidence continues the same ID sequence and additionally records
    # which model produced the de-duplicated text.
    evidence_golden_with_ids = []
    for ev in evidence_golden:
        record = {
            "evidence_id": evidence_id_counter,
            "description": ev,
            "model_used": model_used
        }
        evidence_golden_with_ids.append(record)
        evidence_golden_kb.append(dict(record))
        evidence_id_counter += 1

    full_data[item_id] = {
        'claim': claim,
        'model_used': model_used,
        'explanation': explanation,
        'evidence_golden': evidence_golden_with_ids,
        'evidence': evidence_with_ids
    }


  warn_deprecated(
  warn_deprecated(


In [11]:
# Map each output file name to the object written into it: the full data
# first, then the two evidence knowledge bases.
outputs = {
    'full_data.json': full_data,
    'evidence_kb.json': evidence_kb,
    'evidence_golden_kb.json': evidence_golden_kb,
}

# Write everything before reporting, so the summary is only printed once all
# files have been saved.
for output_name, payload in outputs.items():
    save_json(payload, output_name)

print("Data processing completed. Files saved in the clean directory:")
for output_name, payload in outputs.items():
    print(f"- {output_name} with {len(payload)} items")

Data processing completed. Files saved in the clean directory:
- full_data.json with 200 items
- evidence_kb.json with 1972 items
- evidence_golden_kb.json with 590 items
