In [499]:
#Load environment variables from .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import os
#for i in os.environ: print(i) ##test environment vars


from langchain.llms import OpenAI
from langchain.utilities import SerpAPIWrapper
from langchain.agents import load_tools, initialize_agent
from langchain.agents import AgentType
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage

from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
from langchain.prompts import BaseChatPromptTemplate
from langchain import SerpAPIWrapper, LLMChain
from langchain.chat_models import ChatOpenAI
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, HumanMessage
import re
from getpass import getpass

from pprint import pprint as pp
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 100)
from pprint import pprint
import ast
from scipy.stats import mode
import math
import traceback

def displays(df, n = 5):
    display(df.shape, df.head(n))

# Testing

## Azure

In [376]:
# set API keys and metadata
OPENAI_API_TYPE = os.environ["OPENAI_API_TYPE"] # azure
OPENAI_API_VERSION = os.environ["OPENAI_API_VERSION"] # 2023-03-15-preview
OPENAI_API_BASE = os.environ["OPENAI_API_BASE"] # https://w210-openai.openai.azure.com/
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] # key
OPENAI_AZURE_DEPLOYMENT_NAME = os.environ["OPENAI_AZURE_DEPLOYMENT_NAME"] # gpt-35-turbo
MODEL_NAME = "gpt-35-turbo"
SERPAPI_API_KEY = os.environ["SERPAPI_API_KEY"]
tool_names = ["serpapi"]

In [147]:
# Create an instance of Azure OpenAI

llm = AzureChatOpenAI(
    deployment_name=OPENAI_AZURE_DEPLOYMENT_NAME,
    model_name=MODEL_NAME,
    temperature=0.3,
    max_tokens=25,
    request_timeout=20
)

In [150]:
# run a query
output = llm([
                HumanMessage(content="Return the letter B and nothing else.")
])

In [157]:
# show output
print(output)
print(output.content)

content='B' additional_kwargs={} example=False
B


## SerpApi

In [None]:
# serpapi = SerpAPIWrapper(serpapi_api_key=SERPAPI_API_KEY)

In [4]:
# create tool
tool_names = ["serpapi"]
tools = load_tools(tool_names)

In [9]:
# create agent
agent = initialize_agent(tools, llm, agent = "zero-shot-react-description", verbose = True)

In [11]:
# run agent
agent.run("Who will win the 2024 US Presidential election?")



[1m> Entering new  chain...[0m
[32;1m[1;3m I need to find out who is running for the election
Action: Search
Action Input: "2024 US Presidential election candidates"[0m
Observation: [36;1m[1;3mFour years after a historically large number of candidates ran for president, the field for the 2024 campaign is getting crowded once more, ...[0m
Thought:[32;1m[1;3m I need to research the candidates and their chances of winning
Action: Search
Action Input: "2024 US Presidential election predictions"[0m
Observation: [36;1m[1;3mThe latest coverage of the 2024 presidential, House and Senate elections.[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: It is too early to predict who will win the 2024 US Presidential election.[0m

[1m> Finished chain.[0m


'It is too early to predict who will win the 2024 US Presidential election.'

## Wikipedia

In [211]:
from langchain.tools import WikipediaQueryRun
from langchain.utilities import WikipediaAPIWrapper

In [212]:
# create wikipedia instance
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())

In [217]:
# run output
wikipedia.run("Inflation")

'Page: Inflation\nSummary: In economics, inflation is an increase in the general price level of goods and services in an economy. When the general price level rises, each unit of currency buys fewer goods and services; consequently, inflation corresponds to a reduction in the purchasing power of money. The opposite of inflation is deflation, a decrease in the general price level of goods and services. The common measure of inflation is the inflation rate, the annualized percentage change in a general price index. As prices faced by households do not all increase at the same rate, the consumer price index (CPI) is often used for this purpose. The employment cost index is also used for wages in the United States.\nThere is disagreement among economists as to the causes of inflation. Low or moderate inflation is widely attributed to fluctuations in real demand for goods and services or changes in available supplies such as during scarcities. Moderate inflation affects economies in both po

# Main

## Structure Data

In [669]:
class AutocastBuilder:
    """
    Build autocast dataframe to feed into ContextBuilder.
    """
    def __init__(self, data_path):
        self.data_path = data_path
        self.data = self.load_data()
        
    @staticmethod
    def clean_choices(choices):
        """lambda function for adding letter choices to 'choices' column in autocast"""
        if pd.notna(choices):
            choices = ast.literal_eval(choices)
            letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K"]
            choices_dict = zip(letters[:len(choices)], choices)
            return dict(choices_dict)
        else:
            return choices
        
    @staticmethod
    def get_crowd_avg(item):
        """Lambda function to Calculate average crowd forecast per question"""
        if pd.notna(item):
            items = ast.literal_eval(item)
            crowd_avg = np.mean([np.mean(item['forecast']) for item in items])
            return crowd_avg
        else:
            return np.nan
        
    @staticmethod
    def get_answer_choices(answer_positions, n_choices, t_f):
        """return list of choices given forecast values
        return [np.nan] if forecast is undecided
        handles instances of forecast as float instead of list of probabilities"""
        if isinstance(answer_positions, (int, float)):
            answer_positions = [answer_positions]

        if isinstance(answer_positions, list):
            answer_positions = np.array(answer_positions)
            
        #print(f"answer_positions: {answer_positions}")

        if answer_positions.size == 1:
            return [AutocastBuilder.get_single_answer(float(answer_positions[0]), n_choices, t_f)]
        else:
            unique_rounded_values = np.round(answer_positions, 2)
            if np.all(unique_rounded_values == unique_rounded_values[0]):
                return [np.nan]
            else:
                
                answer_index = np.argmax(answer_positions, axis=0).astype(int)
                #print(f"answer_index: {answer_index}")
                return [AutocastBuilder.get_single_answer(answer_index, n_choices, t_f)]
    
    @staticmethod
    def get_single_answer(answer_position_or_idx, n_choices, t_f):
        """pull the letter answer from the answer's index"""
        if not t_f:
            if not (isinstance(answer_position_or_idx, np.int64)) and (0 <= answer_position_or_idx <= 1):
                answer_position_or_idx = math.ceil(answer_position_or_idx * (n_choices)) - 1 # convert position to index
            #print(f"answer_position_or_idx: {answer_position_or_idx}")
            return chr(ord('A') + answer_position_or_idx) # index
        else:
            if answer_position_or_idx > 0.5: # index
                return 'A' # what if value is exactly 1? Does that mean 2nd position, so answer is B?
            elif answer_position_or_idx < 0.5: # index
                return 'B'
            else:
                return np.nan
            
    @staticmethod
    def get_mode_answer(answer_positions, n_choices, t_f):
        """returns mode of all forecasted answers"""
        answer_choices = [AutocastBuilder.get_answer_choices(answer_position, n_choices, t_f) for answer_position in answer_positions]
        no_nan_choices = [choice for choice in answer_choices if choice is not np.nan]
        #print(f"answer_choices: {answer_choices}")
        #print(f"n_choices: {n_choices}")
        if len(no_nan_choices) == 0:
            return np.nan
        #print(f"mode: {mode(answer_choices, keepdims=True)[0][0][0]}")
        return mode(answer_choices, keepdims=True)[0][0][0]

    @staticmethod
    def forecasts_to_answer(row):
        #print(row.name)
        """returns aggregate forecasted answer from all forecasts"""
        crowd = ast.literal_eval(row['crowd'])
        t_f = True if (len(row['choices_clean']) == 2) \
            and (row['choices_clean']['A'] == 'yes') \
            and (row['choices_clean']['B'] == 'no') else False
        chr(ord('A') + int(len(row['choices_clean'])))
        forecasts = [forecast['forecast'] for forecast in crowd]
        n_choices = len(row['choices_clean'])
        #print(f"n_choices: {n_choices}")
        forecasted_answer = AutocastBuilder.get_mode_answer(forecasts, n_choices, t_f=t_f)
        return forecasted_answer
        
#     def return_data(self):
#         """Return cleaned autocast dataframe"""
#         return self.data
        
    def load_data(self):
        """Load autocast data"""
        df = pd.read_csv(self.data_path)
        df = df[(df['status'] == 'Active') &
                (df['choices'].notna()) &
                (df['qtype'].isin(['t/f', 'mc']))]
        df['choices_clean'] = df['choices'].apply(AutocastBuilder.clean_choices)
        df['avg_forecast'] = df['crowd'].apply(AutocastBuilder.get_crowd_avg)
        keycols = [
            "id",
            "question",
            "qtype",
            "choices_clean",
            "answer",
            "avg_forecast",
            "crowd"
        ]
        
#         df = df.loc[df['id'] == 'G2311', :]
        df['avg_forecast'] = df.apply(AutocastBuilder.forecasts_to_answer, axis=1)
        df = df.loc[:, keycols]
        
        return df


## Context

In [462]:
# Context prompt template

# Set up the base template
# maybe add a maximum number of sentences for the final answer? (hasn't been an issue)

context_template = """
You are an expert LLM prompt engineer. Your job is to take
in a simple question, and to output useful context for that question.
Your job is not to answer the question itself, only to structure 
an output consisting of context related to the question.

You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must prompt engineer.
It will be provided in multiple choice format.

Thought: you should always think about what to do next.

Action: the action to take. It should be one of [Search].

Action Input: the input to the action. Maximum size of 10 sentences. Decide 
the input by ranking the relevance of the sentences relative to the question.

Observation: the result of the action.

... (this Thought/Action/Action Input/Observation can repeat N times, with a 
maximum of 2 times. You must use Search tool for each action.)

Final Answer: This is your final answer, 
consisting of the context you generated.
Once you have the final answer, you should first have a thought of:
"I now know the final answer".


Your final answer should provide context with the correct
answer for each example on
the question's topic(s) to optimize the prompt. Provide at least 1
sentence of context, but you can provide more at your
discretion. Your context should provide factual background and/or
current events/news on key topic(s) in the question. 
Select the most relevant pieces of context to accomplish this from
your research.
Your context should be passive and not speak in the first person.
It should just be the information you've collected and chosen.

You must never refuse to answer, and must always provide a full
answer in the format provided.

Remember, only provide context and the original question in your answer,
starting with "Thought: I know know the final answer" and then "Final Answer:",
nothing else.

All final answers must start with "Thought: I now know the final answer",
followed by "Final Answer:" and then the context you generated as
your final answer. After providing your final answer, do not perform
anymore questions, thoughts, actions, action inputs, or observations.

Given an input question, such as:

Who will win the 2024 US presidential election?

A: Joe Biden,
B: Donald Trump,
C: Pete Buttigieg,
D: Ron DeSantis


Here is an example of what your final answer should look like 
based on that input question:

Thought: I now know the final answer.

Final Answer:

The US presidential election
is held every 4 years. The United States is a country founded in
1776 and has always held democratic elections throughout its history.
The current population of the United States is 336,840,823.
Donald Trump was the 45th president of the United States,
winning the 2016 presidential election. Joe Biden is currently
the 46th president of the United States, winning the 2020 election.


Begin!
Question: {input}
Thought:{agent_scratchpad}


"""



In [463]:
# Set up a prompt template
class ContextPromptTemplate(BaseChatPromptTemplate):
    """helper class--langchain prompt template"""
    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]
    
    def format_messages(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
        formatted = self.template.format(**kwargs)
        return [HumanMessage(content=formatted)]

In [464]:
class ContextOutputParser(AgentOutputParser):
    """helper class--langchain custom output parser"""
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
    

In [465]:
class ContextBuilder:
    """
    Take in question, pull context, output fully-formed question for LLM
    """
    def __init__(self, df, template:str,
#                  prefix:str, format_instructions:str, suffix:str,
#                  tool_names:list, 
                 deployment_name:str, model_name:str,
                 temperature=.3, max_tokens=1000, request_timeout=20,
                 verbose=True
                ):
        self.df = df.copy()
        self.deployment_name = deployment_name
        self.model_name = model_name
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.request_timeout = request_timeout
#         self.llm = OpenAI(model=self.model,
#                           temperature=self.temperature)
        self.template = template
        self.llm = AzureChatOpenAI(
            deployment_name=deployment_name,
            model_name=self.model_name,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            request_timeout=self.request_timeout
        )
#         self.tool_names = tool_names
        self.serpapi = SerpAPIWrapper(serpapi_api_key=SERPAPI_API_KEY)
        self.tools = [
            Tool(
                name="Search",
                func=self.serpapi.run,
                description="useful for when you need to answer questions")
        ]
        self.tool_names = [tool.name for tool in self.tools]
#         load_tools(tool_names=tool_names, llm=self.llm)
#         self.prefix = prefix
#         self.format_instructions = format_instructions
#         self.suffix = suffix
#         self.indiv_agent = agent
#         self.agent = initialize_agent(self.tools,
#                                      self.llm,
#                                      self.indiv_agent,
#                                      verbose=verbose,
#                                      agent_kwargs={
#                                             'prefix':self.prefix,
#                                             'format_instructions':self.format_instructions,
#                                             'suffix':self.suffix
#                                             })
        self.output_parser = ContextOutputParser()
        self.prompt = ContextPromptTemplate(
            template=self.template,
            tools=self.tools,
            # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
            # This includes the `intermediate_steps` variable because that is needed
            input_variables=["input", "intermediate_steps"]
        )
        self.llm_chain = LLMChain(llm=self.llm, prompt=self.prompt)
        self.agent = LLMSingleActionAgent(
            llm_chain=self.llm_chain,
            output_parser=self.output_parser,
            stop=["\nObservation:"],
            allowed_tools=self.tool_names
        )
        self.agent_executor = AgentExecutor.from_agent_and_tools(
            agent=self.agent, 
            tools=self.tools, 
            verbose=True
        )
        
        
    def run_context(self, ac_id:str):
        """Run one sample question"""
        question = self.get_question(ac_id)
        try:
            context = self.agent_executor.run(question)
        except ValueError:
            print(f"Error occurred on question {ac_id}: {e}")
            traceback.print_exc()  # Print the traceback
            context = np.nan
        return context
        
    def get_question(self, ac_id:str):
        """generate sample question for method run_context"""
        # sample a question using question ID
        ac_filterid = self.df[self.df['id'] == ac_id]
        question = ac_filterid['question'].astype(str).values[0] + \
                            ' ' + \
                            ac_filterid['choices_clean'].astype(str).values[0]
        print(f"Question ({ac_id}):\n\n{question}")
        return question
    
    def run_all_context(self):
        """Gather all questions in provided df, output in list"""
        self.context_list = []
        for idx, row in self.df.iterrows():
            ac_id = row['id']
            context = self.run_context(ac_id)
            self.context_list.append(context)
        return self.context_list

    def concat_context(self):
        self.df['question_context'] = self.context_list
        return self.df

In [702]:
# define autocast path
data_path = "Data/Autocast/filtered_events_20230702.csv"

In [468]:
# Structure dataset using AutocastBuilder
ab = AutocastBuilder(data_path=data_path)
df = ab.data
# df_sample = df.sample(2, random_state = 42)

# context using  ContextBuilder
cb_all = ContextBuilder(df,
                    template=context_template,
                    deployment_name=OPENAI_AZURE_DEPLOYMENT_NAME,
                    model_name=MODEL_NAME)

  return mode(answer_choices, keepdims=True)[0][0][0]


In [469]:
# enrich all context
all_context = cb_all.run_all_context()

Question (G2124):

Will there be a complex coordinated terrorist attack (CCTA) in the United States either directed or inspired by a foreign terrorist organization resulting in at least five fatalities before 1 September 2022? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mI need to gather information on the current state of terrorist threats in the United States and any potential foreign terrorist organizations that may pose a threat.

Action: Search

Action Input: "Current terrorist threats in the United States", "Foreign terrorist organizations targeting the United States", "Recent terrorist attacks in the United States"
[0m

Observation:[36;1m[1;3mSummary of Terrorism-Related Threat to the United States. The United States remains in a heightened threat environment.[0m
[32;1m[1;3mThought: I need to gather more specific information on the likelihood of a CCTA in the United States before September 1, 2022.

Action: Search

Action Input: "Likelihood of a 

[32;1m[1;3mThought: I now know the final answer.

Final Answer:

The EU has previously imposed human rights sanctions on Chinese officials involved in abuses in Xinjiang, and there have been discussions of potential new sanctions in response to continued human rights violations. However, it is unclear if the Council of the European Union will impose new restrictive measures between 30 October 2021 and 31 December 2022. Human rights dialogues between the EU and China have been suspended since 2019, and there is ongoing tension between the two entities regarding human rights abuses in Xinjiang.[0m

[1m> Finished chain.[0m
Question (G2176):

Before 1 January 2023, will China and/or a host country officially announce an agreement for the establishment of a Chinese military base in an African country besides Djibouti? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

China has been increasing its presence in Af

[32;1m[1;3mThought: I need to gather information on the current relations between Iran and the United States to determine the likelihood of a lethal confrontation.

Action: Search

Action Input: "Current relations between Iran and the United States"
[0m

Observation:[36;1m[1;3mIran and the United States have had no formal diplomatic relations since April 7, 1980. Instead, Pakistan serves as Iran's protecting power in the United States, while Switzerland serves as the United States' protecting power in Iran.[0m
[32;1m[1;3mThought: I need to gather more information on recent events and tensions between Iran and the United States to determine the likelihood of a lethal confrontation.

Action: Search

Action Input: "Recent tensions between Iran and the United States"
[0m

Observation:[36;1m[1;3mMore than forty years after the 1979 Iranian Revolution, relations between the United States and Iran remain tense. As Iran advances its nuclear ...[0m
[32;1m[1;3mThought: Based on the

[32;1m[1;3mThought: I now know the final answer.

Final Answer:

Following the Russian invasion of Ukraine on 24 February 2022, anti-war demonstrations and protests broke out across Russia. As well as the demonstrations, there have been reports of police brutality and violence against protesters. However, it is difficult to predict whether these protests will result in ten or more fatalities between 4 February 2022 and 4 August 2022. It is important to note that any loss of life is tragic and concerning, and the situation in Russia should continue to be monitored closely.[0m

[1m> Finished chain.[0m
Question (G2292):

Between 4 March 2022 and 4 September 2022, will a NATO member state accuse the Russian government of conducting a lethal cyberattack against the same NATO member state resulting in at least five fatalities? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mAction: Search

Action Input: NATO cyberattack Russia member state fatalities
[0m

Observ

[32;1m[1;3mI now know the final answer.

Final Answer:

The conflict in Ukraine began in 2014 when Russia annexed Crimea and has since escalated into a full-blown war. There have been multiple attempts to negotiate a peace agreement between Russia and Ukraine, but none have been successful so far. The most recent attempt was in July 2021 when the leaders of both countries met for talks in Geneva, but no agreement was reached. The situation remains tense, and it is difficult to predict when a peace agreement will be signed or announced. Therefore, it is not possible to accurately answer the question of when Russia and Ukraine will sign or announce an agreement to end the current conflict.[0m

[1m> Finished chain.[0m
Question (G2322):

What will be the average price of a gallon of gasoline in the US for the week ending 4 July 2022? {'A': 'Less than 3.000', 'B': 'Between 3.000 and 3.400, inclusive', 'C': 'More than 3.400 but less than 3.800', 'D': 'Between 3.800 and 4.200, inclusive'

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mI now know the final answer.

Final Answer:

As of August 2021, there have been no reports of a NATO member state accusing Russia of using a chemical or biological weapon in Ukraine. However, tensions between Russia and Ukraine have been high, with Russia making claims about biological labs and chemical weapons in Ukraine. NATO Secretary General Jens Stoltenberg has called these claims "absurd." It remains to be seen if any NATO member state will make such an accusation before August 5, 2022.[0m

[1m> Finished chain.[0m
Question (G2334):

Between 1 April 2022 and 30 September 2022, will lethal confrontations between the national military forces, militia, and/or law enforcement personnel (forces) of Armenia and Azerbaijan result in 100 or more fatalities? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mThought: I need to gather information on the current situation between Armenia and Azerbaijan.

Action: Search

Action Input: "Armenia Azerbaijan c

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mThought: I need to gather more information on the recent developments in the Armenia-Azerbaijan conflict.

Action: Search

Action Input: "Armenia Azerbaijan conflict recent developments"
[0m

Observation:[36;1m[1;3mWar erupted between Armenia and Azerbaijan over the region, resulting in roughly thirty thousand casualties and creating hundreds of thousands ...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

The conflict between Armenia and Azerbaijan over the Nagorno-Karabakh region has been ongoing for decades, with a recent flare-up in 2020 resulting in a ceasefire agreement brokered by Russia. While tensions remain high and sporadic clashes have occurred, it is difficult to predict whether lethal confrontations resulting in 100 or more fatalities will occur between the national military forces, militia, and/or law enforcement personnel of Armenia and Azerbaijan between 1 April 2022 and 30 September 2022.[0m

[1m> Finished chain.[0m
Question (

[32;1m[1;3mI now know the final answer.

Final Answer:

The US Strategic Petroleum Reserve (SPR) is a stockpile of crude oil maintained by the US government as a safeguard against oil supply disruptions. As of September 2021, the SPR held approximately 621 million barrels of crude oil. The US Department of Energy is responsible for managing the SPR and regularly releases reports on its inventory levels. However, it is difficult to predict the exact ending stocks of crude oil in the SPR in December 2022 as it depends on various factors such as global oil demand, supply disruptions, and government policies. Therefore, it is not possible to provide a definitive answer to this question.[0m

[1m> Finished chain.[0m
Question (G2370):

Between 29 April 2022 and 30 November 2022, will North Korea detonate a nuclear device? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

North Korea has a history of nuclear weapo

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mI now know the final answer.

Final Answer:

As of September 2022, there is no information available indicating that the United States plans to launch either an ICBM or a submarine-launched ballistic missile (SLBM) with an estimated range of at least 10,000 km between April 29 and December 1, 2022. However, the United States did launch an unarmed ballistic missile from Vandenberg Space Force Base in September 2022.[0m

[1m> Finished chain.[0m
Question (G2384):

Will Australia and Papua New Guinea (PNG) sign a bilateral defense treaty before 6 May 2023? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

Australia and Papua New Guinea (PNG) have a long-standing relationship, with Australia being one of PNG's largest aid donors and trading partners. In recent years, there have been discussions about a potential bilateral defense treaty between the two countries. However, there is currently no concre

[32;1m[1;3mThought: I need to gather information on Tesla's current position in the motor vehicle industry and their future plans.

Action: Search

Action Input: "Tesla motor vehicle production ranking", "Tesla future plans"
[0m

Observation:[36;1m[1;3mToday, Tesla delivers more than 200,000 vehicles each quarter, with plans to ramp up production significantly in the future.[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

Tesla is currently one of the largest motor vehicle producers in the world, delivering over 200,000 vehicles each quarter. While it is unclear if they will become the largest producer in the world prior to 2035, they do have plans to significantly ramp up production in the future. In addition, Tesla has been a major player in the electric vehicle market, which is expected to continue growing in the coming years.[0m

[1m> Finished chain.[0m
Question (M1634):

At any point before 01 January 2025, will the gross income of a US household at 

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mThought: I now know the final answer.

Final Answer:

According to the available information, it is unclear whether the gross income of a US household at the 80th percentile will be more than 6.12414 times that of a US household at the 20th percentile before 01 January 2025. However, income inequality in the United States has been increasing in recent decades, with the share of aggregate wealth going to upper-income families increasing from 60% to 79% from 1983 to 2016. The ratio of household income at the 80th percentile to that at the 20th percentile is a commonly used measure of income inequality.[0m

[1m> Finished chain.[0m
Question (M1663):

Will the USA's Labor Force Participation Rate be lower in 2023 than in 2018? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mI need to find information on the Labor Force Participation Rate in the USA for 2018 and 2023.

Action: Search
Action Input: "USA Labor Force Participation Rate 2018 vs 2023"
[0m


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mThought: I need to gather information on public health agencies and their accusations towards the Chinese government regarding COVID-19.

Action: Search

Action Input: "Public health agencies accuse Chinese government of misrepresenting COVID-19 numbers"
[0m

Observation:[36;1m[1;3mChina rejected the American intelligence community's conclusion that Beijing concealed the extent of the coronavirus epidemic.[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

In early 2020, the COVID-19 pandemic began to spread globally, with China being the first country to report cases. Since then, there have been questions about the accuracy of China's reporting of COVID-19 cases and deaths. While some public health agencies have criticized China's handling of the pandemic, there is no clear evidence that at least two public health agencies will publicly accuse the Chinese government of deliberately misrepresenting the number of COVID-19 infections before 2025. Howev

[0m

Observation:[36;1m[1;3mMy belief that the amount of geologically-derived crude oil consumed by the United States in 2035 will be greater than the amount consumed in 2015 is based ...[0m
[32;1m[1;3mThought: I need to gather more information on the factors that may affect the consumption of geologically-derived crude oil in the United States.

Action: Search

Action Input: "factors affecting geologically-derived crude oil consumption United States"
[0m

Observation:[36;1m[1;3mAssessment of the geological factors influence on the oil-productive of terrigenous reservoirs of the Vereyian horizon of the Melekess.[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

According to my research, it is difficult to predict with certainty whether the amount of geologically-derived crude oil consumed by the United States in 2035 will be greater than the amount consumed in 2015. However, there are several factors that may affect consumption, such as changes in technolo

[32;1m[1;3mAction: Search

Action Input: "S&P 500" "15% rise/fall" "2020s"
[0m

Observation:[36;1m[1;3mStifel's chief equity strategist says stocks have “likely locked in a weak 2020s decade,” which means investors should be cautious about any ...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

The S&P 500 is a stock market index that measures the stock performance of 500 large companies listed on stock exchanges in the United States. While it is difficult to predict with certainty whether the S&P 500 will rise or fall more than 15% in a single day during the 2020s, it is important to note that the chief equity strategist at Stifel has warned that the decade may be weak for stocks. Investors should exercise caution when making investment decisions.[0m

[1m> Finished chain.[0m
Question (M7738):

Will Twitter flag any tweet by the New York Times as misinformation before 2025? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mAction: Se

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mAction: Search
Action Input: "US core CPI inflation forecast 2022"[0m

Observation:[36;1m[1;3mCore Inflation Rate in the United States is expected to be 5.00 percent by the end of this quarter, according to Trading Economics global macro models and ...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

According to Trading Economics global macro models, the Core Inflation Rate in the United States is expected to be 5.00 percent by the end of the current quarter. While this does not directly answer whether US core CPI inflation will rise by more than 3% from December 2021 to December 2022, it does suggest that inflation is expected to remain high in the near future. It is important to note that inflation is a complex and dynamic economic phenomenon, influenced by a wide range of factors, and can be difficult to predict with certainty.[0m

[1m> Finished chain.[0m
Question (M7978):

Will a US Democratic President sign into law a $500 billion spending

[32;1m[1;3mThought: I need to find information on Brian Riedl, Alan Cole, and their bet on the interest rate paid on the federal debt.

Action: Search

Action Input: "Brian Riedl Alan Cole federal debt interest rate bet"
[0m

Observation:[36;1m[1;3mI'll be specific. I'm willing to bet you that the average interest rate paid on the federal debt exceeds 4.6% at some point within 30 years ...[0m
[32;1m[1;3mThought: I need to find more information on the bet between Brian Riedl and Alan Cole regarding the interest rate paid on the federal debt.

Action: Search

Action Input: "Brian Riedl Alan Cole federal debt interest rate bet"
[0m

Observation:[36;1m[1;3mI'll be specific. I'm willing to bet you that the average interest rate paid on the federal debt exceeds 4.6% at some point within 30 years ...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

Brian Riedl and Alan Cole made a bet on whether the average interest rate paid on the federal debt will exceed 4.

[32;1m[1;3mThought: I now know the final answer.

Final Answer:

As of August 2021, many countries are still struggling to vaccinate their populations due to various factors such as vaccine hesitancy, supply chain issues, and distribution challenges. However, it is difficult to predict with certainty which countries will have less than 10% of their population vaccinated with at least one dose of a COVID-19 vaccine on December 31, 2022. It will depend on the success of vaccination campaigns, government policies, and the emergence of new variants. Nonetheless, many countries have set ambitious vaccination goals and are working towards achieving them.[0m

[1m> Finished chain.[0m
Question (M8997):

Will WHO add another SARS-CoV-2 variant to their Variants of Concern in 2022? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mI now know the final answer.

Final Answer:

As of now, it is unclear whether WHO will add another SARS-CoV-2 variant to their Variants of Concern in 2022. However, it is important to note that the emergence of new variants is a continuous process and WHO regularly monitors and updates their list of Variants of Concern based on the available evidence. In September 2021, WHO added the Mu variant to their list of Variants of Interest, which means that it has genetic changes that may affect its transmissibility, severity, or immune escape properties, but further research is needed to confirm this. Therefore, it is possible that WHO may add another variant to their list in 2022 if it meets the criteria for a Variant of Concern or Interest.[0m

[1m> Finished chain.[0m
Question (M9119):

Will Holden win his Bet with Zvi about Omicron, conditional on one of them winning? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mAction: Search

Action Input: Holde

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mThought: I now know the final answer.

Final Answer:

As of now, there is no information available on the existence of a Tau variant of COVID-19. The World Health Organization (WHO) has identified four variants of concern (Alpha, Beta, Gamma, and Delta) and several variants of interest. However, it is difficult to predict the emergence of new variants and their impact on the pandemic. Scientists and health officials continue to monitor the situation closely and work towards developing effective vaccines and treatments. Therefore, it is not possible to determine whether there will be a Tau variant of COVID-19 by Tau Day (June 28, 2022).[0m

[1m> Finished chain.[0m
Question (M9734):

Will China's GDP Overtake the US Before 2030? {'A': 'yes', 'B': 'no'}


[1m> Entering new  chain...[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

China and the US are two of the largest economies in the world.
As of 2021, the US has the largest economy in the world w

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mThought: I need to search for information about anti-discrimination laws for genetically-edited or screened federal employees.

Action: Search

Action Input: "Anti-discrimination laws for genetically-edited federal employees"
[0m

Observation:[36;1m[1;3mThe Genetic Information Nondiscrimination Act (GINA) - PDF was signed into law on May 21, 2008. GINA protects individuals against discrimination based on their genetic information in health coverage and in employment.[0m
[32;1m[1;3mThought: I now know the final answer.

Final Answer:

The Genetic Information Nondiscrimination Act (GINA) was signed into law in 2008 to protect individuals against discrimination based on their genetic information in health coverage and employment. However, it is unclear whether there will be a specific anti-discrimination law enacted to protect U.S. federal employees who have been genetically-edited or screened as embryos by 2100.[0m

[1m> Finished chain.[0m
Question (M10028):

Will t

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='w210-openai.openai.azure.com', port=443): Read timed out. (read timeout=20.0).


[32;1m[1;3mThought: I now know the final answer.

Final Answer:

The US Supplemental Poverty Measure is a measure of poverty that takes into account additional expenses such as healthcare and housing costs. It is released annually by the US Census Bureau. While it is impossible to predict with certainty whether the measure will be higher in 2022 than 2021, there are a few factors to consider. The COVID-19 pandemic has had a significant impact on the economy and many people have lost their jobs or had reduced income. Additionally, inflation has been on the rise, which could also contribute to an increase in the poverty measure. However, there are also efforts being made to address poverty through government programs such as the American Rescue Plan. Ultimately, only time will tell whether the US Supplemental Poverty Measure will be higher in 2022 than 2021.[0m

[1m> Finished chain.[0m
Question (M10459):

Will the Federal Reserve ever adopt a policy regime that implements nominal GD

## Forecast

In [476]:
# Create forecast prompt template

forecast_template = """
You are going to act as an expert forecaster. Your job is to predict an answer to any question, \
even if knowing an answer is impossible. I will give you a question and a list of choices in the format: \
A - choice 1, B - choice 2, C - choice 3, and so on. I want you to select the best answer, based on what \
you know, and tell me the corresponding letter of that choice. It is okay if you do not know the answer. \
If you aren't sure, give me your best guess. You must never refuse to answer, even if you \
don't know the answer. The only valid response is a letter corresponding with one of the choices. \
Remember, never refuse to answer, you must provide a choice. Your choice should only be a letter for \
the best answer and no other text or punctuation. Literally only
give one uppercase letter as your response, nothing else. Your answer must be
only one uppercase character long.

Here is the question:


"""



In [732]:
class Forecaster:
    """
    Take in question (and context if not baseline) and forecast the answer
    """
    def __init__(self, baseline, df, prompt:str, deployment_name, model_name,
                 temperature=.3, max_tokens=20, request_timeout=20,
                 verbose=True, agent = "zero-shot-react-description"):
        self.baseline = baseline
        self.df = df.copy()
        self.deployment_name = deployment_name
        self.model_name = model_name
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.request_timeout = request_timeout
        self.prompt = prompt
        self.llm = AzureChatOpenAI(
            deployment_name=self.deployment_name,
            model_name=self.model_name,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            request_timeout=self.request_timeout
        )
        
    def run_question(self, ac_id:str):
        """Run a single question"""
        question = self.get_question(ac_id)
        if not self.baseline:
            context = self.get_context(ac_id)
            output = self.llm([
                            HumanMessage(content=self.prompt + "\n\n" + context + "\n\n" + question)
            ])
        else:
            output = self.llm([
                            HumanMessage(content=self.prompt + "\n\n" + question)
            ])
        answer = output.content
        return answer
    
    def get_context(self, ac_id:str):
        """pull context for a given question from method run_question"""
        ac_filterid = self.df[self.df['id'] == ac_id]
        context = ac_filterid['question_context'].values[0]
        print(f"\n\nContext: \n{context}")
        return context
        
    def get_question(self, ac_id:str):
        """generate question for a given question from method run_question"""
        # sample a question using question ID
        ac_filterid = self.df[self.df['id'] == ac_id]
        question = ac_filterid['question'].astype(str).values[0] + \
                            ' ' + \
                            ac_filterid['choices_clean'].astype(str).values[0]
        print(f"Question ({ac_id}):\n\n{question}\n\n")
        return question
    
    def run_all_questions(self):
        """Gather all questions in provided df, output in list"""
        self.answer_list = []
        for idx, row in self.df.iterrows():
            ac_id = row['id']
            answer = self.run_question(ac_id)
            self.answer_list.append(answer)
        return self.answer_list
    
    def concat_answers(self):
        self.df['LLM_answers'] = self.answer_list
        return self.df
        
    def evaluate_accuracy(self):
        accuracy = np.sum(self.df['LLM_answers'] == self.df['avg_forecast']) / self.df.shape[0]
        tf = self.df[self.df['qtype'] == 't/f']
        tf_accuracy = np.sum(tf['LLM_answers'] == tf['avg_forecast']) / tf.shape[0]
        mc = self.df[self.df['qtype'] == 'mc']
        mc_accuracy = np.sum(mc['LLM_answers'] == mc['avg_forecast']) / mc.shape[0]
        print("Accuracy: {} %".format(np.round(accuracy * 100, 2)))
        print("TF Accuracy: {} %".format(np.round(tf_accuracy * 100, 2)))
        print("MC Accuracy: {} %".format(np.round(mc_accuracy * 100, 2)))
        return accuracy, tf_accuracy, mc_accuracy
    

### Baseline model

In [723]:
# num_samples = 2
# df_sample = df.sample(num_samples, random_state = 42)
fc_all_baseline = Forecaster(baseline=True, df=all_context_df, prompt=forecast_template,
                deployment_name=OPENAI_AZURE_DEPLOYMENT_NAME,
                model_name=MODEL_NAME)

In [727]:
# run baseline model
all_forecasts_baseline = fc_all_baseline.run_all_questions()

In [763]:
# add answers to df
all_forecasts_baseline_df = fc_all_baseline.concat_answers()

In [842]:
# view output
displays(all_forecasts_baseline_df)

(98, 10)

Unnamed: 0,id,question,qtype,choices_clean,answer,avg_forecast,crowd,question_context,LLM_answers,LLM_answers_clean
400,G2124,Will there be a complex coordinated terrorist attack (CCTA) in the United States either directed or inspired by a foreign terrorist organization resulting in at least five fatalities before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-17 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 19:23:43.531000+00:00', 'forecast': 0.12}, {'timestamp': '2021-09-17 19:25:31.285000+00:00', 'forecast': 0.46}, {'timestamp': '2021-09-17 20:33:36.620000+00:00', 'forecast': 0.8}, {'timestamp': '2021-09-17 21:35:15.722000+00:00', 'forecast': 0.46}, {'timestamp': '2021-09-17 21:55:13.441000+00:00', 'forecast': 0.12}, {'timestamp': '2021-09-17 23:07:39.081000+00:00', 'forecast': 0.085}, {'timestamp': '2021-...","The United States remains in a heightened threat environment for terrorism, and the possibility of a complex coordinated terrorist attack (CCTA) cannot be ruled out. However, predicting the likelihood of such an attack before September 1, 2022 is difficult due to the evolving and dynamic nature of terrorist threats. It is important for law enforcement and intelligence agencies to remain vigilant and continue to monitor potential threats from foreign terrorist organizations targeting the Unit...",B,B
401,G2125,"Before 1 September 2022, will Egypt, Ethiopia, and Sudan sign an agreement governing the filling of the Grand Ethiopian Renaissance Dam (GERD) reservoir?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-17 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 18:26:37.819000+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 19:25:36.256000+00:00', 'forecast': 0.37}, {'timestamp': '2021-09-17 23:06:56.111000+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-18 00:43:48.134000+00:00', 'forecast': 0.5750000000000001}, {'timestamp': '2021-09-18 03:35:34.999000+00:00', 'forecast': 0.65}, {'timestamp': '2021-09-18 05:56:19.265000+00:00', 'forecast': 0.725}, {'times...","Negotiations between Egypt, Ethiopia, and Sudan regarding the Grand Ethiopian Renaissance Dam (GERD) have been ongoing for several years. In 2015, they signed the Agreement on Declaration of Principles, in which they committed to “cooperation, equitable and reasonable” water use. However, as of now, there is no clear indication whether they will sign an agreement governing the filling of the GERD reservoir before 1 September 2022. The dispute over the GERD is part of a long-standing feud bet...",A,A
402,G2141,"Before 1 January 2023, will the Taipei Economic and Cultural Representative Office officially change its name to include the word Taiwan?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-30 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-30 17:41:11.343000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 18:11:53.121000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 18:43:52.880000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 22:24:46.609000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 07:14:45.019000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 07:21:11.577000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 ...","The Taipei Economic and Cultural Representative Office (TECRO) is the de facto embassy of Taiwan in the United States. Currently, it is officially called the Taipei Economic and Cultural Representative Office in the United States (TECRO). There have been discussions and debates about changing the name to include the word Taiwan, but no official decision has been made yet. In March 2021, a bill was introduced in the US Congress to change the name to the Taiwan Representative Office, but it ha...",B,B
403,G2164,"Will there be a lethal confrontation between the national military forces, militia, and/or law enforcement personnel of India and the People's Republic of China before 1 July 2022?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-20 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-20 17:22:58.541000+00:00', 'forecast': 0.05}, {'timestamp': '2021-10-20 17:23:56.216000+00:00', 'forecast': 0.525}, {'timestamp': '2021-10-20 17:52:38.121000+00:00', 'forecast': 0.49}, {'timestamp': '2021-10-20 18:01:50.732000+00:00', 'forecast': 0.745}, {'timestamp': '2021-10-20 18:22:08.907000+00:00', 'forecast': 0.49}, {'timestamp': '2021-10-20 19:32:06.133000+00:00', 'forecast': 0.27}, {'timestamp': '202...","The India-China border conflict has been ongoing for several years, with occasional clashes between the two countries' military forces. In December 2020, there was a clash between Chinese and Indian troops along the contested border, resulting in casualties on both sides. While tensions remain high, it is impossible to predict with certainty whether there will be a lethal confrontation between the national military forces, militia, and/or law enforcement personnel of India and the People's R...",B,B
404,G2169,Will Taiwan publicly accuse the People's Republic of China of flying a military aircraft over the territory of and/or the territorial waters surrounding the main island of Taiwan without its permission before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-22 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-22 17:49:28.507000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-22 19:06:30.496000+00:00', 'forecast': 0.6000000000000001}, {'timestamp': '2021-10-22 21:50:45.274000+00:00', 'forecast': 0.9}, {'timestamp': '2021-10-22 22:17:05.311000+00:00', 'forecast': 0.775}, {'timestamp': '2021-10-23 00:48:15.854000+00:00', 'forecast': 0.8}, {'timestamp': '2021-10-23 04:06:26.745000+00:00', 'forecast': 0.8}, {'timesta...","In recent news, China has been sending fighter jets towards Taiwan and flying military aircraft into Taiwan's air defense zone. However, it is unclear if Taiwan has publicly accused China of flying a military aircraft over its territory or territorial waters without permission before September 1, 2022. The relationship between Taiwan and China has been tense for decades, with China claiming Taiwan as a part of its territory and Taiwan asserting its independence. The situation remains unpredi...",B,B


### Context Model

In [741]:
# Run forecasts with context
fc_all_context = Forecaster(baseline=False, df=all_context_df, prompt=forecast_template,
                deployment_name=OPENAI_AZURE_DEPLOYMENT_NAME,
                model_name=MODEL_NAME)

In [742]:
all_forecasts_context = fc_all_context.run_all_questions()

Question (G2124):

Will there be a complex coordinated terrorist attack (CCTA) in the United States either directed or inspired by a foreign terrorist organization resulting in at least five fatalities before 1 September 2022? {'A': 'yes', 'B': 'no'}




Context: 
The United States remains in a heightened threat environment for terrorism, and the possibility of a complex coordinated terrorist attack (CCTA) cannot be ruled out. However, predicting the likelihood of such an attack before September 1, 2022 is difficult due to the evolving and dynamic nature of terrorist threats. It is important for law enforcement and intelligence agencies to remain vigilant and continue to monitor potential threats from foreign terrorist organizations targeting the United States.
Question (G2125):

Before 1 September 2022, will Egypt, Ethiopia, and Sudan sign an agreement governing the filling of the Grand Ethiopian Renaissance Dam (GERD) reservoir? {'A': 'yes', 'B': 'no'}




Context: 
Negotiations betw

Question (G2244):

Before 1 January 2023, will the US officially rejoin the JCPOA? {'A': 'yes', 'B': 'no'}




Context: 
The JCPOA, or Joint Comprehensive Plan of Action, is an agreement reached in 2015 between Iran and six other countries (including the US) aimed at limiting Iran's nuclear program in exchange for lifting economic sanctions. In May 2018, the US withdrew from the JCPOA under the Trump administration. Since then, there have been discussions about the US potentially rejoining the agreement. However, as of now, it is unclear whether the US will officially rejoin the JCPOA before 1 January 2023.
Question (G2249):

Before 1 January 2023, will the Nord Stream 2 pipeline begin delivering natural gas to Germany? {'A': 'yes', 'B': 'no'}




Context: 
The Nord Stream 2 pipeline was completed in September 2021, but has not yet entered service. There is political controversy surrounding the pipeline due to concerns that Russia may use it for geopolitical advantage with Europe and U

Question (G2314):

Between 11 March 2022 and 9 September 2022, will Russia publicly announce that it has moved nuclear weapons into either Belarus or Ukraine? {'A': 'yes', 'B': 'no'}




Context: 
In recent news, Russia has signed a deal with Belarus regarding the storage of nuclear weapons. However, there is no current information indicating that Russia has publicly announced a movement of nuclear weapons into either Belarus or Ukraine between 11 March 2022 and 9 September 2022.
Question (G2319):

When will Russia and Ukraine sign or announce an agreement to end the current conflict in Ukraine? {'A': 'Before 1 June 2022', 'B': 'Between 1 June 2022 and 31 July 2022', 'C': 'Between 1 August 2022 and 30 September 2022', 'D': 'Between 1 October 2022 and 30 November 2022', 'E': 'Not before 1 December 2022'}




Context: 
The conflict in Ukraine began in 2014 when Russia annexed Crimea and has since escalated into a full-blown war. There have been multiple attempts to negotiate a peace agre

Question (G2362):

What will be the ending stocks of crude oil in the US Strategic Petroleum Reserve (SPR) in December 2022? {'A': 'Less than 340,000 thousand barrels', 'B': 'Between 340,000 thousand barrels and 400,000 thousand barrels, inclusive', 'C': 'More than 400,000 thousand barrels but less than 460,000 thousand barrels', 'D': 'Between 460,000 thousand barrels and 520,000 thousand barrels, inclusive', 'E': 'More than 520,000 thousand barrels but less than 580,000 thousand barrels', 'F': '580,000 thousand barrels or more'}




Context: 
The US Strategic Petroleum Reserve (SPR) is a stockpile of crude oil maintained by the US government as a safeguard against oil supply disruptions. As of September 2021, the SPR held approximately 621 million barrels of crude oil. The US Department of Energy is responsible for managing the SPR and regularly releases reports on its inventory levels. However, it is difficult to predict the exact ending stocks of crude oil in the SPR in December 202

Question (M980):

Will Tesla become the world's largest motor vehicle producer in some calendar year prior to 2035? {'A': 'yes', 'B': 'no'}




Context: 
Tesla is currently one of the largest motor vehicle producers in the world, delivering over 200,000 vehicles each quarter. While it is unclear if they will become the largest producer in the world prior to 2035, they do have plans to significantly ramp up production in the future. In addition, Tesla has been a major player in the electric vehicle market, which is expected to continue growing in the coming years.
Question (M1634):

At any point before 01 January 2025, will the gross income of a US household at the 80th percentile be more than 6.12414 times that of a US household at the 20th percentile; or, in other words, will this specific measurement of household income inequality increase by at least 20%? {'A': 'yes', 'B': 'no'}




Context: 
According to the available information, it is unclear whether the gross income of a US hous

Question (M5174):

If Joe Biden is elected president of the US in 2020, will the highest tax bracket be restored to its original 39.6% or higher before 2025? {'A': 'yes', 'B': 'no'}




Context: 
Under the Biden administration's proposed tax plan, the highest tax bracket would be restored to its original rate of 39.6%, up from the current rate of 37%. This change would apply to single filers making more than $400,000 and married couples with income exceeding $450,000 per year. Additionally, the administration has proposed taxing capital gains at the new top marginal income tax rate of 39.6% (plus the 5% Net Investment Income Tax) for taxpayers whose income exceeds $1 million.
Question (M5716):

Longbets series: will the amount of geologically-derived crude oil consumed by the United States in 2035 be greater than the amount consumed in 2015? {'A': 'yes', 'B': 'no'}




Context: 
According to my research, it is difficult to predict with certainty whether the amount of geologically-deriv

Question (M8299):

Will the Fed decrease the size of its accumulated asset portfolio below $8 trillion by 2023? {'A': 'yes', 'B': 'no'}




Context: 
The Federal Reserve's asset portfolio has been a topic of discussion in recent years. As of September 2021, the total assets of the Federal Reserve were over $8.3 trillion. While there have been discussions about potentially decreasing the size of the portfolio, there is no clear plan or timeline for doing so. Therefore, it is difficult to predict whether the Fed will decrease the size of its accumulated asset portfolio below $8 trillion by 2023.
Question (M8313):

Will Sam Bankman-Fried be the richest person in the world by 2050? {'A': 'yes', 'B': 'no'}




Context: 
Sam Bankman-Fried is a billionaire and the founder of FTX, a cryptocurrency exchange. He is known for his philanthropy and has pledged to donate a large portion of his wealth to effective charities. However, predicting who will be the richest person in the world by 2050 is d

Question (M8968):

Will the Federal Reserve set a target policy rate that is negative by 2050? {'A': 'yes', 'B': 'no'}




Context: 
The Federal Reserve is the central bank of the United States and is responsible for setting monetary policy. The policy rate is the interest rate at which banks can borrow money from the Federal Reserve. Negative interest rates occur when the policy rate is below zero, meaning that banks are charged to hold reserves at the central bank. 

As of now, the Federal Reserve has not indicated any plans to set a negative policy rate. In fact, the Federal Reserve has previously stated that it does not see negative interest rates as an effective tool for stimulating economic growth. However, it is difficult to predict what the policy rate will be in 2050 as it will depend on a variety of economic factors and events that may occur in the future. 

Overall, it is currently unlikely that the Federal Reserve will set a negative policy rate by 2050, but it is impossibl

Question (M10459):

Will the Federal Reserve ever adopt a policy regime that implements nominal GDP targeting or nominal wage targeting? {'A': 'yes', 'B': 'no'}




Context: 
The Federal Reserve is the central bank of the United States and is responsible for implementing monetary policy. Nominal GDP targeting is a policy regime where the central bank targets a specific level of nominal GDP growth, while nominal wage targeting is a policy regime where the central bank targets a specific level of nominal wage growth. 

There has been ongoing debate among economists and policymakers about the effectiveness of these policy regimes, and whether the Federal Reserve should adopt them. Some argue that nominal GDP targeting could help stabilize the economy and prevent recessions, while others argue that it could be difficult to implement in practice. Similarly, some argue that nominal wage targeting could help reduce income inequality and boost wage growth, while others argue that it could lead

In [743]:
all_forecasts_context_df = fc_all_context.concat_answers()

In [744]:
displays(all_forecasts_context_df, 20)

(98, 9)

Unnamed: 0,id,question,qtype,choices_clean,answer,avg_forecast,crowd,question_context,LLM_answers
400,G2124,Will there be a complex coordinated terrorist attack (CCTA) in the United States either directed or inspired by a foreign terrorist organization resulting in at least five fatalities before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-17 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 19:23:43.531000+00:00', 'forecast': 0.12}, {'timestamp': '2021-09-17 19:25:31.285000+00:00', 'forecast': 0.46}, {'timestamp': '2021-09-17 20:33:36.620000+00:00', 'forecast': 0.8}, {'timestamp': '2021-09-17 21:35:15.722000+00:00', 'forecast': 0.46}, {'timestamp': '2021-09-17 21:55:13.441000+00:00', 'forecast': 0.12}, {'timestamp': '2021-09-17 23:07:39.081000+00:00', 'forecast': 0.085}, {'timestamp': '2021-...","The United States remains in a heightened threat environment for terrorism, and the possibility of a complex coordinated terrorist attack (CCTA) cannot be ruled out. However, predicting the likelihood of such an attack before September 1, 2022 is difficult due to the evolving and dynamic nature of terrorist threats. It is important for law enforcement and intelligence agencies to remain vigilant and continue to monitor potential threats from foreign terrorist organizations targeting the Unit...",B
401,G2125,"Before 1 September 2022, will Egypt, Ethiopia, and Sudan sign an agreement governing the filling of the Grand Ethiopian Renaissance Dam (GERD) reservoir?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-17 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 18:26:37.819000+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 19:25:36.256000+00:00', 'forecast': 0.37}, {'timestamp': '2021-09-17 23:06:56.111000+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-18 00:43:48.134000+00:00', 'forecast': 0.5750000000000001}, {'timestamp': '2021-09-18 03:35:34.999000+00:00', 'forecast': 0.65}, {'timestamp': '2021-09-18 05:56:19.265000+00:00', 'forecast': 0.725}, {'times...","Negotiations between Egypt, Ethiopia, and Sudan regarding the Grand Ethiopian Renaissance Dam (GERD) have been ongoing for several years. In 2015, they signed the Agreement on Declaration of Principles, in which they committed to “cooperation, equitable and reasonable” water use. However, as of now, there is no clear indication whether they will sign an agreement governing the filling of the GERD reservoir before 1 September 2022. The dispute over the GERD is part of a long-standing feud bet...",B
402,G2141,"Before 1 January 2023, will the Taipei Economic and Cultural Representative Office officially change its name to include the word Taiwan?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-30 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-30 17:41:11.343000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 18:11:53.121000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 18:43:52.880000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 22:24:46.609000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 07:14:45.019000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 07:21:11.577000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 ...","The Taipei Economic and Cultural Representative Office (TECRO) is the de facto embassy of Taiwan in the United States. Currently, it is officially called the Taipei Economic and Cultural Representative Office in the United States (TECRO). There have been discussions and debates about changing the name to include the word Taiwan, but no official decision has been made yet. In March 2021, a bill was introduced in the US Congress to change the name to the Taiwan Representative Office, but it ha...",A
403,G2164,"Will there be a lethal confrontation between the national military forces, militia, and/or law enforcement personnel of India and the People's Republic of China before 1 July 2022?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-20 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-20 17:22:58.541000+00:00', 'forecast': 0.05}, {'timestamp': '2021-10-20 17:23:56.216000+00:00', 'forecast': 0.525}, {'timestamp': '2021-10-20 17:52:38.121000+00:00', 'forecast': 0.49}, {'timestamp': '2021-10-20 18:01:50.732000+00:00', 'forecast': 0.745}, {'timestamp': '2021-10-20 18:22:08.907000+00:00', 'forecast': 0.49}, {'timestamp': '2021-10-20 19:32:06.133000+00:00', 'forecast': 0.27}, {'timestamp': '202...","The India-China border conflict has been ongoing for several years, with occasional clashes between the two countries' military forces. In December 2020, there was a clash between Chinese and Indian troops along the contested border, resulting in casualties on both sides. While tensions remain high, it is impossible to predict with certainty whether there will be a lethal confrontation between the national military forces, militia, and/or law enforcement personnel of India and the People's R...",B
404,G2169,Will Taiwan publicly accuse the People's Republic of China of flying a military aircraft over the territory of and/or the territorial waters surrounding the main island of Taiwan without its permission before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-22 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-22 17:49:28.507000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-22 19:06:30.496000+00:00', 'forecast': 0.6000000000000001}, {'timestamp': '2021-10-22 21:50:45.274000+00:00', 'forecast': 0.9}, {'timestamp': '2021-10-22 22:17:05.311000+00:00', 'forecast': 0.775}, {'timestamp': '2021-10-23 00:48:15.854000+00:00', 'forecast': 0.8}, {'timestamp': '2021-10-23 04:06:26.745000+00:00', 'forecast': 0.8}, {'timesta...","In recent news, China has been sending fighter jets towards Taiwan and flying military aircraft into Taiwan's air defense zone. However, it is unclear if Taiwan has publicly accused China of flying a military aircraft over its territory or territorial waters without permission before September 1, 2022. The relationship between Taiwan and China has been tense for decades, with China claiming Taiwan as a part of its territory and Taiwan asserting its independence. The situation remains unpredi...",B
405,G2171,"Between 30 October 2021 and 31 December 2022, will the Council of the European Union impose new restrictive measures (sanctions) on China over human rights violations and abuses in Xinjiang?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-29 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-29 17:08:22.020000+00:00', 'forecast': 0.14}, {'timestamp': '2021-10-29 19:39:03.259000+00:00', 'forecast': 0.105}, {'timestamp': '2021-10-29 21:16:57.851000+00:00', 'forecast': 0.14}, {'timestamp': '2021-10-30 07:53:59.731000+00:00', 'forecast': 0.105}, {'timestamp': '2021-10-30 12:58:35.807000+00:00', 'forecast': 0.07}, {'timestamp': '2021-10-30 21:18:51.267000+00:00', 'forecast': 0.06}, {'timestamp': '202...","The EU has previously imposed human rights sanctions on Chinese officials involved in abuses in Xinjiang, and there have been discussions of potential new sanctions in response to continued human rights violations. However, it is unclear if the Council of the European Union will impose new restrictive measures between 30 October 2021 and 31 December 2022. Human rights dialogues between the EU and China have been suspended since 2019, and there is ongoing tension between the two entities rega...",A
407,G2176,"Before 1 January 2023, will China and/or a host country officially announce an agreement for the establishment of a Chinese military base in an African country besides Djibouti?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-29 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-29 20:44:50.280000+00:00', 'forecast': 0.25}, {'timestamp': '2021-10-30 04:45:42.001000+00:00', 'forecast': 0.165}, {'timestamp': '2021-10-30 09:40:37.449000+00:00', 'forecast': 0.08}, {'timestamp': '2021-10-30 13:01:04.970000+00:00', 'forecast': 0.065}, {'timestamp': '2021-10-30 16:55:54.506000+00:00', 'forecast': 0.05}, {'timestamp': '2021-10-30 20:25:07.831000+00:00', 'forecast': 0.065}, {'timestamp': '20...","China has been increasing its presence in Africa in recent years,\nincluding establishing a military base in Djibouti in 2017.\nThere have been reports of China seeking to establish additional\nmilitary bases in Africa, but as of yet, no official announcements\nhave been made regarding such agreements. It is difficult to predict\nwhether such an announcement will be made before 1 January 2023, as\nit depends on a variety of factors including political and economic\nrelations between China an...",B
409,G2201,"Before 1 January 2023, will Taiwan publicly accuse the People's Republic of China of landing military personnel on the Pratas Islands without authorization?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-11-19 22:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-11-20 00:57:30.135000+00:00', 'forecast': 0.11}, {'timestamp': '2021-11-20 03:32:43.391000+00:00', 'forecast': 0.06}, {'timestamp': '2021-11-20 05:07:33.165000+00:00', 'forecast': 0.09}, {'timestamp': '2021-11-20 08:23:00.148000+00:00', 'forecast': 0.05}, {'timestamp': '2021-11-20 08:34:02.648000+00:00', 'forecast': 0.09}, {'timestamp': '2021-11-20 09:52:27.073000+00:00', 'forecast': 0.05}, {'timestamp': '2021-...","The Pratas Islands, also known as the Dongsha Islands, are a group of atolls in the northern part of the South China Sea. The islands are currently administered by Taiwan, but are also claimed by China. Tensions between Taiwan and China have been high in recent years, with China increasing its military presence in the region. In March 2021, Taiwan reported that 20 Chinese military aircraft had entered its air defense identification zone. In the past, Taiwan has accused China of landing milit...",B
412,G2238,What will be the FAO Food Price Index for June 2022?,mc,"{'A': 'Less than 120.0', 'B': 'Between 120.0 and 130.0, inclusive', 'C': 'More than 130.0 but less than 140.0', 'D': 'Between 140.0 and 150.0, inclusive', 'E': 'More than 150.0'}",,C,"[{'timestamp': '2022-01-07 18:00:00+00:00', 'forecast': [0.2, 0.2, 0.2, 0.2, 0.2]}, {'timestamp': '2022-01-07 18:26:12.988000+00:00', 'forecast': [0.06, 0.395, 0.4, 0.135, 0.01]}, {'timestamp': '2022-01-07 18:53:08.092000+00:00', 'forecast': [0.08, 0.36, 0.39, 0.16, 0.01]}, {'timestamp': '2022-01-07 19:50:57.487000+00:00', 'forecast': [0.0, 0.37, 0.39, 0.21, 0.03]}, {'timestamp': '2022-01-07 20:28:23.693000+00:00', 'forecast': [0.0, 0.37, 0.36, 0.21, 0.06]}, {'timestamp': '2022-01-07 20:53:3...","The FAO Food Price Index is a measure of the monthly change in international prices of a basket of food commodities. It is calculated by the Food and Agriculture Organization of the United Nations (FAO). The index is based on the average of five commodity group price indices (cereals, vegetable oils, dairy, meat, and sugar). The June 2022 index has not yet been released, as it is a future date. Therefore, it is not possible to predict what the index will be for that month.",F - It is not possible to predict the FAO Food Price Index for June 2022.
413,G2240,Will the powers of the government of Myanmar cease to be held by the military before 1 October 2022?,t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2022-01-07 18:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2022-01-07 18:30:23.340000+00:00', 'forecast': 0.02}, {'timestamp': '2022-01-07 18:36:28.716000+00:00', 'forecast': 0.005}, {'timestamp': '2022-01-07 19:04:31.630000+00:00', 'forecast': 0.0}, {'timestamp': '2022-01-07 19:50:10.672000+00:00', 'forecast': 0.0}, {'timestamp': '2022-01-07 20:44:41.543000+00:00', 'forecast': 0.0}, {'timestamp': '2022-01-07 21:39:50.466000+00:00', 'forecast': 0.04}, {'timestamp': '2022-01...","As of current information, it is unlikely that the powers of the government of Myanmar will cease to be held by the military before 1 October 2022. The military regime has extended its emergency rule until 2023, and the country remains riven by internal fighting and struggles for political control.",B


In [829]:
# clean forecast answers
def clean_forecasts(row):
    answer = row['LLM_answers']
    if pd.notna(answer) and isinstance(answer, str):
        answer = answer.upper()
        len_choices = len(row['choices_clean'])
        letters = [chr(ord('A') + i) for i in range(len_choices)]
        # if one-letter answer satisfactory
        if len(answer) == 1 and answer in letters:
            return answer
        # if long answer satisfactory
        if len(answer) > 1 and answer[0] in letters:
            return answer[0].upper()
        # anything else
        return np.nan
all_forecasts_context_df['LLM_answers_clean'] = all_forecasts_context_df.apply(clean_forecasts, axis=1)
all_forecasts_baseline_df['LLM_answers_clean'] = all_forecasts_baseline_df.apply(clean_forecasts, axis=1)

In [843]:
displays(all_forecasts_context_df)

(98, 10)

Unnamed: 0,id,question,qtype,choices_clean,answer,avg_forecast,crowd,question_context,LLM_answers,LLM_answers_clean
400,G2124,Will there be a complex coordinated terrorist attack (CCTA) in the United States either directed or inspired by a foreign terrorist organization resulting in at least five fatalities before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-17 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 19:23:43.531000+00:00', 'forecast': 0.12}, {'timestamp': '2021-09-17 19:25:31.285000+00:00', 'forecast': 0.46}, {'timestamp': '2021-09-17 20:33:36.620000+00:00', 'forecast': 0.8}, {'timestamp': '2021-09-17 21:35:15.722000+00:00', 'forecast': 0.46}, {'timestamp': '2021-09-17 21:55:13.441000+00:00', 'forecast': 0.12}, {'timestamp': '2021-09-17 23:07:39.081000+00:00', 'forecast': 0.085}, {'timestamp': '2021-...","The United States remains in a heightened threat environment for terrorism, and the possibility of a complex coordinated terrorist attack (CCTA) cannot be ruled out. However, predicting the likelihood of such an attack before September 1, 2022 is difficult due to the evolving and dynamic nature of terrorist threats. It is important for law enforcement and intelligence agencies to remain vigilant and continue to monitor potential threats from foreign terrorist organizations targeting the Unit...",B,B
401,G2125,"Before 1 September 2022, will Egypt, Ethiopia, and Sudan sign an agreement governing the filling of the Grand Ethiopian Renaissance Dam (GERD) reservoir?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-17 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 18:26:37.819000+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-17 19:25:36.256000+00:00', 'forecast': 0.37}, {'timestamp': '2021-09-17 23:06:56.111000+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-18 00:43:48.134000+00:00', 'forecast': 0.5750000000000001}, {'timestamp': '2021-09-18 03:35:34.999000+00:00', 'forecast': 0.65}, {'timestamp': '2021-09-18 05:56:19.265000+00:00', 'forecast': 0.725}, {'times...","Negotiations between Egypt, Ethiopia, and Sudan regarding the Grand Ethiopian Renaissance Dam (GERD) have been ongoing for several years. In 2015, they signed the Agreement on Declaration of Principles, in which they committed to “cooperation, equitable and reasonable” water use. However, as of now, there is no clear indication whether they will sign an agreement governing the filling of the GERD reservoir before 1 September 2022. The dispute over the GERD is part of a long-standing feud bet...",B,B
402,G2141,"Before 1 January 2023, will the Taipei Economic and Cultural Representative Office officially change its name to include the word Taiwan?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-09-30 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-09-30 17:41:11.343000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 18:11:53.121000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 18:43:52.880000+00:00', 'forecast': 1.0}, {'timestamp': '2021-09-30 22:24:46.609000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 07:14:45.019000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 07:21:11.577000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-01 ...","The Taipei Economic and Cultural Representative Office (TECRO) is the de facto embassy of Taiwan in the United States. Currently, it is officially called the Taipei Economic and Cultural Representative Office in the United States (TECRO). There have been discussions and debates about changing the name to include the word Taiwan, but no official decision has been made yet. In March 2021, a bill was introduced in the US Congress to change the name to the Taiwan Representative Office, but it ha...",A,A
403,G2164,"Will there be a lethal confrontation between the national military forces, militia, and/or law enforcement personnel of India and the People's Republic of China before 1 July 2022?",t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-20 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-20 17:22:58.541000+00:00', 'forecast': 0.05}, {'timestamp': '2021-10-20 17:23:56.216000+00:00', 'forecast': 0.525}, {'timestamp': '2021-10-20 17:52:38.121000+00:00', 'forecast': 0.49}, {'timestamp': '2021-10-20 18:01:50.732000+00:00', 'forecast': 0.745}, {'timestamp': '2021-10-20 18:22:08.907000+00:00', 'forecast': 0.49}, {'timestamp': '2021-10-20 19:32:06.133000+00:00', 'forecast': 0.27}, {'timestamp': '202...","The India-China border conflict has been ongoing for several years, with occasional clashes between the two countries' military forces. In December 2020, there was a clash between Chinese and Indian troops along the contested border, resulting in casualties on both sides. While tensions remain high, it is impossible to predict with certainty whether there will be a lethal confrontation between the national military forces, militia, and/or law enforcement personnel of India and the People's R...",B,B
404,G2169,Will Taiwan publicly accuse the People's Republic of China of flying a military aircraft over the territory of and/or the territorial waters surrounding the main island of Taiwan without its permission before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",,B,"[{'timestamp': '2021-10-22 17:00:00+00:00', 'forecast': 0.5}, {'timestamp': '2021-10-22 17:49:28.507000+00:00', 'forecast': 1.0}, {'timestamp': '2021-10-22 19:06:30.496000+00:00', 'forecast': 0.6000000000000001}, {'timestamp': '2021-10-22 21:50:45.274000+00:00', 'forecast': 0.9}, {'timestamp': '2021-10-22 22:17:05.311000+00:00', 'forecast': 0.775}, {'timestamp': '2021-10-23 00:48:15.854000+00:00', 'forecast': 0.8}, {'timestamp': '2021-10-23 04:06:26.745000+00:00', 'forecast': 0.8}, {'timesta...","In recent news, China has been sending fighter jets towards Taiwan and flying military aircraft into Taiwan's air defense zone. However, it is unclear if Taiwan has publicly accused China of flying a military aircraft over its territory or territorial waters without permission before September 1, 2022. The relationship between Taiwan and China has been tense for decades, with China claiming Taiwan as a part of its territory and Taiwan asserting its independence. The situation remains unpredi...",B,B


## Coco's forecasts

Testing Coco's as well--Coco and my parsed crowd forecasts have slightly different results.

In [670]:
# structure coco's forecasts
def clean_coco_forecast(item):
    """lambda function to convert coco's aggregated forecasts to format of original 'crowd' column"""
    forecast_list = []
    forecast_dict = {"forecast": item}
    forecast_list.append(forecast_dict)
    return str(forecast_list)

In [693]:
# coco's calculated crowd forecast predictions
ac_coco = pd.read_csv("Data/Autocast/filtered_events_20230709.csv")
ac_coco_cols = ["id", "question", "qtype", "status", "choices", "answer"]
ac_coco_clean = ac_coco.loc[:,ac_coco_cols]
ac_coco_clean['crowd'] = ac_coco.loc[:,'avg_pred'].apply(lambda x: str([{"forecast": x}]))
ac_coco_clean.to_csv("Data/Autocast/filtered_events_20230709_coco_clean.csv", index=False)

In [695]:
displays(ac_coco_clean)

(1232, 7)

Unnamed: 0,id,question,qtype,status,choices,answer,crowd
0,G7,Will Iran release Jason Rezaian before 31 October 2016?,t/f,Resolved,"['yes', 'no']",yes,[{'forecast': 0.6094309927360778}]
1,G8,"Will North Korea launch a land based missile with the capacity to reach Alaska, Hawaii, or the continental United States before 1 January 2017?",t/f,Resolved,"['yes', 'no']",yes,[{'forecast': 0.3293603133159254}]
2,G11,Will Congress pass a resolution disapproving the Joint Comprehensive Plan of Action?,mc,Resolved,"['No', 'Yes, but the resolution will be vetoed by the President and the veto will stand', 'Yes, and the resolution will become law']",A,[{'forecast': 0.3333332233223311}]
3,G15,Will Bashar al-Assad cease to be President of Syria before 1 March 2017?,t/f,Resolved,"['yes', 'no']",no,[{'forecast': 0.1684135667396055}]
4,G17,Will Iran's President Hassan Rouhani meet Saudi Arabia's King Salman bin Abdulaziz Al Saud before 1 September 2016?,t/f,Resolved,"['yes', 'no']",no,[{'forecast': 0.0377990430622009}]


In [696]:
# run Coco's avg_forecast through AutocastBuilder()
data_path_coco = "Data/Autocast/filtered_events_20230709_coco_clean.csv"

ab_coco = AutocastBuilder(data_path=data_path_coco)
ab_coco_df = ab_coco.data

  return mode(answer_choices, keepdims=True)[0][0][0]


In [758]:
# # testing MC answers
# def tmp(row):
#     #print(row.name)
#     """returns aggregate forecasted answer from all forecasts"""
#     crowd = ast.literal_eval(row['crowd'])
#     print(row['id'])
#     t_f = True if (len(row['choices_clean']) == 2) \
#         and (row['choices_clean']['A'] == 'yes') \
#         and (row['choices_clean']['B'] == 'no') else False
# #     chr(ord('A') + int(len(row['choices_clean'])))
#     if not t_f:
#         forecasts = [[np.nan] if isinstance(forecast['forecast'], list) else forecast['forecast'] for forecast in crowd]
# #         forecasts = [forecast if len(forecast) == 1 else np.nan for forecast in forecasts]
#         print(forecasts)
#         print("\n\n")
#     #     single_forecast = [forecast if pd.notna(forecast) else "X" for forecast in forecasts]
#         #print(f"n_choices: {n_choices}")
#     #     forecasted_answer = AutocastBuilder.get_mode_answer(forecasts, n_choices, t_f=t_f)
#     #     return single_forecast
#     else:
#         pass
# all_forecasts_df[all_forecasts_df['qtype'] == 'mc'].apply(tmp, axis=1)

## Evaluation

In [830]:
# Accuracy metrics
# Note: all_forecasts_baseline_df has shifted mc answers, so only using all_forecasts_context_df's avg_forecasts
all_answers_df = all_forecasts_context_df[[
    "id", "question", "qtype", "choices_clean", "avg_forecast", "LLM_answers_clean"
]].rename(columns={"LLM_answers_clean": "LLM_answers_context"})\
.merge(
    all_forecasts_baseline_df[[
        "id", "LLM_answers_clean"
    ]].rename(columns={"LLM_answers_clean": "LLM_answers_baseline"}),
    how='inner', on=["id"]
)\
.merge(
    ab_coco_df[[
        "id", "avg_forecast"
    ]].rename(columns={"avg_forecast": "avg_forecast_coco"}),
    how='inner', on=["id"]
)[[
    "id", "question", "qtype", "choices_clean", 
    "avg_forecast", "avg_forecast_coco", "LLM_answers_baseline", "LLM_answers_context"
]]

In [844]:
# display what answers look like
display(all_answers_df)

Unnamed: 0,id,question,qtype,choices_clean,avg_forecast,avg_forecast_coco,LLM_answers_baseline,LLM_answers_context
0,G2124,Will there be a complex coordinated terrorist attack (CCTA) in the United States either directed or inspired by a foreign terrorist organization resulting in at least five fatalities before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",B,B,B,B
1,G2125,"Before 1 September 2022, will Egypt, Ethiopia, and Sudan sign an agreement governing the filling of the Grand Ethiopian Renaissance Dam (GERD) reservoir?",t/f,"{'A': 'yes', 'B': 'no'}",B,B,A,B
2,G2141,"Before 1 January 2023, will the Taipei Economic and Cultural Representative Office officially change its name to include the word Taiwan?",t/f,"{'A': 'yes', 'B': 'no'}",B,B,B,A
3,G2164,"Will there be a lethal confrontation between the national military forces, militia, and/or law enforcement personnel of India and the People's Republic of China before 1 July 2022?",t/f,"{'A': 'yes', 'B': 'no'}",B,B,B,B
4,G2169,Will Taiwan publicly accuse the People's Republic of China of flying a military aircraft over the territory of and/or the territorial waters surrounding the main island of Taiwan without its permission before 1 September 2022?,t/f,"{'A': 'yes', 'B': 'no'}",B,B,B,B
5,G2171,"Between 30 October 2021 and 31 December 2022, will the Council of the European Union impose new restrictive measures (sanctions) on China over human rights violations and abuses in Xinjiang?",t/f,"{'A': 'yes', 'B': 'no'}",B,B,A,A
6,G2176,"Before 1 January 2023, will China and/or a host country officially announce an agreement for the establishment of a Chinese military base in an African country besides Djibouti?",t/f,"{'A': 'yes', 'B': 'no'}",B,B,B,B
7,G2201,"Before 1 January 2023, will Taiwan publicly accuse the People's Republic of China of landing military personnel on the Pratas Islands without authorization?",t/f,"{'A': 'yes', 'B': 'no'}",B,B,A,B
8,G2238,What will be the FAO Food Price Index for June 2022?,mc,"{'A': 'Less than 120.0', 'B': 'Between 120.0 and 130.0, inclusive', 'C': 'More than 130.0 but less than 140.0', 'D': 'Between 140.0 and 150.0, inclusive', 'E': 'More than 150.0'}",C,B,C,
9,G2240,Will the powers of the government of Myanmar cease to be held by the military before 1 October 2022?,t/f,"{'A': 'yes', 'B': 'no'}",B,B,B,B


In [833]:
def evaluate_accuracy(df, crowd_forecast, llm_forecast, name):
    df = df.copy()
    accuracy = np.sum(df[llm_forecast] == df[crowd_forecast]) / df.shape[0]
    tf = df[df['qtype'] == 't/f']
    tf_accuracy = np.sum(tf[llm_forecast] == tf[crowd_forecast]) / tf.shape[0]
    mc = df[df['qtype'] == 'mc']
    mc_accuracy = np.sum(mc[llm_forecast] == mc[crowd_forecast]) / mc.shape[0]
    print("Accuracy: {} %".format(np.round(accuracy * 100, 2)))
    print("TF Accuracy: {} %".format(np.round(tf_accuracy * 100, 2)))
    print("MC Accuracy: {} %".format(np.round(mc_accuracy * 100, 2)))
    answer_df = pd.DataFrame({
        "accuracy": accuracy,
        "tf_accuracy": tf_accuracy,
        "mc_accuracy": mc_accuracy
    }, index=[name])
    return answer_df

In [834]:
# greg vs. baseline
greg_baseline = evaluate_accuracy(all_answers_df, "avg_forecast", "LLM_answers_baseline", "Greg_baseline")

Accuracy: 59.18 %
TF Accuracy: 67.86 %
MC Accuracy: 7.14 %


In [835]:
# greg vs. context
greg_context = evaluate_accuracy(all_answers_df, "avg_forecast", "LLM_answers_context", "Greg_context")

Accuracy: 62.24 %
TF Accuracy: 66.67 %
MC Accuracy: 35.71 %


In [836]:
# coco vs. baseline
coco_baseline = evaluate_accuracy(all_answers_df, "avg_forecast_coco", "LLM_answers_baseline", "Coco_baseline")

Accuracy: 58.16 %
TF Accuracy: 66.67 %
MC Accuracy: 7.14 %


In [837]:
# coco vs. context
coco_context = evaluate_accuracy(all_answers_df, "avg_forecast_coco", "LLM_answers_context", "Coco_context")

Accuracy: 60.2 %
TF Accuracy: 67.86 %
MC Accuracy: 14.29 %


In [838]:
# final metrics
pd.concat([
    greg_baseline, greg_context, coco_baseline, coco_context
])

Unnamed: 0,accuracy,tf_accuracy,mc_accuracy
Greg_baseline,0.591837,0.678571,0.071429
Greg_context,0.622449,0.666667,0.357143
Coco_baseline,0.581633,0.666667,0.071429
Coco_context,0.602041,0.678571,0.142857
