In [1]:
import os, pandas as pd, openai
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from gpt4_utils import create_message, get_completion, extract_model_answers1, extract_model_answers2

# Load variables from a .env file (via python-dotenv) into the process
# environment, then hand the OpenAI key to the client library. The lookup
# yields None rather than raising when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Global parameters shared by every prompting round in this notebook.

# Model checkpoint passed to create_message() / get_completion().
MODEL = 'gpt-4-0613'

# Value passed to create_message() as `thresh`: 7350 for the two GPT-4
# checkpoints, 3350 for any other model.
# NOTE(review): presumably a prompt-token budget below the model's context
# window -- confirm in gpt4_utils.
THRESH = 7350 if MODEL in ('gpt-4-0314', 'gpt-4-0613') else 3350

# Price per single token (rates are quoted per 1,000 tokens, hence the division).
# The cost-estimate cells look up f'{MODEL}_input' and f'{MODEL}_output', so
# every model MODEL may be switched to needs both keys; otherwise those cells
# fail with a KeyError. The original bare-name keys are kept for any other code
# that still reads them.
PRICE = {'gpt-3.5-turbo-0301': 0.002 / 1000,
         'gpt-4-0613_input': 0.03 / 1000,
         'gpt-4-0613_output': 0.06 / 1000,
         'text-davinci-003': 0.02 / 1000,
         # NOTE(review): assumed to share the gpt-4-0613 rates -- verify against the pricing page.
         'gpt-4-0314_input': 0.03 / 1000,
         'gpt-4-0314_output': 0.06 / 1000,
         # Flat-rate models: the same rate is mirrored for both directions.
         'gpt-3.5-turbo-0301_input': 0.002 / 1000,
         'gpt-3.5-turbo-0301_output': 0.002 / 1000,
         'text-davinci-003_input': 0.02 / 1000,
         'text-davinci-003_output': 0.02 / 1000}

In [2]:
# Location of the news corpus, relative to the current working directory.
PATH_CONF_v2 = Path('..') / 'data' / 'news_corpus_conf_v2.feather'

# Read the corpus into memory; later cells append columns to this frame.
news = pd.read_feather(PATH_CONF_v2)

### Round 1: General Assessment & Summary

In [3]:
# Round-1 prompt files, handed to create_message() as p_sys / p_user.
PATH_PROMPT_SYS = Path('..').joinpath('data', 'openai-prompts', '20230628_sys.txt')
PATH_PROMPT_USER = Path('..').joinpath('data', 'openai-prompts', '20230628.txt')

In [4]:
# Build the round-1 prompt for every article and append the three values
# returned by create_message() as the columns messages / trimmed / n_tok.
# The helper frame reuses news.index: pd.concat(axis=1) aligns on index labels,
# so a default 0..n-1 index on the helper would misalign rows (and introduce
# NaNs) whenever `news` carries any other index.
news = \
    pd.concat([
        news,
        pd.DataFrame(
            news.apply(lambda row:
                create_message(p_sys=PATH_PROMPT_SYS, p_user=PATH_PROMPT_USER, common_names=row['common_names'], title=row['title'], body=row['body'], model=MODEL, thresh=THRESH),
                axis=1).tolist(),
            columns=['messages', 'trimmed', 'n_tok'],
            index=news.index
        )
    ], axis=1)

# Cost preview, printed ahead of the completion cell. The prompt side uses the
# n_tok column; the output side uses a fixed figure of 120 tokens per article.
# The last line is labelled "output" (it was mislabelled "prompt" before).
print(f"""
Model checkpoint:\t{MODEL}
Est. tokens (prompt):\t{news['n_tok'].sum()}
Est. cost (prompt):\t{round((news['n_tok'].sum() * PRICE[f'{MODEL}_input']), 2)}
Est. tokens (output):\t{len(news) * 120}
Est. cost (output):\t{round((len(news) * 120 * PRICE[f'{MODEL}_output']), 2)}
""")


Model checkpoint:	gpt-4-0613
Est. tokens (prompt):	3473142
Est. cost (prompt):	104.19
Est. tokens (output):	372240
Est. cost (prompt):	22.33



In [5]:
# Register progress_apply on pandas objects, then call get_completion() once
# per row with that row's round-1 messages and its `an` identifier.
tqdm.pandas()
news['response'] = news.progress_apply(
    lambda row: get_completion(row['messages'], row['an'], model=MODEL, p_user=PATH_PROMPT_USER),
    axis=1,
)

  0%|          | 0/3102 [00:00<?, ?it/s]

100%|██████████| 3102/3102 [00:06<00:00, 496.35it/s]


In [6]:
# Split each round-1 response into four fields and append them as columns.
# NOTE(review): e1/a1 and e2/a2 presumably hold the "Short explanation" and
# "Answer" of step 1 and step 2 -- confirm against extract_model_answers1.
# The helper frame reuses news.index: pd.concat(axis=1) aligns on index labels,
# so a default 0..n-1 index on the helper would misalign rows whenever `news`
# carries any other index.
news = \
    pd.concat([
        news,
        pd.DataFrame(
            news['response'].map(extract_model_answers1).tolist(),
            columns=['e1', 'a1', 'e2', 'a2'],
            index=news.index
        )
    ], axis=1)

### Round 2: Event Classification

In [7]:
# Round-2 prompt files. This rebinds the same two names that held the round-1
# prompt paths, so cells run after this point see the round-2 files.
PATH_PROMPT_SYS = Path('..').joinpath('data', 'openai-prompts', '20230704_sys.txt')
PATH_PROMPT_USER = Path('..').joinpath('data', 'openai-prompts', '20230704.txt')

In [8]:
# Build the round-2 prompt for every article, using the round-1 `e1` text as
# the body, and append the three values returned by create_message() as the
# columns messages2 / trimmed2 / n_tok2.
# The helper frame reuses news.index: pd.concat(axis=1) aligns on index labels,
# so a default 0..n-1 index on the helper would misalign rows (and introduce
# NaNs) whenever `news` carries any other index.
news = \
    pd.concat([
        news,
        pd.DataFrame(
            news.apply(lambda row:
                create_message(p_sys=PATH_PROMPT_SYS, p_user=PATH_PROMPT_USER, body=row['e1'], model=MODEL, thresh=THRESH),
                axis=1).tolist(),
            columns=['messages2', 'trimmed2', 'n_tok2'],
            index=news.index
        )
    ], axis=1)

# Cost preview, printed ahead of the completion cell. The prompt side uses the
# n_tok2 column; the output side uses a fixed figure of 10 tokens per article.
# The last line is labelled "output" (it was mislabelled "prompt" before).
print(f"""
Model checkpoint:\t{MODEL}
Est. tokens (prompt):\t{news['n_tok2'].sum()}
Est. cost (prompt):\t{round((news['n_tok2'].sum() * PRICE[f'{MODEL}_input']), 2)}
Est. tokens (output):\t{len(news) * 10}
Est. cost (output):\t{round((len(news) * 10 * PRICE[f'{MODEL}_output']), 2)}
""")


Model checkpoint:	gpt-4-0613
Est. tokens (prompt):	850012
Est. cost (prompt):	25.5
Est. tokens (output):	31020
Est. cost (prompt):	1.86



In [9]:
# Register progress_apply on pandas objects, then call get_completion() once
# per row with that row's round-2 messages and its `an` identifier.
tqdm.pandas()
news['response2'] = news.progress_apply(
    lambda row: get_completion(row['messages2'], row['an'], model=MODEL, p_user=PATH_PROMPT_USER),
    axis=1,
)

100%|██████████| 3102/3102 [00:07<00:00, 434.86it/s]


In [10]:
# Run extract_model_answers2() over every round-2 response and keep the
# result in a new `codes` column.
news['codes'] = news['response2'].map(extract_model_answers2)

In [11]:
# Write the enriched frame (original columns plus everything appended above)
# to a feather file, relative to the current working directory.
PATH_CONF_OUT = Path('..') / 'data' / 'labels' / 'labels_news_gpt.feather'
news.to_feather(PATH_CONF_OUT)

### Face Validity

In [20]:
# Spot check: show `common_names` for the article(s) with this exact title.
news.loc[news['title'] == "Semi-annual review of the VINX30 Index (33/18)", 'common_names']

6    AP Moeller - Maersk
Name: common_names, dtype: object

In [12]:
# Select the row(s) whose `an` identifier matches one specific article; the
# cells below inspect the first match.
x = news.loc[news['an'] == 'LBA0000020181205eec501rnp']

In [16]:
# Print `ric_coname` for the first selected row.
print(x['ric_coname'].iat[0])

AP Moeller - Maersk A/S


In [13]:
# Print the `content` of the second message (index 1) in the round-1 prompt
# of the first selected row.
print(x['messages'].iat[0][1]['content'])

Here is a news article:

<article>
Title: PREVIEW-Danske investors bank on Maersk clan to chart course through crisis\n
Body: * Top investor A.P. Moller called EGM after ousting chairman

* Shareholder meeting on Dec. 7 to elect new board members

* Danske should "open up" to investors - big 10 shareholder

* Danish pension fund PFA says Danske can "move forward"

* Danske hit by $227 billion money laundering scandal

By Simon Jessop, Kirstin Ridley and Teis Jensen

LONDON/COPENHAGEN, Dec 5 (Reuters) - Danske Bank's top investors are looking to Denmark's Maersk family to steer the country's largest lender through the turmoil of a 200 billion euro ($227 billion) money laundering scandal.

The Danish clan's investment firm A.P. Moller Holding, Danske's normally passive top shareholder with a stake of around 21 percent, has ousted the bank's chairman Ole Andersen and called an extraordinary shareholder meeting in Copenhagen on Friday to nominate two successors to the board.

Given the dep

In [14]:
# Print the `output` field of the round-1 response for the first selected row.
print(x['response'].iat[0]['output'])

Step 1:
- Short explanation: The article discusses several stock price-relevant events. These include changes in top management at Danske Bank, with the ousting of the chairman and the nomination of new board members. The article also mentions a significant legal issue, a $227 billion money laundering scandal, which has led to regulatory investigations and a substantial drop in Danske Bank's share price. 
- Answer: Yes

Step 2:
- Short explanation: The events discussed in the article directly relate to Danske Bank. However, the Maersk family, through their investment firm A.P. Moller Holding, is involved as they are the top shareholder in Danske Bank and have taken an active role in addressing the bank's crisis. A.P. Moller Holding is related to AP Moeller - Maersk, but they are not the same entity. The article does not mention any direct impact on AP Moeller - Maersk.
- Answer: No


In [15]:
# Print the `output` field of the round-2 response for the first selected row.
print(x['response2'].iat[0]['output'])

a. change in the top management team
j. legal fines, lawsuit, or regulatory changes
