In [1]:
import os

import ast
from ast import literal_eval
import requests
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine

import pandas as pd
from tqdm import tqdm


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

In [3]:
def main(
    dataset: pd.DataFrame,
    title: str = "title",
    article: str = "article",
):

    dataset = dataset
    annotated_articles = []

    for index, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):

        if len(row[title].split()) > 3300 or len(row[article].split()) > 3300:
            continue

        # Perform mention detection on headline and body text
        el_title = requests.post(API_URL, json={
            "text": row[title],
            "spans": []
        }).json()
        el_article = requests.post(API_URL, json={
            "text": row[article],
            "spans": []
        }).json()

        # Filter mentions with the ORG tag
        headline_mentions_org = [
            mention for mention in el_title if mention[-1] == "ORG"
        ]
        body_text_mentions_org = [
            mention for mention in el_article if mention[-1] == "ORG"
        ]

        # Mark salient entities
        salient_entities_org = []
        for body_entity in body_text_mentions_org:
            if body_entity[3] in [
                headline_entity[3] for headline_entity in headline_mentions_org
            ]:
                salient_entities_org.append(body_entity)

        if salient_entities_org:
            salient_entities_org_set = set([entity[3] for entity in salient_entities_org])
        else:
            salient_entities_org_set = {'None'}
            
        # Save the annotated article
        annotated_articles.append(
            {
                "headline": row[title],
                "body_text": row[article],
                "headline_mentions": el_title,
                "body_text_mentions": el_article,
                "salient_entities_org": salient_entities_org,
                "salient_entities_set": salient_entities_org_set,
            }
        )

    return pd.DataFrame(annotated_articles)

In [5]:
API_URL = "http://rel:5555/api"

# Load parsed news from DB and save for labeling

In [48]:
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_pass = os.getenv('DB_PASS')

In [49]:
# Create the connection string
connection_str = f'postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}'

# Create the engine
engine = create_engine(connection_str)

In [50]:
query = 'SELECT * FROM raw_news_articles'
df = pd.read_sql(query, engine)

In [51]:
df.head()

Unnamed: 0,id,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,is_parsed,exception_class,exception_text,url_hash,date_created
0,1,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,2023-11-30 00:00:00+00:00,parsed,"[Akash Sriram Hyunjoo Jin Abhirup Roy, Akash S...",https://www.reuters.com/business/autos-transpo...,https://news.google.com/rss/articles/CBMie2h0d...,https://www.reuters.com,Reuters,True,,,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,2023-12-01 06:05:36
1,2,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,2023-11-30 00:00:00+00:00,parsed,"[Jonathan Vanian, In]",https://www.cnbc.com/2023/11/30/read-linda-yac...,https://news.google.com/rss/articles/CBMiZmh0d...,https://www.cnbc.com,CNBC,True,,,63329ae2128c913d54235a98b155a792c49b8adad89d59...,2023-12-01 06:05:38
2,3,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,2023-11-30 22:10:15+00:00,parsed,[Jill Goldsmith],https://deadline.com/2023/11/disney-reinstates...,https://news.google.com/rss/articles/CBMiaWh0d...,https://deadline.com,Deadline,True,,,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,2023-12-01 06:05:39
3,4,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,2023-11-30 00:00:00+00:00,parsed,[Phil Rosen],https://markets.businessinsider.com/news/commo...,https://news.google.com/rss/articles/CBMihQFod...,https://markets.businessinsider.com,Markets Insider,True,,,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,2023-12-01 06:05:41
4,5,Dow Jones Hits 2023 High As Salesforce Soars; ...,,NaT,,,,https://news.google.com/rss/articles/CBMijgFod...,https://www.investors.com,Investor's Business Daily,False,ArticleException,Article `download()` failed with 403 Client Er...,03cedd18dd1facb6574b7a241aa689a6401b63a6653a15...,2023-12-01 06:05:42


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7302 entries, 0 to 7301
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   id                   7302 non-null   int64              
 1   title                7302 non-null   object             
 2   text                 4751 non-null   object             
 3   publish_date         4751 non-null   datetime64[ns, UTC]
 4   publish_date_source  4751 non-null   object             
 5   authors              4751 non-null   object             
 6   canonical_link       4751 non-null   object             
 7   feed_link            7302 non-null   object             
 8   media_link           7302 non-null   object             
 9   media_title          7302 non-null   object             
 10  is_parsed            7302 non-null   bool               
 11  exception_class      2551 non-null   object             
 12  exception_text      

In [57]:
len(set(df['url_hash'].values))

6786

In [58]:
len(set(df['feed_link'].values))

5636

In [59]:
len(set(df['canonical_link'].values))

4316

In [60]:
len(set(df['title'].values))

5578

In [62]:
len(set(df['text'].values))

4409

In [63]:
len(set(df['publish_date'].values))

4934

In [67]:
len(set(df[df['is_parsed'] == True]['url_hash'].values))

4363

In [69]:
len(set(df[df['is_parsed'] == True]['canonical_link'].values))

4315

In [73]:
df['canonical_link'].value_counts()

canonical_link
https://www.foxnews.com/us/most-unsafe-cities-holiday-season-research-shows                                                                          4
https://finance.yahoo.com/news/stock-market-news-today-stocks-on-six-week-win-streak-after-strong-jobs-report-210913952.html                         4
https://www.foxbusiness.com/real-estate/real-estate-investor-warns-us-entering-greatest-correction-lifetime                                          3
https://www.cbsnews.com/news/ozempic-counterfeit-fda-seizes-thousands-novo-nordisk/                                                                  3
https://finance.yahoo.com/news/stock-market-news-today-dow-hits-record-high-as-stocks-cap-longest-weekly-winning-streak-since-2017-181200723.html    3
                                                                                                                                                    ..
https://www.nytimes.com/2023/12/08/business/economy/jobs-report-november-2023.h

In [80]:
df[df['canonical_link'] == 'https://www.foxnews.com/us/most-unsafe-cities-holiday-season-research-shows']

Unnamed: 0,id,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,is_parsed,exception_class,exception_text,url_hash,date_created
5492,6895,These are the most 'unsafe' cities during the ...,The following cities ranked among the top 10 m...,2023-12-24 09:00:00+00:00,approximated,[Audrey Conklin],https://www.foxnews.com/us/most-unsafe-cities-...,https://news.google.com/rss/articles/CBMiS2h0d...,https://www.foxnews.com,Fox News,True,,,878e3e2dde967480c6ab048cdf4cf4b4974f5864d77921...,2023-12-24 11:05:44
6989,6988,The 10 most ‘unsafe’ cities during the holiday...,The following cities ranked among the top 10 m...,2023-12-24 00:00:00+00:00,parsed,[],https://www.foxnews.com/us/most-unsafe-cities-...,https://news.google.com/rss/articles/CBMiTWh0d...,https://nypost.com,New York Post,True,,,26dc3749cbe4113c4ed8943c286ea3f0f341b87ff5a56f...,2023-12-24 20:05:39
7154,7155,These are the most 'unsafe' cities during the ...,Read this article for free! Plus get unlimited...,2023-12-24 09:00:00+00:00,approximated,[Audrey Conklin],https://www.foxnews.com/us/most-unsafe-cities-...,https://news.google.com/rss/articles/CBMiS2h0d...,https://www.foxnews.com,Fox News,True,,,878e3e2dde967480c6ab048cdf4cf4b4974f5864d77921...,2023-12-25 15:05:59
7221,7221,The 10 most ‘unsafe’ cities during the holiday...,The following cities ranked among the top 10 m...,2023-12-24 00:00:00+00:00,parsed,[],https://www.foxnews.com/us/most-unsafe-cities-...,https://news.google.com/rss/articles/CBMiTWh0d...,https://nypost.com,New York Post,True,,,26dc3749cbe4113c4ed8943c286ea3f0f341b87ff5a56f...,2023-12-25 21:05:22


In [81]:
df['url_hash'].value_counts()

url_hash
2c828d4c2b4e5daabae76ca2b588229c89a92f69f2ab7e57ded644590c058bb4    5
888d6b8928820bac5ad87240986d0cd0b7547bdc69d5346763d522f84708cc60    3
66b6701884616567b379846ac9a51d3bbfc24f0203bbe094d015f22c7ecf3120    3
9adcb5a0b9c5be767b6b804d7be1d196a81f58a3bb56c242c85a28d2fbdc5ceb    2
1e08e5682ea433e25bc4c2d2c17d0689a6a0d10a07cdcff2e8e7a8b40975833c    2
                                                                   ..
1ffbe6d52f8347924df1518dca8e8683245fb5918728d0d06cfacb15d803d6d9    1
f1d0f6aa289f03377cdf78b5ea1ef7448b30f566d2a288b31178841793559f32    1
381c09780d4dfc42b668ba7ef2fc8d5ec9366e1c813b1e3c52403827ca2eb0dc    1
8c782e07b9f3362185b4be938e9c597cf5a962b7dd0ba7469b10fbf0bbd8631a    1
be626c0c75f7d4f46d14fb6c8acd6c7d96aca8c0d4d94373010ae61c71912bbc    1
Name: count, Length: 6786, dtype: int64

In [82]:
df[df['url_hash'] == '2c828d4c2b4e5daabae76ca2b588229c89a92f69f2ab7e57ded644590c058bb4']

Unnamed: 0,id,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,is_parsed,exception_class,exception_text,url_hash,date_created
1497,1489,Redfin Predicts 2024 Will Be the Year Homebuye...,,NaT,,,,https://news.google.com/rss/articles/CBMiPGh0d...,https://www.redfin.com,Redfin News,False,ArticleException,Article `download()` failed with 429 Client Er...,2c828d4c2b4e5daabae76ca2b588229c89a92f69f2ab7e...,2023-12-06 01:05:47
2305,2274,2023 Has Been The Least Affordable Year for Ho...,,NaT,,,,https://news.google.com/rss/articles/CBMiRWh0d...,https://www.redfin.com,Redfin News,False,ArticleException,Article `download()` failed with 429 Client Er...,2c828d4c2b4e5daabae76ca2b588229c89a92f69f2ab7e...,2023-12-08 09:06:38
2599,2563,2023 Has Been The Least Affordable Year for Ho...,,NaT,,,,https://news.google.com/rss/articles/CBMiRWh0d...,https://www.redfin.com,Redfin News,False,ArticleException,Article `download()` failed with 429 Client Er...,2c828d4c2b4e5daabae76ca2b588229c89a92f69f2ab7e...,2023-12-09 11:05:51
4038,3959,The Tide Turns for Renters as Asking Rents Pos...,,NaT,,,,https://news.google.com/rss/articles/CBMiP2h0d...,https://www.redfin.com,Redfin News,False,ArticleException,Article `download()` failed with 429 Client Er...,2c828d4c2b4e5daabae76ca2b588229c89a92f69f2ab7e...,2023-12-14 13:08:04
7073,7074,16% of Homes Were Affordable in 2023—Lowest Sh...,,NaT,,,,https://news.google.com/rss/articles/CBMiO2h0d...,https://www.redfin.com,Redfin News,False,ArticleException,Article `download()` failed with 429 Client Er...,2c828d4c2b4e5daabae76ca2b588229c89a92f69f2ab7e...,2023-12-25 04:05:05


In [87]:
df['title'].value_counts()[:20]

title
Fed Chair Powell's Comments Spark Cross-Asset Rally, Upend 2024 Outlooks - Bloomberg       42
BlackRock, State Street Subpoenaed in House ESG Probe - Bloomberg                          35
Citi Exiting Distressed-Debt Trading in Latest Retrenchment - Bloomberg                    27
Nvidia Sees Vietnam as Potential Second Home, Reports Say - Bloomberg                      27
Heathrow Airport: How Saudi Arabia May Ultimately Gain Control - Bloomberg                 25
Modi's Resounding State Polls Win Fuels India Market Rally - Bloomberg                     24
Toyota Halts Some SE Asia Exports on Daihatsu Scandal: Yomiuri - Bloomberg                 24
Apple to Drop Goldman Sachs for Apple Card; Chase Bank Is Ideal Replacement - Bloomberg    21
Byju's Founder Pledges Homes to Raise Funds for Staff Salaries - Bloomberg                 20
A-Rod's Slam SPAC Is Said to Plan Merger With Lynk Global - Bloomberg                      19
Used Car Prices Drop As Negative Equity Slams Buyers -

In [86]:
df[df['title'] == "Fed Chair Powell's Comments Spark Cross-Asset Rally, Upend 2024 Outlooks - Bloomberg"]

Unnamed: 0,id,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,is_parsed,exception_class,exception_text,url_hash,date_created
2794,4589,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",73a2d05f38fcee2f65828f54f71ba0424082b5a87aaec1...,2023-12-16 15:05:16
3211,4602,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",671fdc2d0b8c1852e111c644533131850908fa74a248b8...,2023-12-16 16:05:16
3413,4660,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",439d6cb37b0c636b1fe3040c0bffbdc36a10d185753a1d...,2023-12-16 20:06:01
3456,4615,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",1e9282a5f98dc3957158bd6f564a5c959491cb1daf17cd...,2023-12-16 17:05:44
3772,4629,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",30b9996bb022177951622744c36d403d44fa046d637cad...,2023-12-16 18:05:48
4011,4674,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",e4fcbcaba8858fc62945fac530b3f60e54192955150eb6...,2023-12-16 21:06:52
4483,4645,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",d487dd27fe61a87fabdc42a7eaa985584f0f9f464b3b8d...,2023-12-16 19:05:50
4530,4436,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",82498470a7e69e1af655184d0e06c8e703b901a391f613...,2023-12-16 02:05:37
4540,4472,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",f778061eec2cd64bbd623a2251a65405393728adef13b0...,2023-12-16 05:05:20
4542,4447,Fed Chair Powell's Comments Spark Cross-Asset ...,,NaT,,,,https://news.google.com/rss/articles/CBMic2h0d...,https://www.bloomberg.com,Bloomberg,False,ValueError,"Essential fields are empty, possibly due to bo...",79fbb07b57be8c7aa449dfc8a390f4bc5a03bba8dfe564...,2023-12-16 03:05:40


In [89]:
df['feed_link'].value_counts()

feed_link
https://news.google.com/rss/articles/CBMic2h0dHBzOi8vd3d3LmJsb29tYmVyZy5jb20vbmV3cy9hcnRpY2xlcy8yMDIzLTEyLTE1L3dhbGwtc3RyZWV0LXMtd2lsZC13ZWVrLXVwZW5kcy0yMDI0LXRyYWRpbmctcGxhbnMtaW4tYmlnLWZlZC1iZXTSAQA?oc=5                                                                     42
https://news.google.com/rss/articles/CBMiZ2h0dHBzOi8vd3d3LmJsb29tYmVyZy5jb20vbmV3cy9hcnRpY2xlcy8yMDIzLTEyLTE1L2JsYWNrcm9jay1zdGF0ZS1zdHJlZXQtc3VicG9lbmFlZC1pbi1ob3VzZS1lc2ctcHJvYmXSAQA?oc=5                                                                                     35
https://news.google.com/rss/articles/CBMic2h0dHBzOi8vd3d3LmJsb29tYmVyZy5jb20vbmV3cy9hcnRpY2xlcy8yMDIzLTEyLTIwL2NpdGlncm91cC1leGl0aW5nLWRpc3RyZXNzZWQtZGVidC10cmFkaW5nLWluLWxhdGVzdC1yZXRyZW5jaG1lbnTSAQA?oc=5                                                                     27
https://news.google.com/rss/articles/CBMia2h0dHBzOi8vd3d3LmJsb29tYmVyZy5jb20vbmV3cy9hcnRpY2xlcy8yMDIzLTEyLTExL252aWRpYS1zZWVzLXZpZXRuYW0tYXMtcG90ZW50aWFsLXNlY2

In [7]:
start = "You are an investor who reads financial and business news. There may be many companies mentioned in the news, but not all of them are affected by the news. You want to understand which companies in the news are really affected by the news from an investor's point of view.  I'll send you the headline and body of the news and you extract only the companies important to the news in the format ['Company1', 'Company2']"
end = " | ATTENTION!!! I DON’T WANT YOU ADVICES HOW TO DO IT!!!! REMEMBER YOU HAVE TO EXTRACT the salient companies as [‘Company1’, 'Company2’] (this is example of format) keeping in mind you are an investor who has filtered a list of news on a particular company. You want to see only those news that can really affect the company. REMEMBER FORMAT [‘Company1’, 'Company2’]!!!!!!!"

  tokens_by_line = make_tokens_by_line(lines)


In [8]:
df = df[df['is_parsed'] == True]

df['query'] = start + ' | HEADLINE: ' + df['title'] + ' | BODYTEXT: ' + df['text'] + end
df = df[['id', 'url_hash', 'title', 'text', 'query']]
df.head()

Unnamed: 0,id,url_hash,title,text,query
0,1,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,You are an investor who reads financial and bu...
1,2,63329ae2128c913d54235a98b155a792c49b8adad89d59...,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,You are an investor who reads financial and bu...
2,3,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,You are an investor who reads financial and bu...
3,4,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,You are an investor who reads financial and bu...
5,6,7dacd5458df4eb9d566c52b14ab1d59cf10ce37f0822e6...,S&P 500’s Historic 8.9% Rally Blindsides Skept...,(Bloomberg) -- The US stock market just posted...,You are an investor who reads financial and bu...


In [9]:
df.to_excel('raw_news_articles_202312141455.xlsx', index=False)

# Load test data (hand labeled)

In [34]:
df_test = pd.read_excel('target_raw_news_articles_202312141455.xlsx')
df_test = df_test[~df_test['target'].isna()]
df_test['target'] = df_test['target'].apply(lambda x: x.strip("[]").replace("'", ""))
df_test.shape

(148, 6)

In [35]:
df_test.head()

Unnamed: 0,id,url_hash,title,text,query,target
0,1,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,You are an investor who reads financial and bu...,Tesla
1,2,63329ae2128c913d54235a98b155a792c49b8adad89d59...,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,You are an investor who reads financial and bu...,X (formerly Twitter)
2,3,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,You are an investor who reads financial and bu...,Disney
3,4,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,You are an investor who reads financial and bu...,Zillow
4,6,7dacd5458df4eb9d566c52b14ab1d59cf10ce37f0822e6...,S&P 500’s Historic 8.9% Rally Blindsides Skept...,(Bloomberg) -- The US stock market just posted...,You are an investor who reads financial and bu...,


# Bring the target to the REL form

In [37]:
def simple_REL(text):
    el_result = requests.post(API_URL, json={
        "text": text,
        "spans": []
    }).json()
    
    mentions = [
            mention for mention in el_result if mention[-1] == "ORG"
        ]
    salient_entities = {'None'}
    if mentions:
        salient_entities = set([entity[3] for entity in mentions])
        
    return salient_entities

In [38]:
df_test['target_formatted'] = df_test['target'].apply(simple_REL)

In [39]:
df_test.head()

Unnamed: 0,id,url_hash,title,text,query,target,target_formatted
0,1,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,You are an investor who reads financial and bu...,Tesla,"{Tesla,_Inc.}"
1,2,63329ae2128c913d54235a98b155a792c49b8adad89d59...,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,You are an investor who reads financial and bu...,X (formerly Twitter),{Twitter}
2,3,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,You are an investor who reads financial and bu...,Disney,{The_Walt_Disney_Company}
3,4,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,You are an investor who reads financial and bu...,Zillow,{Zillow}
4,6,7dacd5458df4eb9d566c52b14ab1d59cf10ce37f0822e6...,S&P 500’s Historic 8.9% Rally Blindsides Skept...,(Bloomberg) -- The US stock market just posted...,You are an investor who reads financial and bu...,,{None}


In [11]:
df_test_rel = main(df_test, 'title', 'text')
df_test_rel.shape

100%|██████████| 148/148 [06:01<00:00,  2.44s/it]


(148, 6)

In [12]:
df_test_rel.head()

Unnamed: 0,headline,body_text,headline_mentions,body_text_mentions,salient_entities_org,salient_entities_set
0,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,"[[21, 5, Tesla, Tesla,_Inc., 0.591238980628300...","[[19, 5, Tesla, Tesla,_Inc., 0.664780689106363...","[[19, 5, Tesla, Tesla,_Inc., 0.664780689106363...","{Tesla,_Inc.}"
1,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,"[[52, 9, Elon Musk, Elon_Musk, 0.3872777678067...","[[56, 7, Twitter, Twitter, 0.9176650349707419,...","[[3094, 9, Elon Musk, Elon_Musk, 0.38727776780...",{Elon_Musk}
2,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,[],"[[0, 6, Disney, Walt_Disney_Studios_(division)...",[],{None}
3,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,"[[48, 6, Zillow, Zillow, 0.3872777678067984, 0...","[[50, 9, Americans, United_States, 0.520198409...","[[94, 6, Zillow, Zillow, 0.3872777678067984, 0...",{Zillow}
4,S&P 500’s Historic 8.9% Rally Blindsides Skept...,(Bloomberg) -- The US stock market just posted...,"[[53, 11, Wall Street, Wall_Street, 0.93163738...","[[1, 9, Bloomberg, Bloomberg_L.P., 0.838348418...",[],{None}


In [40]:
df_merged = df_test.merge(df_test_rel, how='outer', left_on='title', right_on='headline')
df_merged.head()

Unnamed: 0,id,url_hash,title,text,query,target,target_formatted,headline,body_text,headline_mentions,body_text_mentions,salient_entities_org,salient_entities_set
0,1,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,You are an investor who reads financial and bu...,Tesla,"{Tesla,_Inc.}","Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,"[[21, 5, Tesla, Tesla,_Inc., 0.591238980628300...","[[19, 5, Tesla, Tesla,_Inc., 0.664780689106363...","[[19, 5, Tesla, Tesla,_Inc., 0.664780689106363...","{Tesla,_Inc.}"
1,2,63329ae2128c913d54235a98b155a792c49b8adad89d59...,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,You are an investor who reads financial and bu...,X (formerly Twitter),{Twitter},Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,"[[52, 9, Elon Musk, Elon_Musk, 0.3872777678067...","[[56, 7, Twitter, Twitter, 0.9176650349707419,...","[[3094, 9, Elon Musk, Elon_Musk, 0.38727776780...",{Elon_Musk}
2,3,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,You are an investor who reads financial and bu...,Disney,{The_Walt_Disney_Company},"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,[],"[[0, 6, Disney, Walt_Disney_Studios_(division)...",[],{None}
3,4,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,You are an investor who reads financial and bu...,Zillow,{Zillow},"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,"[[48, 6, Zillow, Zillow, 0.3872777678067984, 0...","[[50, 9, Americans, United_States, 0.520198409...","[[94, 6, Zillow, Zillow, 0.3872777678067984, 0...",{Zillow}
4,4,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,You are an investor who reads financial and bu...,Zillow,{Zillow},"From affordability to property demand, here ar...","AP Photo/John Raoux, File\n\nHigh mortgage rat...","[[48, 6, Zillow, Zillow, 0.3872777678067984, 0...","[[77, 9, Americans, United_States, 0.495860628...","[[121, 6, Zillow, Zillow, 0.3872777678067984, ...",{Zillow}


In [41]:
df_merged.shape

(156, 13)

In [42]:
df_diff = df_merged[df_merged['target_formatted'] != df_merged['salient_entities_set']]
df_diff.shape

(82, 13)

In [43]:
df_diff.head()

Unnamed: 0,id,url_hash,title,text,query,target,target_formatted,headline,body_text,headline_mentions,body_text_mentions,salient_entities_org,salient_entities_set
1,2,63329ae2128c913d54235a98b155a792c49b8adad89d59...,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,You are an investor who reads financial and bu...,X (formerly Twitter),{Twitter},Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,"[[52, 9, Elon Musk, Elon_Musk, 0.3872777678067...","[[56, 7, Twitter, Twitter, 0.9176650349707419,...","[[3094, 9, Elon Musk, Elon_Musk, 0.38727776780...",{Elon_Musk}
2,3,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,You are an investor who reads financial and bu...,Disney,{The_Walt_Disney_Company},"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,[],"[[0, 6, Disney, Walt_Disney_Studios_(division)...",[],{None}
9,8,3113a30d9ee07a8cfd66424b4cf8eaa6f977021c844773...,Meta AI Chief Yann LeCun On His Open Source Mi...,In an exclusive interview leading up to the 10...,You are an investor who reads financial and bu...,Meta,{Meta_Department},Meta AI Chief Yann LeCun On His Open Source Mi...,In an exclusive interview leading up to the 10...,"[[14, 10, Yann LeCun, Yann_LeCun, 0.3872777678...","[[79, 5, LeCun, Yann_LeCun, 0.3872777678067984...",[],{None}
15,13,2bfad57b81083d98472300f7934ac351cea4cf848e3f94...,"Meta Sues FTC, Says it Has No Constitutional R...",Meta is working all the angles it can to ensur...,You are an investor who reads financial and bu...,Meta,{Meta_Department},"Meta Sues FTC, Says it Has No Constitutional R...",Meta is working all the angles it can to ensur...,[],"[[0, 4, Meta, Wikimedia_Foundation, 0.23680703...",[],{None}
16,14,4fb13e2ef5ff679e852834916229c11d0e4e6ca5f76278...,U.K. Opens Inquiry Into Jeff Zucker’s Emirati-...,Jeff Zucker’s re-entry into the global news bu...,You are an investor who reads financial and bu...,"The Daily Telegraph, The Spectator","{The_Spectator, The_Daily_Telegraph}",U.K. Opens Inquiry Into Jeff Zucker’s Emirati-...,Jeff Zucker’s re-entry into the global news bu...,"[[24, 11, Jeff Zucker, Jeff_Zucker, 0.38727776...","[[0, 11, Jeff Zucker, Jeff_Zucker, 0.387277767...","[[218, 19, The Daily Telegraph, The_Daily_Tele...",{The_Daily_Telegraph}


In [44]:
def evaluate_ner(dataset, target_col, pred_col):
    TP, FP, FN = 0, 0, 0

    # Iterate over each row in the dataset
    for index, row in dataset.iterrows():
        true_entities = row[target_col]
        pred_entities = row[pred_col]

        # Update counts
        TP += len(true_entities.intersection(pred_entities))
        FP += len(pred_entities - true_entities)
        FN += len(true_entities - pred_entities)

    # Calculate Precision, Recall, and F1 Score
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = TP / (TP + FN) if TP + FN != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    return precision, recall, f1_score

In [46]:
precision, recall, f1_score = evaluate_ner(df_merged, 'target_formatted', 'salient_entities_set')
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")

Precision: 0.63
Recall: 0.36
F1 Score: 0.46


In [None]:
# Example dataset
data = {
    'predicted': [{"Moscow", "Paris"}, {"New York"}],  # Predicted entities here
    'targets': [{"Moscow", "Paris", "London"}, {"New York", "Boston"}]  # True entities here
}
dataset = pd.DataFrame(data)

precision, recall, f1_score = evaluate_ner(dataset, 'targets', 'predicted')
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")

In [47]:
for row in df_merged.iterrows():
    print(f"{row[1][2][:10]} | {row[1][6]} | {row[1][-1]}")

Starting a | {'Tesla,_Inc.'} | {'Tesla,_Inc.'}
Read Linda | {'Twitter'} | {'Elon_Musk'}
Disney Rei | {'The_Walt_Disney_Company'} | {'None'}
From affor | {'Zillow'} | {'Zillow'}
From affor | {'Zillow'} | {'Zillow'}
From affor | {'Zillow'} | {'Zillow'}
From affor | {'Zillow'} | {'Zillow'}
S&P 500’s  | {'None'} | {'None'}
Some OPEC+ | {'OPEC'} | {'OPEC'}
Meta AI Ch | {'Meta_Department'} | {'None'}
OpenAI ten | {'OpenAI'} | {'OpenAI'}
How OpenAI | {'OpenAI'} | {'OpenAI'}
UiPath Inc | {'None'} | {'None'}
Asia facto | {'None'} | {'None'}
The Fed’s  | {'Federal_Reserve'} | {'Federal_Reserve'}
Meta Sues  | {'Meta_Department'} | {'None'}
U.K. Opens | {'The_Spectator', 'The_Daily_Telegraph'} | {'The_Daily_Telegraph'}
Stock mark | {'OPEC'} | {'None'}
Stock mark | {'OPEC'} | {'None'}
Stock mark | {'OPEC'} | {'None'}
Stock mark | {'OPEC'} | {'None'}
PDD earnin | {'Alibaba_Group', 'Pinduoduo', 'Pinky_Dinky_Doo'} | {'Alibaba_Group'}
JPMorgan C | {'JPMorgan_Chase'} | {'JPMorgan_Chase'}
Mortgage r | {'

  print(f"{row[1][2][:10]} | {row[1][6]} | {row[1][-1]}")
