In [1]:
import os

import pandas as pd
from tqdm import tqdm
import requests

In [15]:
def extract_salient_entities(
    data: list,
    title: str = "title",
    article: str = "article",
    id: str = "id",
):

    annotated_articles = []

    for row in tqdm(data, total=len(data)):

        if len(row[title].split()) > 3300 or len(row[article].split()) > 3300:
            continue

        # Perform mention detection on headline and body text
        el_title = requests.post(API_URL, json={
            "text": row[title],
            "spans": []
        }).json()
        el_article = requests.post(API_URL, json={
            "text": row[article],
            "spans": []
        }).json()

        # Filter mentions with the ORG tag
        headline_mentions_org = [
            mention for mention in el_title if mention[-1] == "ORG"
        ]
        print(headline_mentions_org)
        body_text_mentions_org = [
            mention for mention in el_article if mention[-1] == "ORG"
        ]

        # Mark salient entities
        salient_entities_org = []
        for body_entity in body_text_mentions_org:
            if body_entity[3] in [
                headline_entity[3] for headline_entity in headline_mentions_org
            ]:
                salient_entities_org.append(body_entity)

        if salient_entities_org:
            salient_entities_org_set = set([entity[3] for entity in salient_entities_org])
        else:
            salient_entities_org_set = {'None'}
            
        # Save the annotated article
        annotated_articles.append(
            {
                "raw_news_id": row[id],
                "headline_mentions": el_title,
                "body_text_mentions": el_article,
                "salient_entities_org": salient_entities_org,
                "salient_entities_set": salient_entities_org_set,
            }
        )

    return annotated_articles

In [3]:
API_URL = "http://rel:5555/api"

In [4]:
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_pass = os.getenv('DB_PASS')

In [21]:
import psycopg2
import json

# Database connection parameters
conn_params = {
    "host": os.getenv('DB_HOST'),
    "port": os.getenv('DB_PORT'),
    "database": os.getenv('DB_NAME'),
    "user": os.getenv('DB_USER'),
    "password": os.getenv('DB_PASS')
}

# Connect to the PostgreSQL database
conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()

# SQL query to fetch the data
query = "SELECT id, title, text FROM raw_news_articles WHERE is_parsed = True limit 30;"

# Execute the query
cursor.execute(query)

# Fetch all rows
rows = cursor.fetchall()

# Close the connection
cursor.close()
conn.close()

In [22]:
# Convert rows to JSON
data = [{"id": row[0], "title": row[1], "text": row[2]} for row in rows]
json_data = json.dumps(data)

In [23]:
json.dumps(data[1])

'{"id": 11890, "title": "Nio stock price forecast: analysis as it breaks key support", "text": "Nio (NYSE: NIO) stock price is in its third consecutive week in the red as concerns about the EV industry continued. The stock crashed to a low of $6.55 on Tuesday and is now sitting at its lowest level since June 2020. It has fallen sharply from its all-time high of $67, which it reached in January 2021.\\n\\nEV bubble and saturation\\n\\nCopy link to section\\n\\nNio and other EV companies are not doing well. Tesla, often seen as the gold standard of the sector, has crashed by over 28% from its highest point in 2023. It has also been overtaken by Byd and forced to slash prices in a bid to gain market share and hit its production target.\\n\\nAt the same time, smaller EV companies like Mullen Automotive, Canoo, and Fisker are incinerating cash. The sector is also going through a period of saturation as most companies boost their vehicle production.\\n\\nNio is caught up in this challenge as

In [24]:
json_data



In [25]:
data = json.loads(json_data)

In [26]:
!curl -X POST http://rel:5555/api -H "Content-Type: application/json" -d '{"text": "China Is Stealing AI Secrets to Turbocharge Spying, U.S. Says", "spans": []}'
!curl -X POST http://localhost:5555/api -H "Content-Type: application/json" -d '{"text": "China Is Stealing AI Secrets to Turbocharge Spying, U.S. Says", "spans": []}'

[[0, 5, "China", "China", 0.8694822901752369, 0.9968340992927551, "LOC"], [52, 4, "U.S.", "United_States", 0.521479391215041, 0.9096355438232422, "LOC"]]curl: (7) Failed to connect to localhost port 5555 after 0 ms: Connection refused


In [27]:
[data[0], data[1]]

[{'id': 11889,
  'title': 'The weight loss drug market may soon get more crowded. Here are the companies trying to enter the booming space',
  'text': 'In this article VKTX Follow your favorite stocks CREATE FREE ACCOUNT\n\nStill life of Wegovy an injectable prescription weight loss medicine that has helped people with obesity. It should be used with a weight loss plan and physical activity. Michael Siluk | UCG | Getty Images\n\nBoehringer Ingelheim\n\nBoehringer Ingelheim is developing a weight loss drug with Danish biotech firm Zealand Pharma. That company has been working on obesity treatments for nearly a decade. Their experimental drug works by targeting two gut hormones: GLP-1 to suppress appetite, and glucagon to increase energy expenditure. Some popular weight loss drugs such as Novo Nordisk\'s Wegovy only target GLP-1. Boehringer Ingelheim in August said it was moving the drug, called survodutide, into a late-stage study, bringing it one step closer to potential Food and Drug 

In [28]:
# df_test = pd.read_excel('target_raw_news_articles_202312141455.xlsx')
# df_test = df_test[~df_test['target'].isna()]
# df_test['target'] = df_test['target'].apply(lambda x: x.strip("[]").replace("'", ""))
# df_test.shape

In [29]:
# df_test.head()

In [30]:
API_URL = "http://rel:5555/api"

In [33]:
%%time
df_test_rel = extract_salient_entities(data[:30], 'title', 'text')

  3%|▎         | 1/30 [00:02<01:12,  2.50s/it]

[]


  7%|▋         | 2/30 [00:04<00:55,  1.97s/it]

[[0, 3, 'Nio', 'Nio,_Kagawa', 0.5552607703400925, 0.6952757835388184, 'ORG']]


 10%|█         | 3/30 [00:10<01:43,  3.83s/it]

[]


 13%|█▎        | 4/30 [00:13<01:32,  3.57s/it]

[[76, 9, 'App Store', 'App_store', 0.7934676308181613, 0.42470258474349976, 'ORG']]


 17%|█▋        | 5/30 [00:14<01:11,  2.85s/it]

[[15, 3, 'FTC', 'Federal_Trade_Commission', 0.5664531389516141, 0.9950453042984009, 'ORG']]


 20%|██        | 6/30 [00:17<01:02,  2.61s/it]

[[7, 4, 'Meta', 'Wikimedia_Foundation', 0.27603076661128806, 0.7138872742652893, 'ORG'], [52, 2, 'AI', 'Artificial_intelligence', 0.8301767561414389, 0.39137348532676697, 'ORG']]


 23%|██▎       | 7/30 [00:18<00:50,  2.20s/it]

[[40, 4, 'TSMC', 'TSMC', 0.3872777678067984, 0.786455512046814, 'ORG']]


 27%|██▋       | 8/30 [00:19<00:41,  1.91s/it]

[[0, 6, 'Google', 'Google', 0.8863024973224573, 0.9923768043518066, 'ORG']]


 30%|███       | 9/30 [00:20<00:35,  1.67s/it]

[]


 33%|███▎      | 10/30 [00:23<00:41,  2.06s/it]

[[0, 4, 'TSMC', 'TSMC', 0.3872777678067984, 0.6393302083015442, 'ORG']]


 37%|███▋      | 11/30 [00:24<00:31,  1.66s/it]

[]


 40%|████      | 12/30 [00:26<00:33,  1.89s/it]

[]


 43%|████▎     | 13/30 [00:31<00:43,  2.57s/it]

[[4, 2, 'EU', 'Europe', 0.5544414210859597, 0.9997134804725647, 'ORG']]


 47%|████▋     | 14/30 [00:34<00:47,  2.95s/it]

[]


 50%|█████     | 15/30 [00:40<00:58,  3.88s/it]

[[43, 5, 'Howey', 'Steve_Howey_(footballer)', 0.8575828481528511, 0.46409735083580017, 'ORG'], [59, 3, 'SEC', 'U.S._Securities_and_Exchange_Commission', 0.9010235252266995, 0.8273033499717712, 'ORG']]


 53%|█████▎    | 16/30 [00:46<01:02,  4.49s/it]

[[0, 3, 'CMS', 'Content_management_system', 0.8383397061154062, 0.9432158470153809, 'ORG']]


 57%|█████▋    | 17/30 [00:48<00:47,  3.69s/it]

[]


 60%|██████    | 18/30 [00:50<00:37,  3.14s/it]

[]


 63%|██████▎   | 19/30 [00:52<00:32,  2.93s/it]

[]


 67%|██████▋   | 20/30 [00:54<00:24,  2.43s/it]

[[20, 6, 'Google', 'Google', 0.8352623390804887, 0.9504076838493347, 'ORG']]


 70%|███████   | 21/30 [00:58<00:25,  2.86s/it]

[]


 73%|███████▎  | 22/30 [01:00<00:22,  2.84s/it]

[[0, 6, 'Uniqlo', 'Uniqlo', 0.3872777678067984, 0.8342874646186829, 'ORG']]


 77%|███████▋  | 23/30 [01:03<00:19,  2.76s/it]

[[44, 5, 'Ponzi', 'Ponzi_scheme', 0.8001520045647987, 0.5998007655143738, 'ORG']]


 80%|████████  | 24/30 [01:07<00:18,  3.14s/it]

[[22, 3, 'Fed', 'Federal_Reserve', 0.42498464007686576, 0.9994378685951233, 'ORG']]


 83%|████████▎ | 25/30 [01:08<00:12,  2.45s/it]

[]


 87%|████████▋ | 26/30 [01:09<00:08,  2.15s/it]

[]


 90%|█████████ | 27/30 [01:12<00:06,  2.21s/it]

[[0, 14, 'Japan Airlines', 'Japan_Airlines', 0.3872777678067984, 0.8287513852119446, 'ORG']]


 93%|█████████▎| 28/30 [01:16<00:05,  2.97s/it]

[[63, 9, 'Food Lion', 'Food_Lion', 0.3872777678067984, 0.6047020852565765, 'ORG']]


 97%|█████████▋| 29/30 [01:20<00:03,  3.02s/it]

[[16, 5, 'Tesla', 'Tesla,_Inc.', 0.5185897485305176, 0.8832705020904541, 'ORG']]


100%|██████████| 30/30 [01:22<00:00,  2.74s/it]

[]
CPU times: user 181 ms, sys: 51.6 ms, total: 232 ms
Wall time: 1min 22s





In [46]:
df_test_rel

[{'raw_news_id': 11890,
  'headline_mentions': [[0,
    3,
    'Nio',
    'Nio,_Kagawa',
    0.5552607703400925,
    0.6952757835388184,
    'ORG']],
  'body_text_mentions': [[0,
    3,
    'Nio',
    'Nio,_Kagawa',
    0.40018616351910574,
    0.4923764765262604,
    'ORG'],
   [5,
    4,
    'NYSE',
    'New_York_Stock_Exchange',
    0.9521911380366453,
    0.592521071434021,
    'ORG'],
   [11,
    3,
    'NIO',
    'Non-blocking_I/O_(Java)',
    0.9549971463307042,
    0.9750804305076599,
    'ORG'],
   [94,
    2,
    'EV',
    'Electric_vehicle',
    0.7760938844161743,
    0.34829381108283997,
    'ORG'],
   [356,
    3,
    'Nio',
    'Nio,_Kagawa',
    0.6561806777425374,
    0.9394270181655884,
    'ORG'],
   [403,
    5,
    'Tesla',
    'Tesla,_Inc.',
    0.6662987499848904,
    0.9134042859077454,
    'ORG'],
   [543, 3, 'Byd', 'BYD_Auto', 0.3872783780234141, 0.9672845602035522, 'ORG'],
   [710,
    6,
    'Fisker',
    'Henrik_Fisker',
    0.9201356676621417,
    0.867009

In [47]:
len(df_test_rel[1]['salient_entities_set'])

IndexError: list index out of range

In [84]:
df_test_rel[1]['salient_entities_set']

{'United_States_Coast_Guard'}

In [85]:
df_test_rel[0]['salient_entities_set']

{'None'}

In [87]:
len(df_test_rel[1]['salient_entities_org'])

6

In [89]:
len([
    [17, 16, 'U.S. Coast Guard', 'United_States_Coast_Guard', 0.8891559804540544, 0.8521444400151571, 'ORG'], 
    [192, 4, 'USCG', 'United_States_Coast_Guard', 0.9111871297938334, 0.47667717933654785, 'ORG'], 
    [791, 17, 'U.S. Coast Guard.', 'United_States_Coast_Guard', 0.3872777678067984, 0.8202411532402039, 'ORG'], 
    [816, 16, 'U.S. Coast Guard', 'United_States_Coast_Guard', 0.43678396048090334, 0.7355291048685709, 'ORG'], 
    [918, 16, 'U.S. Coast Guard', 'United_States_Coast_Guard', 0.47601745960158903, 0.8914013306299845, 'ORG']
])

5

In [90]:
df_test_rel[1]['salient_entities_org']

[[17,
  16,
  'U.S. Coast Guard',
  'United_States_Coast_Guard',
  0.30562666525715215,
  0.8639779289563497,
  'ORG'],
 [190,
  4,
  'USCG',
  'United_States_Coast_Guard',
  0.7504895236536515,
  0.9649078845977783,
  'ORG'],
 [297,
  11,
  'Coast Guard',
  'United_States_Coast_Guard',
  0.8574496275930025,
  0.8526458442211151,
  'ORG'],
 [777,
  16,
  'U.S. Coast Guard',
  'United_States_Coast_Guard',
  0.27727612688077324,
  0.8181795080502828,
  'ORG'],
 [800,
  16,
  'U.S. Coast Guard',
  'United_States_Coast_Guard',
  0.2773915748522871,
  0.8772232333819071,
  'ORG'],
 [902,
  16,
  'U.S. Coast Guard',
  'United_States_Coast_Guard',
  0.2773879666549176,
  0.8078232804934183,
  'ORG']]

In [18]:
def extract_salient_entities(
    data: list,
    API_URL: str = "http://rel:5555/api",
    title: str = "title",
    article: str = "text",
    id: str = "id",
):
    annotated_articles = []
    processed_ids = []

    for row in data:
        # row = row.dict()

        if len(row[title].split()) > 3300 or len(row[article].split()) > 3300:
            continue

        try:
            # Perform mention detection on headline and body text
            el_title = requests.post(
                API_URL, json={"text": row[title], "spans": []}
            ).json()
            el_article = requests.post(
                API_URL, json={"text": row[article], "spans": []}
            ).json()
        except Exception as e:
            continue

        # Filter mentions with the ORG tag
        headline_mentions_org = [
            mention for mention in el_title if mention[-1] == "ORG"
        ]
        body_text_mentions_org = [
            mention for mention in el_article if mention[-1] == "ORG"
        ]

        # Mark salient entities
        salient_entities_org = []
        for body_entity in body_text_mentions_org:
            if body_entity[3] in [
                headline_entity[3] for headline_entity in headline_mentions_org
            ]:
                salient_entities_org.append(body_entity)

        if salient_entities_org:
            salient_entities_org_set = set(
                [entity[3] for entity in salient_entities_org]
            )
        else:
            salient_entities_org_set = {"None"}

        # Save the annotated article
        annotated_articles.append(
            {
                "raw_news_id": row[id],
                "headline_mentions": el_title,
                "body_text_mentions": el_article,
                "salient_entities_org": salient_entities_org,
                "salient_entities_set": salient_entities_org_set,
            }
        )
        processed_ids.append(row[id])

    return annotated_articles


In [19]:
df_test_rel = extract_salient_entities([data[0], data[1]], 'title', 'text')

In [21]:
df_test_rel

[]