In [2]:
import os

import pandas as pd
from tqdm import tqdm
import requests


In [3]:
def extract_salient_entities(
    data: list,
    title: str = "title",
    article: str = "article",
    id: str = "id",
):

    annotated_articles = []

    for row in tqdm(data, total=len(data)):

        if len(row[title].split()) > 3300 or len(row[article].split()) > 3300:
            continue

        # Perform mention detection on headline and body text
        el_title = requests.post(API_URL, json={
            "text": row[title],
            "spans": []
        }).json()
        el_article = requests.post(API_URL, json={
            "text": row[article],
            "spans": []
        }).json()

        # Filter mentions with the ORG tag
        headline_mentions_org = [
            mention for mention in el_title if mention[-1] == "ORG"
        ]
        body_text_mentions_org = [
            mention for mention in el_article if mention[-1] == "ORG"
        ]

        # Mark salient entities
        salient_entities_org = []
        for body_entity in body_text_mentions_org:
            if body_entity[3] in [
                headline_entity[3] for headline_entity in headline_mentions_org
            ]:
                salient_entities_org.append(body_entity)

        if salient_entities_org:
            salient_entities_org_set = set([entity[3] for entity in salient_entities_org])
        else:
            salient_entities_org_set = {'None'}
            
        # Save the annotated article
        annotated_articles.append(
            {
                "raw_news_id": row[id],
                # "headline": row[title],
                # "body_text": row[article],
                "headline_mentions": el_title,
                "body_text_mentions": el_article,
                "salient_entities_org": salient_entities_org,
                "salient_entities_set": salient_entities_org_set,
            }
        )

    return annotated_articles

In [4]:
API_URL = "http://rel:5555/api"

In [5]:
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_pass = os.getenv('DB_PASS')

In [6]:
import psycopg2
import json

# Database connection parameters
conn_params = {
    "host": os.getenv('DB_HOST'),
    "port": os.getenv('DB_PORT'),
    "database": os.getenv('DB_NAME'),
    "user": os.getenv('DB_USER'),
    "password": os.getenv('DB_PASS')
}

# Connect to the PostgreSQL database
conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()

# SQL query to fetch the data
query = "SELECT id, title, text FROM raw_news_articles WHERE is_parsed = True;"

# Execute the query
cursor.execute(query)

# Fetch all rows
rows = cursor.fetchall()

# Close the connection
cursor.close()
conn.close()

In [7]:
rows[:1]

[(7341,
  'Will Solana (SOL) Continue Outperforming Ethereum (ETH)? Elusiv Co-Founder Weighs In',
  'Solana (SOL) has been one of the best-performing cryptocurrencies in 2023, starkly contrasting Ethereum (ETH).\n\nWhile SOL has increased by more than 1,000%, ETH has only moved upward by 90%. This raises the question: Can Solana flip Ethereum next year?\n\nWhat’s Behind the Solana Increase?\n\nSpeaking to BeInCrypto, Julian Deschler, the Co-founder of Elusiv, discussed some of the reasons for the Solana price increase this year.\n\nElusiv is a universal encryption layer for Web3, empowering the decentralized world. Its launch on the Solana Mainnet in March 2023 followed a successful $3.5 million seed round in November.\n\nMr. Deschler believes the reason for SOL’s outperformance compared to ETH lies in its technological capabilities—fast and cost-effective transactions, eliminating the need for high gas fees in swaps or DeFi interactions.\n\nAlso, SOL gained prominence this year throug

In [8]:
# Convert rows to JSON
data = [{"id": row[0], "title": row[1], "text": row[2]} for row in rows]
json_data = json.dumps(data)

In [9]:
data = json.loads(json_data)

In [13]:
data[0]

{'id': 7341,
 'title': 'Will Solana (SOL) Continue Outperforming Ethereum (ETH)? Elusiv Co-Founder Weighs In',
 'text': 'Solana (SOL) has been one of the best-performing cryptocurrencies in 2023, starkly contrasting Ethereum (ETH).\n\nWhile SOL has increased by more than 1,000%, ETH has only moved upward by 90%. This raises the question: Can Solana flip Ethereum next year?\n\nWhat’s Behind the Solana Increase?\n\nSpeaking to BeInCrypto, Julian Deschler, the Co-founder of Elusiv, discussed some of the reasons for the Solana price increase this year.\n\nElusiv is a universal encryption layer for Web3, empowering the decentralized world. Its launch on the Solana Mainnet in March 2023 followed a successful $3.5 million seed round in November.\n\nMr. Deschler believes the reason for SOL’s outperformance compared to ETH lies in its technological capabilities—fast and cost-effective transactions, eliminating the need for high gas fees in swaps or DeFi interactions.\n\nAlso, SOL gained promine

In [None]:
{'id': some int ID,
 'title': 'News title',
 'text': 'News test'}


In [41]:
# df_test = pd.read_excel('target_raw_news_articles_202312141455.xlsx')
# df_test = df_test[~df_test['target'].isna()]
# df_test['target'] = df_test['target'].apply(lambda x: x.strip("[]").replace("'", ""))
# df_test.shape

In [42]:
# df_test.head()

In [53]:
df_test_rel = main(data[:10], 'title', 'text')

100%|██████████| 10/10 [00:18<00:00,  1.83s/it]


In [52]:
df_test_rel[0]

{'raw_news_id': 7341,
 'headline': 'Will Solana (SOL) Continue Outperforming Ethereum (ETH)? Elusiv Co-Founder Weighs In',
 'body_text': 'Solana (SOL) has been one of the best-performing cryptocurrencies in 2023, starkly contrasting Ethereum (ETH).\n\nWhile SOL has increased by more than 1,000%, ETH has only moved upward by 90%. This raises the question: Can Solana flip Ethereum next year?\n\nWhat’s Behind the Solana Increase?\n\nSpeaking to BeInCrypto, Julian Deschler, the Co-founder of Elusiv, discussed some of the reasons for the Solana price increase this year.\n\nElusiv is a universal encryption layer for Web3, empowering the decentralized world. Its launch on the Solana Mainnet in March 2023 followed a successful $3.5 million seed round in November.\n\nMr. Deschler believes the reason for SOL’s outperformance compared to ETH lies in its technological capabilities—fast and cost-effective transactions, eliminating the need for high gas fees in swaps or DeFi interactions.\n\nAlso, S