In [1]:
import os

import sqlalchemy
import psycopg2
from sqlalchemy import create_engine

import pandas as pd

from REL.entity_disambiguation import EntityDisambiguation
from REL.mention_detection import MentionDetection 
from REL.ner import Cmns, load_flair_ner
from REL.utils import process_results
# from REL.server import make_handler

# Notebook display settings: cap long frames at 50 rows, but never hide
# columns so wide frames are shown in full.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'REL'

In [31]:
def preprocessing(row: pd.Series) -> dict:
    """Convert one article row into the document mapping used downstream.

    Args:
        row: Record providing the 'title' and 'article' fields.

    Returns:
        A dict with the keys 'title' and 'article' (in that order); each value
        is a two-element list holding the field's text followed by a new,
        empty list.
    """
    return {field: [row[field], []] for field in ('title', 'article')}

def find_mentions(text, mention_detection, tagger_ner):
    """Run mention detection and return only the detected mentions.

    Args:
        text: Input forwarded unchanged to ``mention_detection.find_mentions``.
        mention_detection: Object exposing ``find_mentions(text, tagger)`` that
            returns a ``(mentions, count)`` pair.
        tagger_ner: Tagger forwarded unchanged to the detector.

    Returns:
        The first element of the pair; the mention count is discarded.
    """
    detected, _count = mention_detection.find_mentions(text, tagger_ner)
    return detected

def disambiguate_entities(mentions_dataset, entity_disambiguation):
    """Run entity disambiguation and return only the predictions.

    Args:
        mentions_dataset: Input forwarded unchanged to
            ``entity_disambiguation.predict``.
        entity_disambiguation: Object exposing ``predict(mentions)`` that
            returns a ``(predictions, timing)`` pair.

    Returns:
        The first element of the pair; the timing value is discarded.
    """
    resolved, _elapsed = entity_disambiguation.predict(mentions_dataset)
    return resolved

def _placeholder_mentions():
    """Return a fresh one-element placeholder mention list.

    A new list is built on every call so that the placeholder is never shared
    between the 'title' and 'article' fields or between rows.
    """
    return [{
        'mention': 'None',
        # NOTE(review): the parentheses do not create a tuple; this is the
        # plain string 'None'. Kept as in the original - confirm intent.
        'context': ('None'),
        'candidates': [['', 0.0]],
        'gold': [''],
        'pos': 0,
        'sent_idx': 0,
        'ngram': '',
        'end_pos': 0,
        'sentence': '',
        'conf_md': 0.0,
        'tag': '',
        'sent': ''
    }]


def main(dataset, base_url, wiki_version, model_alias, tagger, df_name):
    """Annotate articles with ORG mentions and mark the salient ones.

    Every row of ``dataset`` is converted with ``preprocessing``, run through
    mention detection and entity disambiguation, and post-processed with
    ``process_results``. Only result tuples whose last element is ``'ORG'``
    are kept. A body-text mention is marked salient when its element at
    index 3 also occurs at index 3 of a headline mention (presumably the
    linked entity - confirm against ``process_results``). Rows with no ORG
    mention, or no predictions, for either the headline or the body text are
    skipped.

    Args:
        dataset: DataFrame whose rows provide 'title' and 'article'.
        base_url: Passed to ``MentionDetection`` and ``EntityDisambiguation``.
        wiki_version: Passed to ``MentionDetection`` and
            ``EntityDisambiguation``.
        model_alias: Used as ``model_path`` in the disambiguation config.
        tagger: Tagger handed to the mention detector.
        df_name: Path of the CSV file to write; a falsy value skips saving.

    Returns:
        DataFrame with the columns 'headline', 'body_text',
        'headline_mentions', 'body_text_mentions', 'salient_entities' and
        'salient_entities_list'. It is empty (but has these columns) when no
        row qualifies.
    """
    # The loop relies on tqdm for progress reporting; import it here so the
    # function does not depend on a notebook-level import, and fall back to a
    # plain pass-through when the package is unavailable.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    output_columns = [
        'headline',
        'body_text',
        'headline_mentions',
        'body_text_mentions',
        'salient_entities',
        'salient_entities_list',
    ]

    mention_detection = MentionDetection(base_url, wiki_version)
    config = {
        "mode": "eval",
        "model_path": model_alias,
    }
    entity_disambiguation = EntityDisambiguation(base_url, wiki_version, config)

    annotated_articles = []

    for _, row in progress(dataset.iterrows(), total=len(dataset)):
        processed_row = preprocessing(row)

        # Perform mention detection on headline and body text
        mentions_dataset = find_mentions(processed_row, mention_detection, tagger)

        # Insert a placeholder if title or article mentions are empty
        for field in ('title', 'article'):
            if not mentions_dataset.get(field):
                mentions_dataset[field] = _placeholder_mentions()

        # Disambiguate detected mentions
        mentions_disambiguated = disambiguate_entities(mentions_dataset, entity_disambiguation)
        result = process_results(mentions_dataset, mentions_disambiguated, processed_row)

        # Filter mentions with the ORG tag
        headline_mentions = [m for m in result.get('title', []) if m[-1] == 'ORG']
        body_text_mentions = [m for m in result.get('article', []) if m[-1] == 'ORG']

        # Skip rows without usable entities in the headline or the body text
        if not headline_mentions or not mentions_disambiguated.get('title'):
            continue
        if not body_text_mentions or not mentions_disambiguated.get('article'):
            continue

        # Mark salient entities: body mentions whose entity also occurs in the
        # headline. The headline entities are collected once per row.
        headline_entities = {m[3] for m in headline_mentions}
        salient_entities = [m for m in body_text_mentions if m[3] in headline_entities]

        # De-duplicate while keeping first-seen order so output is reproducible
        salient_entities_list = list(dict.fromkeys(m[3] for m in salient_entities))

        # Save the annotated article
        annotated_articles.append({
            'headline': processed_row['title'][0],
            'body_text': processed_row['article'][0],
            'headline_mentions': headline_mentions,
            'body_text_mentions': body_text_mentions,
            'salient_entities': salient_entities,
            'salient_entities_list': salient_entities_list,
        })

    # Build the frame once, after the loop, so it exists even when every row
    # was skipped or the dataset is empty.
    annotated_articles_df = pd.DataFrame(annotated_articles, columns=output_columns)

    # Save the annotated articles as a CSV file (must happen before returning)
    if df_name:
        annotated_articles_df.to_csv(df_name, index=False)

    return annotated_articles_df

In [2]:
# Database connection settings are read from the environment so that no
# credentials live in the notebook; a variable that is not set yields None.
db_host, db_port, db_name, db_user, db_pass = map(
    os.getenv,
    ('DB_HOST', 'DB_PORT', 'DB_NAME', 'DB_USER', 'DB_PASS'),
)

In [3]:
# Create the connection URL from its components instead of interpolating them
# into a string: URL.create escapes special characters (such as '@', ':' or
# '/') in the user name and password, which would otherwise corrupt the URL.
connection_url = sqlalchemy.engine.URL.create(
    drivername='postgresql',
    username=db_user,
    password=db_pass,
    host=db_host,
    port=int(db_port) if db_port else None,
    database=db_name,
)
# String form kept for code that expects `connection_str`.
# It contains the password - do not display or log it.
connection_str = connection_url.render_as_string(hide_password=False)

# Create the engine
engine = create_engine(connection_url)

In [4]:
# Load every column of the raw articles table into a DataFrame.
query = 'SELECT * FROM raw_news_articles'
df = pd.read_sql(sql=query, con=engine)

In [5]:
# Preview the first rows of the raw articles as loaded from the database.
df.head()

Unnamed: 0,id,title,text,publish_date,publish_date_source,authors,canonical_link,feed_link,media_link,media_title,is_parsed,exception_class,exception_text,url_hash,date_created
0,1,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,2023-11-30 00:00:00+00:00,parsed,"[Akash Sriram Hyunjoo Jin Abhirup Roy, Akash S...",https://www.reuters.com/business/autos-transpo...,https://news.google.com/rss/articles/CBMie2h0d...,https://www.reuters.com,Reuters,True,,,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,2023-12-01 06:05:36
1,2,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,2023-11-30 00:00:00+00:00,parsed,"[Jonathan Vanian, In]",https://www.cnbc.com/2023/11/30/read-linda-yac...,https://news.google.com/rss/articles/CBMiZmh0d...,https://www.cnbc.com,CNBC,True,,,63329ae2128c913d54235a98b155a792c49b8adad89d59...,2023-12-01 06:05:38
2,3,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,2023-11-30 22:10:15+00:00,parsed,[Jill Goldsmith],https://deadline.com/2023/11/disney-reinstates...,https://news.google.com/rss/articles/CBMiaWh0d...,https://deadline.com,Deadline,True,,,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,2023-12-01 06:05:39
3,4,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,2023-11-30 00:00:00+00:00,parsed,[Phil Rosen],https://markets.businessinsider.com/news/commo...,https://news.google.com/rss/articles/CBMihQFod...,https://markets.businessinsider.com,Markets Insider,True,,,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,2023-12-01 06:05:41
4,5,Dow Jones Hits 2023 High As Salesforce Soars; ...,,NaT,,,,https://news.google.com/rss/articles/CBMijgFod...,https://www.investors.com,Investor's Business Daily,False,ArticleException,Article `download()` failed with 403 Client Er...,03cedd18dd1facb6574b7a241aa689a6401b63a6653a15...,2023-12-01 06:05:42


In [6]:
# Column dtypes and non-null counts of the raw articles.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3860 entries, 0 to 3859
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   id                   3860 non-null   int64              
 1   title                3860 non-null   object             
 2   text                 2570 non-null   object             
 3   publish_date         2570 non-null   datetime64[ns, UTC]
 4   publish_date_source  2570 non-null   object             
 5   authors              2570 non-null   object             
 6   canonical_link       2570 non-null   object             
 7   feed_link            3860 non-null   object             
 8   media_link           3860 non-null   object             
 9   media_title          3860 non-null   object             
 10  is_parsed            3860 non-null   bool               
 11  exception_class      1290 non-null   object             
 12  exception_text      

In [7]:
# Prompt prefix: instructs the model to return only the companies that matter
# to the news, as a Python-style list. It is prepended to each article when
# the `query` column is built.
start = "You are an investor who reads financial and business news. There may be many companies mentioned in the news, but not all of them are affected by the news. You want to understand which companies in the news are really affected by the news from an investor's point of view.  I'll send you the headline and body of the news and you extract only the companies important to the news in the format ['Company1', 'Company2']"
# Prompt suffix: repeats the required output format. It is appended after the
# body text. The text is runtime data and must stay byte-identical to keep new
# queries comparable with already generated ones.
end = " | ATTENTION!!! I DON’T WANT YOU ADVICES HOW TO DO IT!!!! REMEMBER YOU HAVE TO EXTRACT the salient companies as [‘Company1’, 'Company2’] (this is example of format) keeping in mind you are an investor who has filtered a list of news on a particular company. You want to see only those news that can really affect the company. REMEMBER FORMAT [‘Company1’, 'Company2’]!!!!!!!"

  tokens_by_line = make_tokens_by_line(lines)


In [8]:
# Keep only successfully parsed articles, attach the full prompt for each one
# and reduce the frame to the columns needed for labelling.
# The steps are chained through `.assign` so the new column is added to a
# fresh frame rather than to a filtered slice of the original (which triggers
# pandas' SettingWithCopyWarning).
df = (
    df.loc[df['is_parsed'].eq(True)]
    .assign(
        query=lambda parsed: (
            start
            + ' | HEADLINE: ' + parsed['title']
            + ' | BODYTEXT: ' + parsed['text']
            + end
        )
    )
    .loc[:, ['id', 'url_hash', 'title', 'text', 'query']]
)
df.head()

Unnamed: 0,id,url_hash,title,text,query
0,1,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,You are an investor who reads financial and bu...
1,2,63329ae2128c913d54235a98b155a792c49b8adad89d59...,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,You are an investor who reads financial and bu...
2,3,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,You are an investor who reads financial and bu...
3,4,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,You are an investor who reads financial and bu...
5,6,7dacd5458df4eb9d566c52b14ab1d59cf10ce37f0822e6...,S&P 500’s Historic 8.9% Rally Blindsides Skept...,(Bloomberg) -- The US stock market just posted...,You are an investor who reads financial and bu...


In [9]:
# Export the prompts to a spreadsheet, without the index column.
df.to_excel(excel_writer='raw_news_articles_202312141455.xlsx', index=False)

In [16]:
# Load the spreadsheet that carries the additional `target` column.
df_target = pd.read_excel(io='target_raw_news_articles_202312141455.xlsx')

In [17]:
# Keep only the rows that actually have a target value.
df_target = df_target[df_target['target'].notna()]
df_target.head()

Unnamed: 0,id,url_hash,title,text,query,target
0,1,83bc522d93ce214c43182256b8f805b46a5c2d39f214d4...,"Starting at $60,990, Tesla's Cybertruck is pri...",Nov 30 (Reuters) - Tesla's (TSLA.O) long-delay...,You are an investor who reads financial and bu...,['Tesla']
1,2,63329ae2128c913d54235a98b155a792c49b8adad89d59...,Read Linda Yaccarino’s message to X employees ...,Linda Yaccarino sent a memo to employees of X ...,You are an investor who reads financial and bu...,['X (formerly Twitter)']
2,3,694a1ed27646c306ab80db39a68cb48f3d1e1f8ca90e15...,"Disney Reinstates Dividend, Amends Bylaws Amid...",Disney today announced a cash dividend of $0.3...,You are an investor who reads financial and bu...,['Disney']
3,4,69e1ad3a026abd7ed076ba72aa18884568238cfb0a23ab...,"From affordability to property demand, here ar...",High mortgage rate and home prices sidelined m...,You are an investor who reads financial and bu...,['Zillow']
4,6,7dacd5458df4eb9d566c52b14ab1d59cf10ce37f0822e6...,S&P 500’s Historic 8.9% Rally Blindsides Skept...,(Bloomberg) -- The US stock market just posted...,You are an investor who reads financial and bu...,['None']


In [18]:
# Number of rows with a target, and number of columns.
df_target.shape

(148, 6)

In [29]:
# Sanity check: join the targets back onto the prompts by hash and id; the row
# count shows how many target rows have a matching article.
df_check = pd.merge(
    df,
    df_target[['id', 'url_hash', 'target']],
    how='inner',
    on=['url_hash', 'id'],
)
df_check.shape

(148, 6)

In [30]:
# Number of distinct URL hashes in the merged frame; compare it with the row
# count to check that the join produced no duplicates.
len({*df_check['url_hash']})

148

In [32]:
# Location of the REL data directory. The path is machine-specific, so it can
# be overridden with the REL_BASE_URL environment variable; the previous
# hard-coded location remains the default.
base_url = os.getenv(
    'REL_BASE_URL',
    '/home/ec2-user/environments/styx_env/styx/data/REL/',
)
# Wiki version and disambiguation model alias used by the REL components.
wiki_version = 'wiki_2019'
model_alias = 'ed-wiki-2019'

In [35]:
# Build the two taggers. Both names come from `REL.ner`, so this cell fails
# with a NameError when the import cell did not complete.
# Presumably a Flair NER model selected by the name "ner-fast" - confirm
# against the REL documentation.
tagger_ner = load_flair_ner("ner-fast")
# Presumably an n-gram based tagger with n-grams up to length 5 - TODO confirm
# the meaning of `n`.
tagger_ngram = Cmns(base_url, wiki_version, n=5)

NameError: name 'load_flair_ner' is not defined