# Events Example

In [1]:
import signal_api.signalAPI as signalAPI
import pandas as pd
from itertools import groupby, islice
import os
from tqdm import tqdm
from textwrap import wrap

# Display configuration for the result tables shown further down:
# max_colwidth is set to 0 and MathJax rendering of HTML output is turned off.
pd.options.display.max_colwidth = 0
pd.set_option('display.html.use_mathjax', False)

## Connect to the API

In [2]:
# Credentials are read from the environment (never hard-coded); a missing
# variable raises KeyError here rather than failing later inside the client.
client_id, client_secret = (
    os.environ[variable] for variable in ('SIGNAL_API_ID', 'SIGNAL_API_SECRET')
)
signal_api = signalAPI.Connection(
    client_id=client_id,
    client_secret=client_secret,
)

## Search For Entity And Topic IDs 

In [3]:
# Look up entities by name; the 'id' in the result is the value used in the
# `entities` filter further down.
list(
    signal_api.entities({'name': 'Long COVID'})
)

[{'id': '1021d2f6-e57a-308c-b232-d6c6f2c2588e',
  'type': 'disease',
  'name': 'Long COVID'}]

In [4]:
# Look up topics by name; the 'id' values in the result are the ones used in
# the `topics` filter further down.
list(
    signal_api.topics({'name': 'Healthcare'})
)

[{'id': 'ec838d2b-49db-457a-b42f-2889fb86b2eb',
  'name': 'Healthcare',
  'description': 'The topic "healthcare" includes all coverage related to the healthcare industry. Keywords include Healthcare, treatment of disease, preventive healthcare, healthcare issues, health insurance, health benefits & coverage, health reforms, publicly funded national healthcare system,  healthcare crisis, mental health care, healthcare quality improvement, healthcare cost containment, healthcare right or privilege, healthcare patient safety, healthcare Infection control practices, pervasive healthcare, health care challenges, healthcare cost, healthcare-associated infections.',
  'private': False},
 {'id': '0d6c8ffc-d2fd-47bc-b304-fff55e76cd19',
  'name': 'Healthcare Crisis',
  'private': False},
 {'id': 'c3f7b9d6-9420-46fb-abb2-490382212189',
  'name': 'Transformation of Healthcare',
  'description': 'Transformation of healthcare is the topic that covers content related to changing healthcare due to adv

## Search events with the following filters (Entities, Topics, Labels and Sources)

In [27]:
# Filters applied to the events query, each kept as a named collection.
# An empty collection means "do not filter on this dimension".
#
# `labels` and `sources` are always defined (empty by default) so that the
# cells below never depend on a value left over from an earlier kernel run.

# Entities as {display name: entity id}.
# Set to {} to use all signal entities.
entities = {
    'Long Covid': '1021d2f6-e57a-308c-b232-d6c6f2c2588e',
}

# Topics as {display name: topic id}.
# Set to {} to use all signal topics.
topics = {
    'Healthcare': 'ec838d2b-49db-457a-b42f-2889fb86b2eb',
    'Healthcare Crisis': '0d6c8ffc-d2fd-47bc-b304-fff55e76cd19',
    'Transformation of Healthcare': 'c3f7b9d6-9420-46fb-abb2-490382212189'
}

# Labels as a set of free-text strings. A label can contain a maximum of 6 words.
# Example:
# labels = {
#     'analysis of workers compensation claims'
# }
labels = set()

# Sources as {display name: source id}.
# Example:
# sources = {
#     'Yahoo News UK': '008a1423-3f16-4de1-8b7e-b0ee2b793655',
#     'Washington Post' : '38663047-4bae-42e8-9474-bd6201f1f492'
# }
sources = {}

## Use the last year as a date range

In [6]:
# The window ends yesterday and starts 365 days before that.
# Both bounds are daily pandas Periods.
end_date = pd.to_datetime('today').to_period(freq='D')
end_date -= 1
start_date = end_date - 365
(start_date, end_date)

(Period('2022-04-04', 'D'), Period('2023-04-04', 'D'))

### Query the Events API

In [28]:
# Base query: restrict events to the [start_date, end_date] window.
query = {
    "where": {
        "date": {
            "gte": str(start_date),
            "lte": str(end_date),
        },
    },
    "size": 1000,
}

# Each filter below is added only when it has values, so an empty collection
# leaves that dimension unrestricted.

if entities:
    query["where"]["entities"] = {
        "id": {
            # include events about any of the entities
            "any": list(entities.values())

            # note you can search for events about all of the entities too
            # "all": list(entities.values())
        }
    }

if topics:
    query["where"]["topics"] = {
        "id": {
            # include events about any of the topics
            "any": list(topics.values())

            # note you can search for events about all of the topics too
            # "all": list(topics.values())
        }
    }

# `labels` and `sources` are optional and may not be defined at all,
# hence the globals() guard before reading them.
if 'labels' in globals() and labels:
    query["where"]["labels"] = {
        # include events about any of the labels
        "any": list(labels)

        # note you can search for events about all of the labels too
        # "all": list(labels)
    }

if 'sources' in globals() and sources:
    query["where"]["sources"] = {
        "id": {
            # include events about any of the sources
            "any": list(sources.values())

            # note you can search for events about all of the sources too
            # "all": list(sources.values())
        }
    }


response = signal_api.events(query)
# remove results that come from a small number of sources
response = (event for event in response if event['source-count-global'] >= (event['story-count-global'] / 5))
# take the first 10 events; islice stops consuming the response after 10
# matches instead of materialising every result before slicing
events = list(islice(response, 10))

## Define a function for searching documents by story ID

In [25]:
def search_by_story_id(story_ids: list, entity_ids: list, topic_ids: list, source_ids: list = None, n=3):
    """
    Return at most ``n`` documents, one per story id, ordered like ``story_ids``.

    The search endpoint is queried for documents whose story id is any of
    ``story_ids``. The first document returned for each story is kept, and the
    documents of the first ``n`` stories (in ``story_ids`` order) that had a
    match are returned.

    Parameters
    ----------
    story_ids : iterable of story ids; its order defines the result order.
    entity_ids : entity ids to filter on ("any" match, with 'salient-only'
        set to True); an empty value skips the entity filter.
    topic_ids : topic ids to filter on ("any" match); an empty value skips
        the topic filter.
    source_ids : optional source ids to filter on ("any" match).
    n : maximum number of documents (i.e. distinct stories) to return.

    Returns
    -------
    list of document dicts, at most ``n`` long.

    Notes
    -----
    Relies on the notebook-level ``signal_api`` connection.
    """
    # Materialise once: story_ids is used both for ranking and in the query,
    # so a generator argument would otherwise be exhausted after the first use.
    story_ids = list(story_ids)

    # record the ordering of the story_ids
    rank = {story_id: position for position, story_id in enumerate(story_ids)}

    # query the search endpoint
    params = {
        'where': {
            'story-id': {
                'any': story_ids
            },
        },
        'size': 500
    }

    if entity_ids:
        params['where']['entities'] = {
            'id': {'any': entity_ids},
            'salient-only': True
        }

    if topic_ids:
        params['where']['topics'] = {
            'id': {'any': topic_ids}
        }

    if source_ids:
        params['where']['source'] = {
            'id': {'any': source_ids}
        }

    documents = (item['document'] for item in signal_api.search(params))

    # put the results in the requested story order; sorted() is stable, so
    # documents of the same story keep the order the API returned them in
    ordered = sorted(documents, key=lambda document: rank[document['story-id']])

    # deduplicate: keep only the first document of each of the first n stories
    first_n_stories = islice(groupby(ordered, key=lambda document: document['story-id']), n)
    return [next(story_documents) for _, story_documents in first_n_stories]

### Get the documents from the Search API

In [29]:
# Filters that are identical for every event's document lookup.
document_filters = {
    'entity_ids': list(entities.values()),
    'topic_ids': list(topics.values()),
}

# `sources` is optional and may not be defined at all
if 'sources' in globals():
    document_filters['source_ids'] = list(sources.values())

# The story ids differ per event, so they must be read inside the loop.
# Reading them beforehand would fail on a fresh kernel (no `event` yet) or
# attach one stale event's documents to every event.
for event in tqdm(events):
    event['documents'] = search_by_story_id(story_ids=event['story-ids'], **document_filters)
    

100%|██████████| 1/1 [00:02<00:00,  2.22s/it]


### Display the Results

In [30]:
def _event_summary(event):
    """Flatten one event dict into the row displayed in the results table."""
    entity_names = [entity['name'] for entity in event['entities']]
    topic_names = [topic['name'] for topic in event['topics']]
    headlines = [document['title'] for document in event['documents']]
    return {
        'hash': event['hash'],
        'date': event['date'],
        'story-count-global': event['story-count-global'],
        'source-count-global': event['source-count-global'],
        'entities': entity_names,
        'topics': topic_names[:3],  # just use the first 3 topics
        'labels': event['labels'],
        'headlines': headlines,
    }


# one row per event, in the same order as `events`
events_df = pd.DataFrame([_event_summary(event) for event in events])
events_df

Unnamed: 0,hash,date,story-count-global,source-count-global,entities,topics,labels,headlines
0,02f1-9b9fb90e-133b442d-95caae33-aab16fe1,2023-01-24,39,315,"[Long COVID, COVID 19]","[Labour Market, Employee Remuneration, Health Technology]","[long covid, new york state insurance fund, analysis of workers compensation claims, work, workers compensation]","[Organ damage persists in nearly 60% of COVID-19 patients for a year after initial diagnosis: study, Organ damage persists in nearly 60% of long-term COVID-19 patients one year after initial diagnosis: study, Organ damage persists in 59% of long Covid patients a year after diagnosis | Health]"


## Search for an event by hash

In [31]:
# Retrieving an event by its hash returns additional metadata about the event,
# such as source ids and additional story ids.

# use the hash of the first event in the results table
event_hash = events_df['hash'].iloc[0]
event = signal_api.get_event(event_hash)

# show the names of the first 10 sources that mentioned this event
[
    signal_api.get_source(source_id)['source']['name']
    for source_id in tqdm(event['source-ids'][:10])
]

100%|██████████| 10/10 [00:04<00:00,  2.21it/s]


['Claiborne Progress',
 'KULR 8',
 'Hazard-Herald.com',
 'MyFox Yakima',
 'Washington Daily News',
 'Helena Independent',
 'Fremont Tribune',
 'Racine Journal Times',
 'Herald & Review',
 'Statesville Record & Landmark']