In [16]:
import signalAPI
import pandas as pd
from itertools import groupby, islice
import os
from tqdm import tqdm
from textwrap import wrap

# Display options for the result tables rendered later in the notebook.
# Disable MathJax so "$" characters inside table text are not rendered as maths.
pd.set_option('display.html.use_mathjax', False)
# Column-width limit of 0 (intended to show long headline text in full -
# NOTE(review): verify the effect against the pandas version in use).
pd.options.display.max_colwidth = 0

# Getting Started with the Events API

## Introduction

The Events API endpoint identifies clusters of similar news coverage allowing users to easily identify news events that could impact them or their business.

Examples of questions this data can answer include:

* Have there been any events involving Company X in the past day/month/year/15 months?
* What are the latest developments in the food and beverage industry?
* Have there been any product recalls recently?
* What events have there been that prominently mention the string 'COP27'?

The data returned includes entities and topics involved, their prominence scores, alongside the number of stories describing the event (a measure of "newsworthiness"), and a group of most descriptive keywords and phrases.

## Preliminaries - A Minimal API Client

Below is a simple API client which we will use to abstract away the details of each individual API call. 
If you're not interested in the details, you can skim over the code block below for now.

In [78]:
import requests
import json
import backoff
from urllib.parse import urlparse, parse_qs


def response_to_url(response) -> str:
    """Return the URL that produced `response`, reduced to scheme, host and path.

    The query string (and anything after it) is dropped so the result can be
    re-used as the base URL of a follow-up request.
    """
    parts = urlparse(response.request.url)
    return "".join((parts.scheme, "://", parts.netloc, parts.path))


def response_to_params(response) -> dict:
    """Return the query-string parameters of the request behind `response`.

    The result maps each parameter name to the list of values it had in the
    URL, as produced by `urllib.parse.parse_qs`.
    """
    query_string = urlparse(response.request.url).query
    return parse_qs(query_string)


def response_to_body(response) -> dict:
    """Return the JSON body of the request behind `response`.

    A request that was sent without a body yields an empty dict.
    """
    raw = response.request.body
    if raw is None:
        return {}
    # the body is decoded from bytes before being parsed as JSON
    return json.loads(raw.decode())


def fatal_code(e):
    """Decide whether a failed request should NOT be retried.

    Used as the `giveup` predicate of the backoff decorators in this file.

    Args:
        e: the exception raised by the request; HTTP errors carry the
            failing response in `e.response`.

    Returns:
        True when retrying is pointless (any 4xx/5xx status other than 429
        and 401), False when the request may be retried.
    """
    response = getattr(e, "response", None)
    # Connection errors and timeouts have no HTTP response attached. Compare
    # with None explicitly (a requests response with an error status is falsy)
    # and treat the failure as transient so that it is retried.
    if response is None:
        return False

    status = response.status_code
    # too many requests - slow down
    if status == 429:
        return False
    # not authorised - re authenticate
    if status == 401:
        return False
    # every other client or server error is fatal
    return 400 <= status < 600


class Paginate:
    """
    Iterate over the pages of a cursor-paginated API response.

    Construct it with the first response of a GET or POST request. Each
    iteration returns one page (a response object). The following page is
    requested by re-sending the same request with the page's "next-cursor"
    value as "from-cursor"; iteration ends when a page has no "next-cursor".
    """

    def __init__(self, response):
        # the page the next call to __next__ will hand out; None once exhausted
        self.response = response

    def __iter__(self):
        return self

    def _get(self, response):
        """Request the page after `response` for a GET request.

        Returns None when `response` is the final page.
        """
        nxt = response.json().get("next-cursor", None)
        # the absence of next-cursor signifies we have reached the final page
        if not nxt:
            return None
        # re-use the original query parameters, replacing any previous cursor
        params = response_to_params(response)
        params["from-cursor"] = nxt
        return requests.request(
            "GET",
            response_to_url(response),
            headers=response.request.headers,
            params=params,
        )

    def _post(self, response):
        """Request the page after `response` for a POST request.

        Returns None when `response` is the final page.
        """
        nxt = response.json().get("next-cursor", None)
        # the absence of next-cursor signifies we have reached the final page
        if not nxt:
            return None
        # re-send the original JSON body with the cursor added
        body = response_to_body(response)
        body["from-cursor"] = nxt
        return requests.request(
            "POST",
            response_to_url(response),
            headers=response.request.headers,
            json=body,
        )

    # Retry failed requests with an exponentially increasing wait time:
    # max_value caps each wait at 10 seconds, max_tries limits the number of
    # attempts to 10 so that a persistent failure cannot retry forever.
    @backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,
        max_value=10,
        max_tries=10,
        giveup=fatal_code,
    )
    def __next__(self):
        """Return the current page and fetch the one after it.

        Raises:
            StopIteration: when the final page has already been returned.
            requests.exceptions.HTTPError: when the current page, or the page
                fetched after it, has an error status that is fatal or still
                failing after the retries.
            ValueError: when the request method is neither GET nor POST.
        """
        response = self.response

        # Check if we have reached the final page. Compare with None
        # explicitly: a requests response is falsy whenever its status code is
        # an error, so `not response` would end the iteration silently instead
        # of reporting the failure.
        if response is None:
            raise StopIteration()

        # Check the latest response was valid
        response.raise_for_status()

        method = response.request.method
        if method == "GET":
            following = self._get(response)
        elif method == "POST":
            following = self._post(response)
        else:
            raise ValueError(f"{method} method not supported")

        # Validate the new page before storing it. If it failed, self.response
        # still holds the last good page, so a retry re-sends the request
        # instead of re-inspecting the same failed response.
        if following is not None:
            following.raise_for_status()
        self.response = following

        return response


class Connection:
    """Minimal client for the API endpoints used in this notebook.

    Exchanges the client credentials for a temporary access token on
    construction and sends that token as a bearer token with every request.

    Args:
        client_id: API client ID.
        client_secret: API client secret.
        url: base URL of the API.
    """

    def __init__(self, client_id, client_secret, url="https://api.signal-ai.com"):
        self._client_id = client_id
        self._client_secret = client_secret
        self._url = url
        self._temp_access_token = self._authenticate()

    def _authenticate(self):
        """Request a new access token using the client credentials.

        Raises:
            requests.exceptions.HTTPError: when the token request is rejected.
        """
        response = requests.post(
            f"{self._url}/auth/token",
            data={
                "grant_type": "client_credentials",
                "client_id": self._client_id,
                "client_secret": self._client_secret,
            },
        )
        # fail here, with the real cause, rather than sending "Bearer None"
        # with every later request
        response.raise_for_status()
        return response.json().get("access_token")

    @staticmethod
    def _error_details(response):
        """Collect the error messages found in the JSON body of a failed response."""
        try:
            payload = response.json()
        except ValueError:
            # the body is not JSON (for example an HTML error page)
            return []
        if not isinstance(payload, dict):
            return []
        details = []
        for key in ("errors", "message"):
            value = payload.get(key)
            if not value:
                continue
            if isinstance(value, (list, tuple)):
                details.extend(str(item) for item in value)
            else:
                # a plain string must be kept whole; iterating over it would
                # produce one character per line
                details.append(str(value))
        return details

    # Retry failed requests with an exponentially increasing wait time:
    # max_value caps each wait at 10 seconds, max_tries limits the number of
    # attempts to 10 so that a persistent failure cannot retry forever.
    @backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,
        max_value=10,
        max_tries=10,
        giveup=fatal_code,
    )
    def _request(self, method, endpoint, params=None, json=None):
        """Send a request to `endpoint` using the temporary access token.

        Args:
            method: HTTP method, e.g. "GET" or "POST".
            endpoint: path below the base URL, e.g. "search".
            params: optional query-string parameters.
            json: optional JSON body (this name shadows the `json` module
                inside this method only).

        Returns:
            The successful response.

        Raises:
            requests.exceptions.HTTPError: for an error status that is fatal
                or still failing after the retries; the message includes any
                error details returned by the API.
        """
        response = requests.request(
            method,
            f"{self._url}/{endpoint}",
            params=params,
            json=json,
            headers={
                "Authorization": f"Bearer {self._temp_access_token}",
                "Content-Type": "application/json",
            },
        )

        # If the request is unauthorised try re-authenticating; the retry
        # triggered by the exception below then uses the new token
        if response.status_code == 401:
            self._temp_access_token = self._authenticate()

        # Check the latest response was valid, if not raise an exception
        # and retry using backoff
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            # attach API error messages to the exception
            raise requests.exceptions.HTTPError(
                "\n".join([str(e)] + self._error_details(e.response)),
                response=e.response,
                request=e.request,
            ) from e
        return response

    def entities(self, params):
        """Find the signal ID for entities using any combination of name and type"""
        response = self._request("GET", "entities", params)
        for page in Paginate(response):
            entities = page.json().get("entities")
            # we need an extra check here because of an API pagination bug
            if not entities:
                break
            for item in entities:
                yield item

    def topics(self, params):
        """Find the signal ID for A.I. trained topics by name"""
        response = self._request("GET", "topics", params)
        for page in Paginate(response):
            # a page without a "topics" key contributes nothing
            for item in page.json().get("topics") or []:
                yield item

    def search(self, params):
        """Search for metadata about individual documents.

        Yields one dict per document with the keys "document" (the document
        metadata) and "stats" (the total reported by the first page).
        """
        response = self._request("POST", "search", json=params)
        total = (response.json().get("stats") or {}).get("total")
        # yield a lazy sequence. This could take a long time
        # and the user might not want all the results
        for page in Paginate(response):
            for item in page.json().get("documents", []):
                yield {
                    "document": item,
                    # include the length of the sequence
                    "stats": {"total": total},
                }

    def metrics(self, params):
        """aggregated metrics which can be sliced and diced along multiple dimensions:
        date, publication source, publication country, topics, entities, sentiment, etc.."""
        return self._request("POST", "metrics", json=params).json().get("aggregations")

    def events(self, params):
        """clusters of similar news coverage"""
        response = self._request("POST", "events", json=params)
        for event in response.json().get("events", []):
            yield event


## Using the Response From the Events API

Turn your attention to the sample response from the events API below, particularly the `story-ids` field.
The payload contains aggregate information about the event such as `story-count`, 
however, there is no data about the individual stories that make up the event except for the `story-ids` field. 

`story-ids` is a list, in order of relevance, in which each `story-id` represents a set of duplicate documents. The list can be sent to the `/search` API to get metadata about the individual documents.

In the next cell we will write a function which takes a list of story IDs and returns the corresponding document metadata from the `/search` API. 

We'll see why this is useful for describing and understanding events in the examples below.


```json
[
    {
        "date": "2023-01-07",
        "source-count": 15,
        "entities": [
            {
                "prominence-score": 1.0,
                "name": "New York City ",
                "id": "a2654baa-86f8-4fe9-beae-913972b39162",
                "type": "location",
                "story-count": 19
            },
            {
                "prominence-score": 1.0,
                "name": "New York State",
                "id": "9998aba9-7ed6-49ff-877c-63a882aa7c35",
                "type": "location",
                "story-count": 19
            },
            {
                "prominence-score": 0.6842105263157895,
                "name": "New York (state)",
                "id": "b9d9d3ff-9e1a-348b-8013-dae5e5db7616",
                "type": "location",
                "story-count": 13
            }
        ],
        "topics": [
            {
                "prominence-score": 1.0,
                "name": "NFTs & Crypto Assets",
                "id": "0f14ce5f-1fe0-4275-8a02-ecfd9a0c4dae",
                "story-count": 19
            },
            {
                "prominence-score": 1.0,
                "name": "Financial Crime",
                "id": "7c6621b7-e556-414b-9b5a-1bf2c7478924",
                "story-count": 19
            },
            {
                "prominence-score": 0.9473684210526315,
                "name": "DeFi(or Decentralized Finance)",
                "id": "9bcccf59-8272-4022-8b55-04ccec36b9b2",
                "story-count": 18
            },
            {
                "prominence-score": 0.8421052631578947,
                "name": "Corporate Crime",
                "id": "d1dbd680-e749-421b-a669-82c8232ff258",
                "story-count": 16
            },
            {
                "prominence-score": 0.7894736842105263,
                "name": "Cryptocurrencies",
                "id": "24ad1eb0-d190-4130-a7e9-4e6a0e3af0d8",
                "story-count": 15
            },
            {
                "prominence-score": 0.631578947368421,
                "name": "Corrupt Fraud",
                "id": "e365d0a6-2ea3-4fdb-a33e-46ffee7390ea",
                "story-count": 12
            },
            {
                "prominence-score": 0.5789473684210527,
                "name": "Social Media",
                "id": "6410cb35-3fb0-49ee-ae27-bded0cc828f5",
                "story-count": 11
            }
        ],
        "story-ids": [
            "456f5b26-5593-482d-9e9f-b0d071b0b415",
            "0a288c11-3279-4129-8e39-399f04880027",
            "a62229f5-446f-44e5-86e0-f47f21fd4dae",
            "de5e9a71-8123-40e0-bceb-204ba7a515d7",
            "5549d1db-cbc5-4ba4-96ea-050bfb0cc4a9",
            "27007d6d-1fce-4de4-8697-100a4605e2dc",
            "f9979f89-7827-4feb-b2ff-ec70e0fe06de",
            "24e677a5-5ec9-42b5-9b39-5a970560904a",
            "6f02551f-fddc-41ff-9d8f-dff0a5b44aed",
            "cbe6fa77-cd6a-4acf-b2a1-d84ddde1f717",
            "420fd46a-3bb5-4923-98ef-45d902343557",
            "dfe22799-81c3-450c-9907-2c61f04b31af",
            "35bd4e17-db04-403d-8941-af3effd95536",
            "bd25755e-73f4-4d3e-856f-32e8d278667c",
            "9fd80495-beca-4134-a80f-4d5beb4e1162",
            "80cbbb20-b28d-4448-be32-cad1b4d6bf40",
            "26e9e86e-ca7c-4c91-85ea-045a3bb39dcc",
            "788c4020-716b-4a52-b8d5-d903f8281fc4",
            "ecb91e9a-18a6-4efb-8333-3dbb818f3edc"
        ],
        "story-count": 19,
        "labels": [
            "york",
            "arrested",
            "michel",
            "fraud",
            "aurélien",
            "nft",
            "frenchman",
            "nfts",
            "new",
            "scam",
            "new york",
            "aurélien michel"
        ]
    }
]
```

In [87]:
def search_by_story_id(story_ids: list, entity_ids: list, topic_ids: list, n=3):
    """
    Return document metadata for the top stories in "story_ids".

    Queries the search endpoint through the notebook-level `signal_api`
    connection and returns one document for each of the first `n` story IDs
    that have results, in the same order as "story_ids".

    Args:
        story_ids: story IDs, most relevant first.
        entity_ids: entity IDs used to narrow the search; an empty list
            disables the entity filter.
        topic_ids: topic IDs used to narrow the search; an empty list
            disables the topic filter.
        n: maximum number of documents to return.

    Returns:
        A list of at most `n` document dicts.
    """
    # record the ordering of the story_ids
    rank = {story_id: rnk for rnk, story_id in enumerate(story_ids)}

    # query for the search endpoint
    params = {
        'where': {
            'story-id': {
                'any': list(story_ids)
            },
        },
        'size': 500
    }

    # because each story ID represents a set of duplicate or near duplicate documents
    # specifying the entities and topics can sometimes give slightly better results.
    # The filters are driven by this function's own arguments, not by notebook globals.
    if entity_ids:
        params['where']['entities'] = {
            'id': {'any': entity_ids},
            'salient-only': True
        }

    if topic_ids:
        params['where']['topics'] = {
            'id': {'any': topic_ids}
        }

    # get all of the results from the search API
    response = (item['document'] for item in signal_api.search(params))

    # put the results in the correct order
    ordered = sorted(response, key=lambda document: rank[document['story-id']])

    # deduplicate the results: keep the first document of each story,
    # for the first n stories only
    grouped = islice(groupby(ordered, lambda x: x['story-id']), n)
    return [next(group) for _, group in grouped]

For example we can use the above function to get the most relevant headline about the sample event above.

In [101]:
# Look up the three most relevant stories of the sample event, narrowed by one
# of its entities and one of its topics.
# NOTE: search_by_story_id uses the `signal_api` connection, so the
# authentication cell must have been run before this one.
documents = search_by_story_id(
    story_ids=[
        "456f5b26-5593-482d-9e9f-b0d071b0b415",
        "0a288c11-3279-4129-8e39-399f04880027",
        "a62229f5-446f-44e5-86e0-f47f21fd4dae",
    ],
    entity_ids=["a2654baa-86f8-4fe9-beae-913972b39162"],
    topic_ids=["0f14ce5f-1fe0-4275-8a02-ecfd9a0c4dae"],
)
# headline of the most relevant story
documents[0]['title']

'French man accused of NFT fraud arrested in New York'

## Examples

### Authenticate with API

Note that this call assumes you have stored your credentials as environment variables. 
You might need to edit this cell accordingly.

In [66]:
# Read the API credentials from the environment; a missing variable raises KeyError
client_id, client_secret = os.environ['SIGNAL_API_ID'], os.environ['SIGNAL_API_SECRET']
# connection object used by every example in this notebook
signal_api = signalAPI.Connection(
    client_id=client_id,
    client_secret=client_secret,
)

### Choose a Date Range

We'll be needing a date range in the following examples. Let's just use the last 30 days.

In [89]:
end_date = pd.Period.now(freq='D')
start_date = str(end_date - 30)
end_date = str(end_date)
start_date, end_date

('2022-12-10', '2023-01-09')

### Example 1 - Search For Events About a Particular Topic
Let's search for events about Cryptocurrencies. We'll search for relevant topics using the topics endpoint.

In [68]:
# seach for topics with a name similar to "Crypto"
topics = {item['name']: item['id'] for item in signal_api.topics({'name': 'Crypto'})}
topics

{'Cryptocurrencies': '24ad1eb0-d190-4130-a7e9-4e6a0e3af0d8',
 'NFTs & Crypto Assets': '0f14ce5f-1fe0-4275-8a02-ecfd9a0c4dae'}

Below is the query we will need to send to the events API

In [94]:
# Query for the events API: events inside the chosen date range that involve
# the topics found above ('all' presumably requires every listed topic to be
# present - confirm against the API documentation)
query = {
    'where': {
        # use the date range computed earlier instead of hard-coded dates,
        # so the notebook keeps covering the last 30 days whenever it is re-run
        'date': {'gte': start_date, 'lte': end_date},
        'topics': {
            'id': {
                'all': list(topics.values())
            }
        }
    }
}

In the next cell we call the events API and then use the `search_by_story_id` function to get document-level metadata.

In [95]:
response = signal_api.events(query)
# remove results that come from a small number of sources:
# keep events whose source-count is at least a fifth of their story-count
response = (event for event in response if event['source-count'] >= (event['story-count'] / 5))
# take the first 10 events
events = list(islice(response, 10))

for event in tqdm(events):
    # for each event - use the search API to get metadata
    # about individual documents such as headlines
    event['documents'] = search_by_story_id(
        event['story-ids'],
        [],  # this example filters by topic only, so there are no entity IDs to pass
        list(topics.values())
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.32it/s]


Display each event along with its top 3 headlines. The headlines make it much easier to digest each event and assess its level of relevance, risk or opportunity.

In [96]:
# One row per event; the headlines come from the documents attached to each
# event in the previous cell
pd.DataFrame(
    [
        {
            'date': evt['date'],
            'story-count': evt['story-count'],
            'source-count': evt['source-count'],
            'entities': [ent['name'] for ent in evt['entities']],
            # just use the first 3 topics
            'topics': [top['name'] for top in evt['topics']][:3],
            'labels': evt['labels'],
            'headlines': [document['title'] for document in evt['documents']],
        }
        for evt in events
    ]
)

Unnamed: 0,date,story-count,source-count,entities,topics,labels,headlines
0,2022-12-27,145,106,"[Metaverse, Fidelity Investments]","[Metaverse, NFTs & Crypto Assets, DeFi(or Decentralized Finance)]","[metaverse, fidelity, trademark, nft, services, applications, trademark applications, services metaverse]","[Fidelity plans NFT marketplace and financial services in the metaverse, Fidelity plans NFT marketplace and financial services in the Metaverse By Cointelegraph, Fidelity plans NFT marketplace and financial services in the Metaverse]"
1,2022-12-29,116,100,[China],"[DeFi(or Decentralized Finance), NFTs & Crypto Assets, Cryptocurrencies]","[nft, china, marketplace, digital, platform, launch, trading, asset, nft marketplace, trading platform, digital asset]","[China to launch state-owned NFT marketplace, China to Launch State-Owned NFT Marketplace, China state-backed NFT trading platform to launch on Jan. 1]"
2,2023-01-02,57,49,[Square Enix],"[DeFi(or Decentralized Finance), blockchain, Technology]","[enix, blockchain, square, matsuda, square enix]","[Square Enix: Publisher 'aggressively' investing in blockchain and NFT plans, Despite gamer hatred: Square Enix wants to continue making blockchain games, Square Enix Believes Blockchain Games are the Future]"
3,2022-12-29,55,51,[China],"[Engineering, blockchain, Digital Futures]","[digital, china, platform, trading, nft, marketplace, asset, launch, trading platform, digital asset, asset trading]","[The trend of NFT in China is about to change, The national team is here! China's digital asset trading platform will be launched soon, China launches first “national digital asset market”]"
4,2022-12-27,43,41,"[HSBC, Metaverse]","[Metaverse, NFTs & Crypto Assets, Market Regulation]","[hsbc, trademark, metaverse, applications, fidelity, trademark applications]","[Banking Giant HSBC Files Trademarks for a Wide Range of Digital Currency and Metaverse Products, Banking Giant HSBC Files Trademarks for a Wide Range of Digital Currency and Metaverse Products, Banking Giant HSBC Files Trademarks for a Wide Range of Digital Currency and Metaverse Products]"
5,2022-12-10,40,30,[Madonna (entertainer)],"[NFTs & Crypto Assets, DeFi(or Decentralized Finance), Corporate Controversy]","[madonna, yuga, bieber, nft, fallon, lawsuit, labs, ape, nfts, jimmy, bored, paltrow, yuga labs]","[Justin Bieber, The Weeknd and Madonna among defendants in Bored Ape Yacht Club lawsuit - Rolling Stone, Jimmy Fallon, Madonna Among Celeb NFT Endorsers Named in Suit Against Yuga Labs, Jimmy Fallon, Madonna Named Among Celeb NFT Endorsers in Suit Against Yuga Labs]"
6,2022-12-10,40,31,[Justin Bieber],"[NFTs & Crypto Assets, Corporate Controversy, DeFi(or Decentralized Finance)]","[yuga, bieber, bored, ape, justin, celebrities, lawsuit, nfts, labs, yacht, bored ape, justin bieber, yuga labs, ape yacht]","[Lawsuit Alleges Yuga Labs Conspired With Celebs Like Justin Bieber to Push Bored Ape NFTs, Justin Bieber, the Weeknd, and Snoop Dogg Among Celebrities Named in Bored Ape Yacht Club Lawsuit, Suit Makes Claims Justin Bieber and Other Celebrities Worked With Yuga Labs to Promote Boring Ape NFTs]"
7,2023-01-08,39,35,"[Ferrari, Formula 1]","[DeFi(or Decentralized Finance), Cryptocurrencies, NFTs & Crypto Assets]","[ferrari, velas, formula, sponsor, crypto, crypto sponsor, sponsor ahead]","[Ferrari cuts ties with crypto sponsor ahead of 2023 Formula One season, Ferrari cuts ties with crypto sponsor ahead of 2023 Formula 1 season, Ferrari cuts ties with crypto sponsor ahead of 2023 Formula One season - Cointelegraph]"
8,2022-12-23,31,30,[OpenSea],"[DeFi(or Decentralized Finance), NFTs & Crypto Assets, Hacking]","[opensea, nft, users, scam, threatens, nfts, gasless, harpie, phishing, auction, private auction, opensea users, auction scam, nft private, scam threatens, threatens opensea]","[New NFT private auction scam threatens OpenSea users, New NFT Private Auction Scam Threatens OpenSea Users, New NFT private auction scam threatens OpenSea users]"
9,2023-01-06,30,29,"[Shopify, Shopify (product), Shopify (corporate), Avalanche]","[NFTs & Crypto Assets, DeFi(or Decentralized Finance), eCommerce]","[shopify, avalanche, nfts, venly, merchants, nft, shopify merchants, avalanche nfts]","[Venly Makes Avalanche NFT Transactions Easier for Shopify Merchants, Avalanche NFT is Now Available to Shopify Merchants Via Venly App, AVAX Price Analysis: What's Next for Price? Avalanche NFTs Can Now Be Sold by Shopify Merchants Through Online Stores]"
