# Description:
In this notebook we explore the Open Distro for Elasticsearch (ODFE) instance where we keep our news documents.

In [1]:
import os
import warnings
from datetime import datetime

import pandas as pd
from bokeh.io import output_notebook
from bokeh.models import HoverTool
from bokeh.plotting import figure, show
from elasticsearch import Elasticsearch

output_notebook()
warnings.filterwarnings('ignore')

In [2]:
def compact_doc(doc):
    """Return a compact copy of a document for display in the notebook.

    The ``embedding`` value is shortened to its first three entries followed
    by a ``"..."`` marker so that printed documents stay readable.

    Args:
        doc: Mapping holding the fields of one document. If it has no
            ``embedding`` entry (or the entry is ``None``) the remaining
            fields are returned unchanged.

    Returns:
        A new dict with the same fields as ``doc`` and a shortened
        ``embedding``. The input ``doc`` is not modified, so the full
        embedding is still available to later cells.
    """
    # Shallow copy: only the 'embedding' entry is replaced, never mutated.
    compact = dict(doc)
    embedding = compact.get('embedding')
    if embedding is not None:
        compact['embedding'] = list(embedding[:3]) + ["..."]
    return compact

In [3]:
# Connecting to Elasticsearch
# Credentials are read from the ODFE_USER / ODFE_PASSWORD environment
# variables so that real secrets do not have to be stored in the notebook.
# The fallbacks are the values that were previously hard-coded here, so the
# cell behaves exactly as before when the variables are not set.
es = Elasticsearch(
    hosts=['odfe-node1', '0.0.0.0'],
    http_auth=(
        os.environ.get('ODFE_USER', 'admin'),
        os.environ.get('ODFE_PASSWORD', 'admin'),
    ),
    scheme="https",
    # NOTE(review): TLS certificate verification is disabled. Presumably the
    # cluster uses a self-signed certificate -- confirm, and enable
    # verification for any non-local deployment.
    verify_certs=False
)

# Exploratory Data Analysis of document collection

## Indices
The document store is composed of indices. These indices in turn hold documents.

In [4]:
# Show every index known to the cluster, along with its aliases
es.indices.get_alias(index="*")

{'security-auditlog-2021.09.05': {'aliases': {}},
 'label': {'aliases': {}},
 '.opendistro_security': {'aliases': {}},
 'document': {'aliases': {}}}

## Documents

How many documents do we have in the document index?

In [5]:
# How many documents does the 'document' index hold?
# Refresh first so that recently indexed documents are included in the count.
es.indices.refresh(index='document')
# Use the count API instead of the cat API: it is the endpoint meant for
# programmatic use and needs no JSON-format parameter or list indexing.
count = es.count(index='document')['count']
print(f"There are {count} documents in the index 'document'")

There are 334925 documents in the index 'document'


## Topic label

Each document has a topic. What is the frequency of each topic?

In [6]:
# Get counts of topic_labels
result = es.search(
    body={
        "size": 0,  # only the aggregation is needed, not the matching hits
        "aggs": {
            "group_by_topic": {
                "terms": {
                    "field": "topic_label",
                    "size": 21  # maximum number of topic buckets to return
                }
            }
        }
    },
    index="document"
)['aggregations']['group_by_topic']['buckets']
# Read each bucket by field name rather than relying on the bucket having
# exactly two values in (key, doc_count) order.
counts = {bucket['key']: bucket['doc_count'] for bucket in result}
counts

{'-1_covid_19_covid 19_coronavirus': 156885,
 '1_new_netflix_series_movie': 21888,
 '0_nfl_football_team_coach': 18280,
 '3_vaccine_covid_covid 19_19 vaccine': 15770,
 '2_trump_president_election_donald trump': 14538,
 '5_mlb_mets_baseball_dodgers': 13645,
 '4_nba_lakers_trade_warriors': 9135,
 '6_stocks_stock_deals_market': 8909,
 '7_galaxy_nasa_samsung_spacex': 7932,
 '11_juventus_champions_champions league_roma': 7478,
 '9_tesla_electric_airpods_car': 7254,
 '8_entire post_view entire_app_johnson': 7177,
 '10_biden_president_joe_joe biden': 7012,
 '12_game_warzone_black ops_destiny': 6054,
 '14_apple_iphone_rtx_pro': 5290,
 '15_covid_covid 19_county_deaths': 5266,
 '13_police_died_shooting_killed': 5089,
 '17_nintendo_ps5_xbox_cyberpunk': 4809,
 '16_google_tiktok_app_chrome': 4623,
 '18_ufc_wwe_wrestlemania_wrestling': 4212,
 '19_bitcoin_cryptocurrency_crypto_coinbase': 3679}

## Get random document
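Getting a pseudo-random document from the collection. The `random_score` function is given a fixed seed, so re-running the cell returns the same document; change the seed to sample a different one.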

In [7]:
# Fetch a single document ranked by random_score (fixed seed) and print it
random_doc_query = {
    "size": 1,
    "query": {
        "function_score": {
            "functions": [
                {"random_score": {"seed": "1477072619038"}}
            ]
        }
    }
}
response = es.search(body=random_doc_query, index="document")
result = response['hits']['hits'][0]
doc_id = result['_id']
print(f"Document ID: {doc_id}", "\n", compact_doc(result['_source']))

Document ID: 90da4ab6-b866-4966-9f14-f7b7fc1298fa 
 {'text': 'COLUMN-Fed focus on jobs implies significant inflation overshoot: Kemp - Reuters#SEPTAG#The U.S. Federal Reserve’s determination to stimulate the economy through low interest rates and bond buying until employment returns to pre-epidemic levels will likely raise commodity prices significantly in the interim.#SEPTAG#(John Kemp is a Reuters market analyst. The views expressed are his own)* Chartbook: tmsnrt.rs/3yoc7fULONDON, May 18 (Reuters) - The U.S. Federal Reserves determination to stimulate the economy t', 'embedding': [-0.1872081756591797, -0.5725518465042114, -0.3096131384372711, '...'], 'source': 'reuters', 'publishedat': '2021-05-18T13:47:00Z', 'url': 'https://www.reuters.com/article/usa-inflation-kemp-idUSL5N2N54DG', 'urltoimage': 'https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?w=800', 'category': 'general', 'umap_embeddings': [-0.5174710750579834, -1.515263557434082], 'topic_number': -1, 'topic_lab

## Published date
Getting the documents whose publication date falls within a given date range

In [8]:
# Count the documents published inside a date window and show one of them
published_window = {"gte": "2020-01-01", "lte": "2021-01-01"}
query = {"query": {"range": {"publishedat": published_window}}}

# The same query body drives both the total count and the example lookup
count = es.count(body=query, index="document")['count']
result = es.search(body=query, index="document")['hits']['hits']

print(f"There are {count} documents in the search result. An example of these documents:\n")
example_hit = result[0]
print(f"Document ID: {example_hit['_id']}", "\n", compact_doc(example_hit['_source']))

There are 114348 documents in the search result. An example of these documents:

Document ID: 00c05f65-aa2f-4cf9-8f49-8102e6f7c1aa 
 {'text': "LSU film room: Inside LSU's defensive third downs that helped stop Arkansas - The Advocate#SEPTAG#FAYETTEVILLE, Ark. — Welcome to Film Room, where we'll break down significant portions from LSU's last football game.#SEPTAG#FAYETTEVILLE, Ark.\xa0\xa0Welcome to Film Room, where we'll break down significant portions from LSU's last football game.(Click to enlarge photos)LSU 27, Arkansas 24How It HappenedThird down stops", 'embedding': [-0.3853817284107208, -0.00019888236420229077, -0.6122472882270813, '...'], 'source': None, 'publishedat': '2020-11-23T01:00:00Z', 'url': 'https://www.theadvocate.com/baton_rouge/sports/lsu/article_cb38a048-2d08-11eb-ab69-d355dfc97db0.html', 'urltoimage': 'https://bloximages.newyork1.vip.townnews.com/theadvocate.com/content/tncms/assets/v3/editorial/1/40/1409b0ce-2d26-11eb-8807-23edf6b8cb23/5fbb07a94cd58.image.jpg?res

In [9]:
# Get most recent documents (sorted by publication date, newest first)
newest_first_query = {
    "size": 10,
    "sort": {"publishedat": "desc"},
    "query": {"match_all": {}},
}
result = es.search(body=newest_first_query, index="document")['hits']['hits']

print("The most recent document is: \n")
latest_hit = result[0]
print(f"Document ID: {latest_hit['_id']}", "\n", compact_doc(latest_hit['_source']))

The most recent document is: 

Document ID: 82e13d05-4205-4461-b0e7-199d397f414a 
 {'text': 'News24.com | No evidence of corruption was provided - IPP office on challenge to R218bn Karpowership deal#SEPTAG#Authorities have said a legal challenge by DNG Energy to the awarding of a power supply contract worth an estimated R218 billion to Turkey\'s Karpowership was "without merit" and "self serving".#SEPTAG#The head of SA\'s Independent Power Producer Procurement (IPP) Programme Office said a rival\'s challenge to the awarding of a R218bn contract to Karpowership was \'without merit\'DNG ha', 'embedding': [-0.11433468014001846, 0.14669691026210785, 0.017174866050481796, '...'], 'source': 'news24', 'publishedat': '2021-06-02T06:09:55Z', 'url': 'https://www.news24.com/fin24/Economy/no-evidence-of-corruption-was-provided-ipp-office-on-challenge-to-r218bn-karpowership-deal-20210602', 'urltoimage': None, 'category': 'general', 'umap_embeddings': [-0.720394492149353, -0.21168778836727142], 'top

In [10]:
# Get oldest documents (sorted by publication date, earliest first)
oldest_first_query = dict(
    size=10,
    sort={"publishedat": "asc"},
    query={"match_all": {}},
)
search_response = es.search(body=oldest_first_query, index="document")
result = search_response['hits']['hits']

print("The oldest document is: \n")
oldest_id, oldest_source = result[0]['_id'], result[0]['_source']
print(f"Document ID: {oldest_id}", "\n", compact_doc(oldest_source))

The oldest document is: 

Document ID: 8c5706b0-15b4-4bad-98f1-2d8d1c146b27 
 {'text': '60 Minutes (Official Site) Watch on CBS - cbs.com#SEPTAG#Offering hard-hitting investigative reports, interviews, feature segments, and profiles of people in the news, the CBS News magazine has been the number-one program a record five times.#SEPTAG#S53 E283/28/2021: What Happened in Wuhan?, Dynamic Robots, Kindred in the BleachersFinding the origin of the coronavirus; Then, robots of the future at Boston Dynamics; And, Dave Kindred goes back ', 'embedding': [-0.22366869449615479, -0.5511829257011414, -0.23397429287433624, '...'], 'source': None, 'publishedat': '2012-07-25T19:09:34Z', 'url': 'https://www.cbs.com/shows/60_minutes/', 'urltoimage': 'https://wwwimage-tve.cbsstatic.com/base/files/seo/cbs-social_19.jpg', 'category': 'technology', 'umap_embeddings': [3.328604221343994, -1.3340216875076294], 'topic_number': -1, 'topic_label': '-1_covid_19_covid 19_coronavirus'}


In [11]:
# Plot number of documents per publishedAt day
result = es.search(
    body={
        "size": 0,  # only the histogram buckets are needed, not the hits
        "aggs": {
            "news_per_day": {
                "date_histogram": {
                    "field": "publishedat",
                    "calendar_interval": "day",
                    # yields 'key_as_string' values that are parsed below
                    "format": "yyyy-MM-dd"
                }
            }
        }
    },
    index="document"
)['aggregations']['news_per_day']['buckets']

# data: one (day, document count) point per histogram bucket
counts = [bucket['doc_count'] for bucket in result]
time = [datetime.strptime(bucket['key_as_string'], '%Y-%m-%d') for bucket in result]

# hover tool: show the day and the document count under the cursor
hover_tool = HoverTool(
    tooltips=[
        ('Day', '@x{%F}'),
        ('Number of documents', '@y'),
    ],
    formatters={'@x': 'datetime'},
    mode='vline'
)

# figure and options
# `height`/`width` replace `plot_height`/`plot_width`, which are deprecated
# in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(
    x_axis_type="datetime",
    title="Number of Documents over time",
    height=350,
    width=800,
    tools=[hover_tool, 'pan', 'box_zoom', 'reset', 'wheel_zoom']
)
p.ygrid.grid_line_alpha = 0.5
p.xaxis.axis_label = 'Time'
p.yaxis.axis_label = 'Number of Documents'

# plot
p.line(time, counts)

show(p)

## Source
Where do news articles come from?

In [13]:
# Count the number of documents per source
result = es.search(
    body={
        "size": 0,  # only the aggregation is needed, not the matching hits
        "aggs": {
            "group_by_source": {
                "terms": {
                    "field": "source",
                    "size": 50  # maximum number of source buckets to return
                }
            }
        }
    },
    index="document"
)['aggregations']['group_by_source']['buckets']
# Read each bucket by field name rather than relying on the bucket having
# exactly two values in (key, doc_count) order.
counts = {bucket['key']: bucket['doc_count'] for bucket in result}
counts

{'business-insider': 23995,
 'bleacher-report': 15635,
 'techradar': 10137,
 'techcrunch': 9523,
 'buzzfeed': 8991,
 'bloomberg': 8860,
 'the-verge': 8244,
 'reuters': 7422,
 'cbs-news': 7094,
 'usa-today': 6999,
 'fox-news': 6994,
 'engadget': 6942,
 'mashable': 6225,
 'independent': 6054,
 'football-italia': 5061,
 'the-irish-times': 4999,
 'the-times-of-india': 4754,
 'the-hindu': 4283,
 'the-wall-street-journal': 4184,
 'polygon': 3895,
 'ign': 3888,
 'cnn': 3406,
 'ars-technica': 3231,
 'the-hill': 3037,
 'fox-sports': 2900,
 'abc-news': 2546,
 'wired': 2349,
 'nbc-news': 2347,
 'associated-press': 2342,
 'the-next-web': 2232,
 'newsweek': 2197,
 'bbc-news': 2134,
 'the-washington-post': 2074,
 'cbc-news': 2020,
 'espn-cric-info': 1960,
 'news24': 1850,
 'abc-news-au': 1709,
 'new-scientist': 1640,
 'entertainment-weekly': 1407,
 'rte': 1047,
 'al-jazeera-english': 1023,
 'politico': 1010,
 'hacker-news': 860,
 'financial-post': 800,
 'vice-news': 735,
 'next-big-future': 592,
 'a