#### Start a session with the Event Registry API

In [1]:
from eventregistry import *

In [2]:
import os

# Read the API key from the environment so the credential is never stored in
# (or committed with) the notebook. Export it before starting Jupyter, e.g.
#   export EVENT_REGISTRY_API_KEY=<your key>
# A missing variable raises KeyError here instead of failing later on a request.
er = EventRegistry(apiKey=os.environ['EVENT_REGISTRY_API_KEY'])

using user provided API key for making requests
Event Registry host: http://eventregistry.org
Text analytics host: http://analytics.eventregistry.org


#### Specify the date range for the queries

In [3]:
import datetime

In [4]:
# Query window: a 30-day span ending on 9 March 2019.
end_date = datetime.date(year=2019, month=3, day=9)
start_date = end_date + datetime.timedelta(days=-30)

#### Specify the information required for each article

In [5]:
# Per-article return flags: basicInfo, title, body, location and categories are
# switched on; every other flag listed here is switched off explicitly.
output = ArticleInfoFlags(
    # NOTE(review): -1 presumably means "do not truncate the body" — confirm in the SDK docs.
    bodyLen=-1,
    # Flags set to True.
    **dict.fromkeys(
        ('basicInfo', 'title', 'body', 'location', 'categories'),
        True),
    # Flags set to False.
    **dict.fromkeys(
        ('url', 'eventUri', 'authors', 'concepts', 'links', 'videos',
         'image', 'sentiment', 'dates', 'extractedDates',
         'duplicateList', 'originalArticle', 'storyUri'),
        False))

#### Specify the queries to run

In [6]:
# Sports query: filters on both the 'sports' concept URI and the 'sports'
# category URI, restricted to English articles in the date window, with the
# duplicate filter set to 'skipDuplicates'.
q_sports = QueryArticlesIter(
    conceptUri=er.getConceptUri('sports'),
    categoryUri=er.getCategoryUri('sports'),
    dateStart=start_date,
    dateEnd=end_date,
    lang='eng',
    isDuplicateFilter='skipDuplicates'
)

In [7]:
# Politics query: filters on both the 'politics' concept URI and the 'politics'
# category URI, restricted to English articles in the date window, with the
# duplicate filter set to 'skipDuplicates'.
q_politics = QueryArticlesIter(
    conceptUri=er.getConceptUri('politics'),
    categoryUri=er.getCategoryUri('politics'),
    dateStart=start_date,
    dateEnd=end_date,
    lang='eng',
    isDuplicateFilter='skipDuplicates'
)

In [8]:
# "Other" query: passes the politics and sports concept/category URIs as
# OR-combined ignore filters, so it is meant to return English, de-duplicated
# articles in the date window that fall outside both topics.
q_other = QueryArticlesIter(
    ignoreConceptUri=QueryItems.OR([
        er.getConceptUri('politics'),
        er.getConceptUri('sports'),
    ]),
    ignoreCategoryUri=QueryItems.OR([
        er.getCategoryUri('politics'),
        er.getCategoryUri('sports'),
    ]),
    dateStart=start_date,
    dateEnd=end_date,
    lang='eng',
    isDuplicateFilter='skipDuplicates'
)

#### Turn the query results into pandas dataframes

In [9]:
import utils

In [10]:
import importlib

In [11]:
# Re-import the local `utils` module so that edits made to utils.py after the
# first import take effect in the running kernel.
importlib.reload(utils)

<module 'utils' from '/Users/qthurier/Desktop/perso/phq_project/notebooks/utils.py'>

In [15]:
# Run the sports query through utils.build_df_from_query, requesting the
# article fields configured in `output`, and time the call.
# NOTE(review): 10000 is presumably the maximum number of articles to fetch
# (cf. the *_10k.csv file names) — confirm in utils.py.
%time sports_article_df = utils.build_df_from_query(q_sports, er, 10000, ReturnInfo(articleInfo=output))

CPU times: user 1.03 s, sys: 105 ms, total: 1.14 s
Wall time: 1min 41s


In [16]:
# Run the politics query through utils.build_df_from_query, requesting the
# article fields configured in `output`, and time the call.
# NOTE(review): 10000 is presumably the maximum number of articles to fetch
# (cf. the *_10k.csv file names) — confirm in utils.py.
%time politics_article_df = utils.build_df_from_query(q_politics, er, 10000, ReturnInfo(articleInfo=output))

CPU times: user 1.25 s, sys: 117 ms, total: 1.36 s
Wall time: 4min 13s


In [12]:
# Run the "other" query through utils.build_df_from_query, requesting the
# article fields configured in `output`, and time the call.
# NOTE(review): 10000 is presumably the maximum number of articles to fetch
# (cf. the *_10k.csv file names) — confirm in utils.py.
%time other_article_df = utils.build_df_from_query(q_other, er, 10000, ReturnInfo(articleInfo=output))

CPU times: user 861 ms, sys: 83.8 ms, total: 945 ms
Wall time: 12min 42s


#### Save dataframes to csv files

In [13]:
# Write each topic's dataframe to its own CSV file, omitting the row index.
for article_df, csv_path in (
    (sports_article_df, '../data/sports_10k.csv'),
    (politics_article_df, '../data/politics_10k.csv'),
    (other_article_df, '../data/other_10k.csv'),
):
    article_df.to_csv(csv_path, index=False)