In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load every mentions export for one day and stack them into a single frame
base_path = "./../../data/gdelt/mentions/"
day = "20181105"

# Minute component of each file name; paired with the 24 hours below this
# yields 96 files for the day
mins = ['00', '15', '30', '45']

# File names follow <day><HH><MM>00.json; each file is parsed as a list of records
mention_dataframes = [
    pd.read_json(base_path + day + "{:02d}".format(hour) + minute + '00.json', orient='records')
    for hour in range(24)
    for minute in mins
]

# Stack the per-file frames row-wise and renumber the index from 0
mentions = pd.concat(mention_dataframes, ignore_index=True, axis=0)

In [3]:
# Preview the first rows of the combined frame (head() defaults to 5 rows)
mentions.head()

Unnamed: 0,EventTimeDate,GLOBALEVENTID,MentionIdentifier,MentionSourceName,MentionTimeDate,MentionType
0,20181105000000,800011820,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1
1,20181105000000,800011821,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1
2,20181105000000,800011822,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1
3,20181105000000,800011823,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1
4,20181105000000,800011824,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1


In [4]:
# Per-source tally of non-null MentionTimeDate values, largest first
by_source = (
    mentions.groupby('MentionSourceName')[['MentionTimeDate']]
    .count()
    .rename(columns={'MentionTimeDate': 'MentionsCount'})
    .sort_values('MentionsCount', ascending=False)
)

In [5]:
# Size of the unfiltered data: rows in `mentions`, rows (one per source) in `by_source`
print("Total mentions in original day : ", len(mentions))
print("Total sources in original day : ", len(by_source))

Total mentions in original day :  359058
Total sources in original day :  7367


In [6]:
# Keep only sources whose MentionsCount reaches the threshold, then restrict
# the mentions to rows coming from those sources.
MIN_MENTIONS_PER_SOURCE = 25  # inclusive lower bound on MentionsCount

keptSources = pd.Series(by_source[by_source.MentionsCount >= MIN_MENTIONS_PER_SOURCE].index)
keptMentions = mentions[mentions.MentionSourceName.isin(keptSources)]

In [7]:
# Size of the data that survives the source filter (rows in `keptMentions`,
# entries in `keptSources`) -- labelled as such so it is not mistaken for the
# unfiltered totals printed earlier.
print("Total mentions after source filter : ", keptMentions.shape[0])
print("Total sources after source filter : ", keptSources.size)

Total mentions in original day :  325877
Total sources in original day :  2709


In [8]:
# Peek at the first three rows of the filtered mentions
keptMentions.iloc[:3]

Unnamed: 0,EventTimeDate,GLOBALEVENTID,MentionIdentifier,MentionSourceName,MentionTimeDate,MentionType
0,20181105000000,800011820,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1
1,20181105000000,800011821,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1
2,20181105000000,800011822,https://www.stamfordadvocate.com/news/world/ar...,stamfordadvocate.com,20181105000000,1


In [11]:
# Per-event tally of non-null MentionIdentifier values among the kept
# mentions, ordered from the largest count to the smallest
by_event = (
    keptMentions.groupby('GLOBALEVENTID')[['MentionIdentifier']]
    .count()
    .rename(columns={'MentionIdentifier': 'EventCount'})
    .sort_values('EventCount', ascending=False)
)

In [19]:
# Event IDs whose EventCount is strictly greater than one
multi_mention = by_event['EventCount'] > 1
keptEvents = pd.Series(by_event.loc[multi_mention].index)

In [21]:
# Persist the kept event IDs. header=False is spelled out because the default
# for Series.to_csv differs between pandas releases, and the read-back of this
# file further down passes header=None, i.e. expects no header row.
keptEvents.to_csv("./../../data/gdelt/keptEvents.csv", encoding='utf-8', header=False)

In [22]:
# Persist the kept source names. header=False is spelled out because the
# default for Series.to_csv differs between pandas releases; being explicit
# keeps the file free of a header row whatever version runs this cell.
keptSources.to_csv("./../../data/gdelt/keptSources.csv", encoding='utf-8', header=False)

In [27]:
# Read the saved event IDs back from disk, treating every line as data.
# NOTE(review): only one column name is supplied; if the file also carries the
# Series index as a leading column, pandas presumably uses it as the row
# index -- confirm against the written file.
test = pd.read_csv(
    "./../../data/gdelt/keptEvents.csv",
    names=['ID'],
    header=None,
    encoding='utf-8',
)

In [29]:
# Show the kept event IDs (pandas abbreviates the middle of a long Series)
keptEvents

0        800045540
1        800046143
2        800046045
3        800046140
4        800046049
5        800045545
6        800045371
7        800016943
8        800016942
9        800064086
10       800064085
11       800045408
12       800045370
13       800065238
14       800076747
15       800020569
16       800064038
17       800064039
18       800064042
19       800034033
20       800034109
21       800040140
22       800064046
23       800039543
24       800079398
25       800155492
26       800014574
27       800014571
28       800013639
29       800019320
           ...    
27867    800238932
27868    800012901
27869    800186095
27870    800186151
27871    800169260
27872    800142446
27873    800012986
27874    800169248
27875    800244771
27876    800269667
27877    800018789
27878    800167019
27879    800064121
27880    800019084
27881    800064007
27882    800169247
27883    800184217
27884    800184358
27885    800142445
27886    800169188
27887    800142599
27888    800