Adapted from "GDELT Data Wrangle" by James Houghton: https://nbviewer.jupyter.org/github/JamesPHoughton/Published_Blog_Scripts/blob/master/GDELT%20Wrangler%20-%20Clean.ipynb

Additional GDELT resources: 
    
    GDELT library overview: https://colab.research.google.com/drive/1rnKEHKV1StOwGtFPsCctKDPTBB_kHOc_?usp=sharing 
    
    GDELT with big data: https://github.com/linwoodc3/gdeltPyR/wiki/Pulling-Large-GDELT-Data
        

# PART I: Get GDELT DATA FOR GUATEMALA


### Get the GDELT index files

In [1]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# Earliest month (YYYYMM) whose export files are kept from the index.
first_month = '202009'

# Download the GDELT index page that lists the available export files.
# Fail loudly on an HTTP error (otherwise an error page would be parsed into
# an empty link list) and bound the wait so a stalled server cannot hang the cell.
page = requests.get(gdelt_base_url+'index.html', timeout=60)
page.raise_for_status()
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href") # every href found in the page's bulleted lists

# Keep only the links whose name starts with four digits and whose first six
# characters (YYYYMM) are on or after first_month. The result looks like:
#   ['20200617.export.CSV.zip',
#    '20200616.export.CSV.zip',
#    '20200615.export.CSV.zip', ...]
# The comparison is a plain string comparison, which orders YYYYMM prefixes
# chronologically because they are fixed-width digits.
file_list = [x for x in link_list if x[0:4].isdigit() and x[0:6] >= first_month]

In [2]:
# Running totals used by the download step: how many compressed input files
# have been processed and how many filtered output files have been written.
infilecounter = outfilecounter = 0

### Uses GDELT Index file list to download GDELT data for GUATEMALA

In [3]:
import os.path #To help navigate the file directories
import urllib.request #To request from GDELT; the request submodule has to be imported explicitly
import zipfile #To unzip the files we download
import glob #To go through multiple files in a directory
import operator

local_path = './results/' # Downloads and filtered files are written here to keep the project folder clean
tmp_path = local_path+'tmp/' # Scratch folder that holds the extracted CSV while it is being filtered

fips_country_code = 'GT'  # Guatemala country code: only events mentioning it are kept

# Positions of the country-code fields tested for each event line.
# 51 is ActionGeo_CountryCode; 37 and 44 are presumably the Actor1Geo/Actor2Geo
# country codes -- confirm against the GDELT header file.
country_fields = operator.itemgetter(51, 37, 44)

# Make sure both output folders exist before anything is written to them.
os.makedirs(tmp_path, exist_ok=True)

# Adjust the slice to change how many files from the index are processed.
for compressed_file in file_list[:7]: # only list positions 0 to 6
    print(compressed_file)
    archive_path = local_path+compressed_file

    # Download the archive only when it is not already stored locally.
    # urlretrieve raises on failure, so no retry loop is needed here.
    if not os.path.isfile(archive_path):
        print('downloading,')
        urllib.request.urlretrieve(url=gdelt_base_url+compressed_file,
                                   filename=archive_path)

    # Extract the archive into the scratch folder; the context manager closes
    # the zip handle even if extraction fails.
    print('extracting,')
    with zipfile.ZipFile(file=archive_path, mode='r') as z:
        z.extractall(path=tmp_path)

    # Filter every extracted file down to the lines of interest.
    print('parsing,')
    for infile_name in glob.glob(tmp_path+'*'):
        outfile_name = local_path+fips_country_code+'%04i.tsv'%outfilecounter

        # Write with the same encoding used for reading (and used again when
        # the .tsv files are loaded) so every byte round-trips unchanged
        # regardless of the platform's default encoding.
        with open(infile_name, mode='r', encoding="ISO-8859-1") as infile, \
             open(outfile_name, mode='w', encoding="ISO-8859-1") as outfile:
            for line in infile:
                fields = line.split('\t')
                # A truncated line cannot hold the country fields; skip it
                # instead of failing with IndexError.
                if len(fields) > 51 and fips_country_code in country_fields(fields):
                    outfile.write(line)
        outfilecounter +=1

        # delete the temporary extracted file
        os.remove(infile_name)
    infilecounter +=1
    print('done', infilecounter)

20201003.export.CSV.zip
extracting,
parsing,
done 1
20201002.export.CSV.zip
extracting,
parsing,
done 2
20201001.export.CSV.zip
extracting,
parsing,
done 3
20200930.export.CSV.zip
extracting,
parsing,
done 4
20200929.export.CSV.zip
extracting,
parsing,
done 5
20200928.export.CSV.zip
extracting,
parsing,
done 6
20200927.export.CSV.zip
extracting,
parsing,
done 7


# PART II:  PARSE DATA AGAIN

### Read in the data

In [4]:
import pandas as pd

# Get the GDELT field names from a helper file
colnames = pd.read_csv('CSV.header.fieldids.csv')['Field Name']


# Build DataFrames from each of the intermediary files.
# Only the '<country code>NNNN.tsv' files written by the download step are
# matched, so unrelated files sharing the prefix are neither parsed nor deleted;
# sorting makes the concatenation order reproducible.
files = sorted(glob.glob(local_path+fips_country_code+'*.tsv'))
DFlist = []
for active_file in files:
    print(active_file)
    # A file with no matching events is empty, and read_csv raises
    # EmptyDataError on an empty file, so skip it.
    if os.path.getsize(active_file) == 0:
        continue
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID'], encoding='iso-8859-1'))

if not DFlist:
    raise ValueError('No non-empty intermediate files found in '+local_path+
                     ' for country code '+fips_country_code+'; run the download step first.')

# Merge the file-based dataframes and save a pickle
DF = pd.concat(DFlist)
DF.to_pickle(local_path+'backup'+fips_country_code+'.pickle')

# once everything is safely stored away, remove the temporary files
for active_file in files:
    os.remove(active_file)

./results\GT0000.tsv
./results\GT0001.tsv
./results\GT0002.tsv
./results\GT0003.tsv
./results\GT0004.tsv
./results\GT0005.tsv
./results\GT0006.tsv


In [5]:
import pickle

# Reload the combined events from the backup pickle. The path is assembled from
# the same pieces used when the pickle was written, so changing the country
# code or results folder cannot leave this cell reading a stale file.
Guatemala_Data = pd.read_pickle(local_path+'backup'+fips_country_code+'.pickle')

### See top 5 lines of data

In [6]:
# Preview the first five events of the reloaded frame
Guatemala_Data.head(n=5)

Unnamed: 0_level_0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
949990569,20201003,202010,2020,2020.7479,,,,,,,...,GT,1,Guatemala,GT,GT,15.5,-90.25,GT,20201003,http://www.digitaljournal.com/news/world/mexic...
949990977,20201003,202010,2020,2020.7479,BRA,BRAZIL,BRA,,,,...,HO,1,Guatemala,GT,GT,15.5,-90.25,GT,20201003,https://www.voazimbabwe.com/a/donald-trump-age...
949991975,20201003,202010,2020,2020.7479,GTM,GUATEMALAN,GTM,,,,...,,1,Guatemala,GT,GT,15.5,-90.25,GT,20201003,http://www.digitaljournal.com/news/world/mexic...
949992032,20201003,202010,2020,2020.7479,HND,HONDURAN,HND,,,,...,531871,1,Guatemala,GT,GT,15.5,-90.25,GT,20201003,http://www.digitaljournal.com/news/world/mexic...
949992033,20201003,202010,2020,2020.7479,HND,HONDURAN,HND,,,,...,GT,1,Guatemala,GT,GT,15.5,-90.25,GT,20201003,http://www.digitaljournal.com/news/world/mexic...


### Helper function to turn codebooks into lookup tables

In [7]:
def ref_dict(df):
    """Turn a two-column codebook frame into a lookup dictionary.

    Each row contributes one entry: the value in the frame's first column is
    the key and the value in its second column is the mapped value. Columns
    beyond the second are ignored. When a key appears in several rows, the
    last row wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Codebook with at least two columns.

    Returns
    -------
    dict
        Mapping from first-column values to second-column values.
    """
    columns = list(df)
    return {record[columns[0]]: record[columns[1]] for _, record in df.iterrows()}

### Convert each codebook and store in object

In [8]:
# CAMEO event codes carry significant leading zeros ('010' and '10' are
# different codes). The event frame stores every field as a string, so the
# code column of the two event-code-keyed tables must be kept as text;
# otherwise read_csv parses it as integers, collapsing distinct codes onto one
# key and breaking lookups from the event data.

#Read in event codes (code and description both kept as strings)
eventCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.eventcodes.txt", sep='\t', dtype=str))
#Read in Goldstein scale (first column kept as string; the scale value keeps its inferred type)
goldScale = ref_dict(pd.read_csv("./Ref Codes/CAMEO.goldsteinscale.txt", sep='\t', converters={0: str}))
#Read in ethnic groups
ethnicCodes =ref_dict(pd.read_csv("./Ref Codes/CAMEO.ethnic.txt", sep='\t'))
#Read in known groups
knownGroups = ref_dict(pd.read_csv("./Ref Codes/CAMEO.knowngroup.txt", sep='\t'))
#Read in religion
religionCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.religion.txt", sep='\t'))
#Read in type
typeCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.type.txt", sep='\t'))

In [9]:
# Plain Python list of the GDELT field names, so a field name can be looked up by position
cross_ref = colnames.tolist()

In [10]:
# For each codebook name, the positions (in the field-name list) of the
# columns whose values are decoded with that codebook
look_up_code = dict(
    eventCodes=[26, 27, 28],
    goldScale=[30],
    ethnicCodes=[9, 19],
    knownGroups=[8, 18],
    religionCodes=[10, 11, 20, 21],
    typeCodes=[12, 13, 14, 22, 23, 24],
)

In [11]:
'''
Helper function that lets the user regroup the data by the decoded value of a family of code columns

data: Guatemala_Data - pandas dataframe
ref: key from look_up_code - string
codebook: lookup dictionary for that kind of code (built with ref_dict) - dict
'''

import math

def search_dict(data,ref, codebook):
    """Group event rows by the decoded label of one family of code columns.

    The column positions registered under ``ref`` in ``look_up_code`` are
    translated to field names through ``cross_ref``. Every row of ``data`` is
    then inspected once per such column: the cell's code is looked up in
    ``codebook``, upper-cased, and the full row (as a dict) is appended to the
    list stored under that label. A row therefore appears once for every
    inspected column that holds a code.

    Parameters
    ----------
    data : pandas.DataFrame
        Event frame containing the columns selected by ``ref``.
    ref : str
        Key of ``look_up_code`` choosing which columns to inspect.
    codebook : dict
        Maps a code to its description; descriptions must be strings.

    Returns
    -------
    dict
        Upper-cased description -> list of row dicts.

    Raises
    ------
    KeyError
        If ``ref`` is not in ``look_up_code`` or a code is missing from
        ``codebook``.
    """
    target_columns = [cross_ref[position] for position in look_up_code[ref]]

    grouped = {}
    for column in target_columns:
        for _, record in data.iterrows():
            code = record[column]
            # Float cells are skipped; in this string-typed frame they are
            # presumably the NaN placeholders for a missing code.
            if isinstance(code, float):
                continue
            label = codebook[code].upper()
            grouped.setdefault(label, []).append(dict(record))
    return grouped


In [12]:
# Group the events by the ethnic group found in the ethnic-code columns, then list the groups found
res = search_dict(data=Guatemala_Data, ref="ethnicCodes", codebook=ethnicCodes)
res.keys()

dict_keys(['MAASAI', 'PAPEL', 'INDIGENOUS'])

In [13]:
# Verification: print how many event rows were collected under each group label
for group_label, group_rows in res.items():
    print (group_label, ": ", len(group_rows))

MAASAI :  1
PAPEL :  1
INDIGENOUS :  5


In [14]:
# One DataFrame per group, built from that group's list of row dicts
# (same order as the groups appear in res)
list_res = [pd.DataFrame(group_rows) for group_rows in res.values()]

In [15]:
#PIKE display the frames of every group that generated news during the last 7 days
list_res[:]

[    SQLDATE MonthYear  Year FractionDate Actor1Code Actor1Name  \
 0  20201002    202010  2020    2020.7452        mas     MAASAI   
 
    Actor1CountryCode  Actor1KnownGroupCode Actor1EthnicCode  \
 0                NaN                   NaN              mas   
 
    Actor1Religion1Code  ...  Actor2Geo_FeatureID  ActionGeo_Type  \
 0                  NaN  ...                  NaN               4   
 
           ActionGeo_FullName  ActionGeo_CountryCode  ActionGeo_ADM1Code  \
 0  Ceibo, PetÃ©GT, Guatemala                     GT                GT12   
 
    ActionGeo_Lat  ActionGeo_Long  ActionGeo_FeatureID  DATEADDED  \
 0        17.2472        -90.9901             11334374   20201002   
 
                                            SOURCEURL  
 0  https://www.globalcitizen.org/en/content/equat...  
 
 [1 rows x 57 columns],
     SQLDATE MonthYear  Year FractionDate Actor1Code Actor1Name  \
 0  20200930    202009  2020    2020.7397        ppl      PAPEL   
 
    Actor1CountryCode  Act

In [16]:
#PIKE Check how many geo coordinates (one longitude per event) were recorded
# for one ethnic group -- here the third group in list_res (index 2).
length = len(list_res[2]["ActionGeo_Long"])
length

5

In [17]:
#PIKE Prepare the final summary for the reader: for every event reported for
# this country in the last 7 days, its date, the group involved, and the source.

def _column_values(frames, column):
    """Return the values of ``column`` from every frame in ``frames``, in order.

    Frames are visited in list order and rows in frame order. Values are read
    by iterating the column rather than by ``[n]`` label lookups, so the result
    does not depend on the frames having a default 0..n-1 index.
    """
    return [value for frame in frames for value in frame[column]]

date = _column_values(list_res, "SQLDATE")
actor = _column_values(list_res, "Actor1Name")
source = _column_values(list_res, "SOURCEURL")


In [18]:
#PIKE Extract the longitude of each event
# Raw values (strings, since the event frame was read with dtype=str), gathered
# group by group. Iterating each column avoids the positional [n] lookups,
# which only work while every frame keeps its default 0..n-1 index.
long = [value for frame in list_res for value in frame["ActionGeo_Long"]]

# Numeric longitudes used for mapping
longi = [float(value) for value in long]

longi

[-90.9901, -90.25, -90.8061, -90.25, -90.25, -90.25, -90.25]

In [31]:
#PIKE Extract the latitude of each event
# Raw values (strings, since the event frame was read with dtype=str), gathered
# group by group. Iterating each column avoids the positional [n] lookups,
# which only work while every frame keeps its default 0..n-1 index.
lat = [value for frame in list_res for value in frame["ActionGeo_Lat"]]

# Numeric latitudes used for mapping
lati = [float(value) for value in lat]

lati

[17.2472, 15.5, 14.995, 15.5, 15.5, 15.5, 15.5]

In [88]:
### Homework 4: Do some type of analysis with GDELT data. It can be country focused (e.g. Guatemala) or topic focused (e.g. attacks or bilateral agreements)

### You must write in the first cell what you are interested in. The code must work, but the results can be garbage. Update the GDELT parameters to get the information you want, and then include some type of plot; it can be a graph or a map.

### Total Points Possible 19

In [32]:
# PIKE Mapping the events
!pip install pyproj
from bokeh.tile_providers import get_provider, Vendors
from pyproj import Transformer
from bokeh.plotting import figure, output_notebook, show, output_file #builds interactive graphs for python
from bokeh.models import Range1d
import math #this is used in graphic section to use the irrational number pi
output_notebook() #Allows inline plotting for Juptyer notebook
title_provider = get_provider('STAMEN_TERRAIN')



In [33]:
#PIKE Transformer that converts regular latitude/longitude (EPSG:4326) into
# Web Mercator points (EPSG:3857), the projection used by the map axes
transformer = Transformer.from_crs(crs_from='epsg:4326', crs_to='epsg:3857')

In [34]:
#PIKE Guatemala geographic box: two opposite corners given as (latitude, longitude)
pts = [(18, -93), (13, -88)]
# The same two corners converted to mercator coordinates, in the same order
bbox = list(transformer.itransform(pts))

In [35]:
#PIKE Transform the geo coords from the GDELT dataset to mercator for proper mapping
# transform() takes the whole latitude and longitude sequences at once, so one
# call converts every event; point[0] holds the x values and point[1] the y
# values. Calling it once (instead of once per event) also leaves point
# defined when there are no events to plot.
point = transformer.transform(lati,longi)

In [36]:
# Axis limits taken from the two bounding-box corners. Each pair is sorted so
# the range always runs from the smaller to the larger value: the corners are
# listed north-west first, and passing (north, south) as y_range would create
# a reversed (flipped) vertical axis.
x_bounds = tuple(sorted((bbox[0][0], bbox[1][0])))
y_bounds = tuple(sorted((bbox[0][1], bbox[1][1])))

#Plots the bounding box
p = figure(x_range=x_bounds, y_range=y_bounds, x_axis_type="mercator", y_axis_type="mercator", title="POLITICAL EVENTS IN GUATEMALA DURING THE LAST 7 DAYS")
#add the map from the Bokeh map vendor, in this case Stamen_Terrain --- see documentation
p.add_tile(title_provider)
# Places a circle for each converted lat/long event
p.circle(x = point[0], y = point[1], color= "#3288bd",line_color="firebrick", line_width=10)
#shows the plot
show(p)

In [37]:
#PIKE Present the summary of the events that happened in this country in the last 7 days
# One row per event. Naming the columns at construction keeps the frame valid
# (three named, empty columns) even when no events were collected.
event = pd.DataFrame(list(zip(date, actor, source)),
                     columns=["DATE", "IMPACTED GROUP", "SOURCES"])
# Newest first; DATE is a YYYYMMDD string, so text order is chronological order
event = event.sort_values(by=['DATE'], ascending=False)
# Display the table with its caption. The styled view must be the cell's last
# expression to be rendered; event itself stays a plain DataFrame.
event.style.set_caption("POLITICAL EVENTS IN GUATEMALA IN THE LAST 7 DAYS")

Unnamed: 0,DATE,IMPACTED GROUP,SOURCES
0,20201002,MAASAI,https://www.globalcitizen.org/en/content/equat...
4,20201002,MILITARY,https://www.thespec.com/ts/business/2020/10/01...
5,20201001,MILITARY,https://www.stcatharinesstandard.ca/ts/busines...
1,20200930,PAPEL,https://advancedbiofuelsusa.info/tag/renovabio/
6,20200930,GOVERNMENT,https://richmond.com/special-report/coronaviru...
2,20200929,INDIGENOUS,https://www.aljazeera.com/news/2020/04/16/fear...
3,20200928,INDIGENOUS,https://www.ipsnews.net/2020/09/womens-leaders...
