Adopted from GDELT Data Wrangle by James Houghton https://nbviewer.jupyter.org/github/JamesPHoughton/Published_Blog_Scripts/blob/master/GDELT%20Wrangler%20-%20Clean.ipynb

Additional GDELT resources: 
    
    GDELT library overview: https://colab.research.google.com/drive/1rnKEHKV1StOwGtFPsCctKDPTBB_kHOc_?usp=sharing 
    
    GDELT with big data: https://github.com/linwoodc3/gdeltPyR/wiki/Pulling-Large-GDELT-Data
        

# PART I: Get GDELT DATA FOR NIGER


### Get the GDELT index files

In [1]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# get the list of all the links on the gdelt file page
page = requests.get(gdelt_base_url+'index.html') #Grab GDELT reference list which is by day
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href") #Returns all the possible CSV files of GDELT data as a references list

# separate out those links that begin with four digits 
'''
Will extract just the days resulting in list like: 
['20200617.export.CSV.zip',
 '20200616.export.CSV.zip',
 '20200615.export.CSV.zip',...]
 Until 2015
'''

file_list = [x for x in link_list if str.isdigit(x[0:4])]
file_list

['20201005.export.CSV.zip',
 '20201004.export.CSV.zip',
 '20201003.export.CSV.zip',
 '20201002.export.CSV.zip',
 '20201001.export.CSV.zip',
 '20200930.export.CSV.zip',
 '20200929.export.CSV.zip',
 '20200928.export.CSV.zip',
 '20200927.export.CSV.zip',
 '20200926.export.CSV.zip',
 '20200925.export.CSV.zip',
 '20200924.export.CSV.zip',
 '20200923.export.CSV.zip',
 '20200922.export.CSV.zip',
 '20200921.export.CSV.zip',
 '20200920.export.CSV.zip',
 '20200919.export.CSV.zip',
 '20200918.export.CSV.zip',
 '20200917.export.CSV.zip',
 '20200916.export.CSV.zip',
 '20200915.export.CSV.zip',
 '20200914.export.CSV.zip',
 '20200913.export.CSV.zip',
 '20200912.export.CSV.zip',
 '20200911.export.CSV.zip',
 '20200910.export.CSV.zip',
 '20200909.export.CSV.zip',
 '20200908.export.CSV.zip',
 '20200907.export.CSV.zip',
 '20200906.export.CSV.zip',
 '20200905.export.CSV.zip',
 '20200904.export.CSV.zip',
 '20200903.export.CSV.zip',
 '20200902.export.CSV.zip',
 '20200901.export.CSV.zip',
 '20200831.export.CS

In [2]:
#Counters to help assess how many files are coming and going out
infilecounter = 0
outfilecounter = 0

### Uses GDELT Index file list to download GDELT data for that day for that country

In [4]:
import os.path #To help navigate the file directories
import urllib #To request from GDELT
import zipfile #TO unzip the files we downlaod
import glob #To go through multiple files in a directory
import operator 

local_path = './results/' # Will save to empy results folder to help keep file clean

fips_country_code = 'NG'  ## !!!!! THIS IS THE NIGER COUNTRY CODE GETS ONLY NIGER DATA!!!!

#Adjust list number to get days wanted 
for compressed_file in file_list[:7]: #!!!!!Only getting index 0 to 6!!!!!!
    print(compressed_file,)
    
    # if we dont have the compressed file stored locally, go get it. Keep trying if necessary.
    while not os.path.isfile(local_path+compressed_file): 
        print('downloading,'),
        urllib.request.urlretrieve(url=gdelt_base_url+compressed_file, 
                           filename=local_path+compressed_file)
        
    # extract the contents of the compressed file to a temporary directory    
    print('extracting,'),
    z = zipfile.ZipFile(file=local_path+compressed_file, mode='r')    
    z.extractall(path=local_path+'tmp/')
    
    # parse each of the csv files in the working directory, 
    print('parsing,'),
    for infile_name in glob.glob(local_path+'tmp/*'):
        outfile_name = local_path+fips_country_code+'%04i.tsv'%outfilecounter
        
        # open the infile and outfile
        with open(infile_name, mode='r', encoding="ISO-8859-1") as infile, open(outfile_name, mode='w') as outfile:
            for line in infile:
                # extract lines with our interest country code
                if fips_country_code in operator.itemgetter(51, 37, 44)(line.split('\t')):    
                    outfile.write(line)
            outfilecounter +=1
            
        # delete the temporary file
        os.remove(infile_name)
    infilecounter +=1
    print('done', infilecounter)
    

20201005.export.CSV.zip
extracting,
parsing,
done 8
20201004.export.CSV.zip
extracting,
parsing,
done 9
20201003.export.CSV.zip
extracting,
parsing,
done 10
20201002.export.CSV.zip
extracting,
parsing,
done 11
20201001.export.CSV.zip
extracting,
parsing,
done 12
20200930.export.CSV.zip
extracting,
parsing,
done 13
20200929.export.CSV.zip
extracting,
parsing,
done 14


# PART II:  PARSE DATA AGAIN

### Read in the data

In [5]:
import pandas as pd

# Get the GDELT field names from a helper file
colnames = pd.read_csv('CSV.header.fieldids.csv')['Field Name']


# Build DataFrames from each of the intermediary files
files = glob.glob(local_path+fips_country_code+'*')
DFlist = []
for active_file in files:
    print(active_file)
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID'], encoding='iso-8859-1'))

# Merge the file-based dataframes and save a pickle
DF = pd.concat(DFlist)
DF.to_pickle(local_path+'backup'+fips_country_code+'.pickle')    
    
# once everythin is safely stored away, remove the temporary files
for active_file in files:
    os.remove(active_file)

./results/NG0000.tsv
./results/NG0001.tsv
./results/NG0002.tsv
./results/NG0003.tsv
./results/NG0004.tsv
./results/NG0005.tsv
./results/NG0006.tsv
./results/NG0007.tsv
./results/NG0008.tsv
./results/NG0009.tsv
./results/NG0010.tsv
./results/NG0011.tsv
./results/NG0012.tsv
./results/NG0013.tsv


In [6]:
import pickle

Niger_Data = pd.read_pickle(r"./results/backupNG.pickle")

### See top 5 lines of data

In [7]:
Niger_Data.head()

Unnamed: 0_level_0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
950198764,20201005,202010,2020,2020.7534,EDU,SCHOOL,,,,,...,,1,Niger,NG,NG,16,8,NG,20201005,https://thenationonlineng.net/niger-private-sc...
950199228,20201005,202010,2020,2020.7534,MED,WEBSITE,,,,,...,,1,Niger,NG,NG,16,8,NG,20201005,https://punchng.com/matawalle-has-rescued-11-k...
950199307,20201005,202010,2020,2020.7534,NER,NIGER,NER,,,,...,,1,Niger,NG,NG,16,8,NG,20201005,https://punchng.com/matawalle-has-rescued-11-k...
950199308,20201005,202010,2020,2020.7534,NER,NIGER,NER,,,,...,,1,Niger,NG,NG,16,8,NG,20201005,https://punchng.com/matawalle-has-rescued-11-k...
950217757,20201005,202010,2020,2020.7534,GOV,GOVERNMENT,,,,,...,,1,Niger,NG,NG,16,8,NG,20201005,https://www.vanguardngr.com/2020/10/rememberin...


### Helper Function  to turn codebooks  into look up tables

In [8]:
def ref_dict(df):
    cols = list(df)
    ref_dict = {}
    for row in df.iterrows(): 
        ref_dict[row[1][cols[0]]] = row[1][cols[1]]
    
    return ref_dict

### Convert each codebook and store in object

In [9]:
#Read in event codes
eventCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.eventcodes.txt", sep='\t'))
#Read in Goldsteinscale
goldScale = ref_dict(pd.read_csv("./Ref Codes/CAMEO.goldsteinscale.txt", sep='\t'))
#Read in ethnic groups
ethnicCodes =ref_dict(pd.read_csv("./Ref Codes/CAMEO.ethnic.txt", sep='\t'))
#Read in known Groups
knownGroups = ref_dict(pd.read_csv("./Ref Codes/CAMEO.knowngroup.txt", sep='\t'))
#Read in relgion
religionCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.religion.txt", sep='\t'))
#Read in type
typeCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.type.txt", sep='\t'))

eventCodes

{1: 'MAKE PUBLIC STATEMENT',
 10: 'DEMAND',
 11: 'DISAPPROVE',
 12: 'REJECT',
 13: 'THREATEN',
 14: 'PROTEST',
 15: 'EXHIBIT FORCE POSTURE',
 16: 'REDUCE RELATIONS',
 17: 'COERCE',
 18: 'ASSAULT',
 19: 'FIGHT',
 2: 'APPEAL',
 20: 'USE UNCONVENTIONAL MASS VIOLENCE',
 21: 'Appeal for material cooperation, not specified below',
 211: 'Appeal for economic cooperation',
 212: 'Appeal for military cooperation',
 213: 'Appeal for judicial cooperation',
 214: 'Appeal for intelligence',
 22: 'Appeal for diplomatic cooperation, such as policy support',
 23: 'Appeal for aid, not specified below',
 231: 'Appeal for economic aid',
 232: 'Appeal for military aid',
 233: 'Appeal for humanitarian aid',
 234: 'Appeal for military protection or peacekeeping',
 24: 'Appeal for political reform, not specified below',
 241: 'Appeal for change in leadership',
 242: 'Appeal for policy change',
 243: 'Appeal for rights',
 244: 'Appeal for change in institutions, regime',
 25: 'Appeal to yield',
 251: 'Appeal 

In [10]:
# Turn colnames into list for ref

cross_ref = list(colnames)


In [11]:
# Create look up table to get values instead of numbers

look_up_code = {"eventCodes": [26,27,28], "goldScale":[30], "ethnicCodes":[9,19], "knownGroups":[8,18], 
                "religionCodes":[10,11,20,21], "typeCodes":[12,13,14,22,23,24]}

In [12]:
'''
Helper function to user can reorient data based on interest from codes

data: Niger_Data - pandas dataframe
ref: key value from look_look_code - string
codebook: reference 
'''

import math

def search_dict(data,ref, codebook):
    res = {}
    look_up = look_up_code[ref]
    col_names = []
    for i in look_up: 
        col_names.append(cross_ref[i])
    
    for col in col_names: 
        for row in data.iterrows(): 
            if isinstance(row[1][col],float):
                #print (type(row[1][col]), col)
                pass
            else: 
                #print (col)
                var = codebook[row[1][col]].upper()
                #print (var, row[1][col])
                if var in res.keys(): 
                    #print(row[1][col])
                    res[var].append(dict(row[1]))
                else: 
                    res[var] = [dict(row[1])]
    return res
    


In [13]:
res = search_dict(Niger_Data, "ethnicCodes", ethnicCodes)
res.keys()

dict_keys(['INDIGENOUS', 'IGBO', 'MORO', 'FULA', 'TIBETAN'])

In [14]:
#verfication to ensure code is working properly
for k,v in res.items(): 
    print (k, ": ", len(v))

INDIGENOUS :  2
IGBO :  4
MORO :  2
FULA :  2
TIBETAN :  4


In [15]:
#Put each collection of articles in a Dataframe
list_res = []

for cat in res.values(): 
    #print(cat)
    list_res.append(pd.DataFrame(cat))
    

In [16]:
list_res[3] #access the group you are interested in by changing the variables

Unnamed: 0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,20200930,202009,2020,2020.7397,idg,INDIGENOUS,,,idg,,...,10068118,4,"Fadama, Tahoua, Niger",NG,NG06,14.1673,5.2382,10068118,20200930,https://www.thecable.ng/inside-oyo-community-w...
1,20200930,202009,2020,2020.7397,idg,INDIGENOUS,,,idg,,...,10068118,4,"Fadama, Tahoua, Niger",NG,NG06,14.1673,5.2382,10068118,20200930,https://www.thecable.ng/inside-oyo-community-w...


### Homework 4: Do some type of analysis with GDELT data. It can be country focused (e.g. Guatemala) or topic focused (e.g. attacks or bilateral agreements)

### Must write in the first cell what you are interested in. Code must work but results can be garabage. Update the GDELT parameters to get the information you want and then include some type of plot can be a graph or can be a map.  

### Total Points Possible 19
      

In [None]:
## For this exercise I am attempting to plot religions in Brazil on a map

In [62]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# get the list of all the links on the gdelt file page
page = requests.get(gdelt_base_url+'index.html') #Grab GDELT reference list which is by day
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href") #Returns all the possible CSV files of GDELT data as a references list

# separate out those links that begin with four digits 
'''
Will extract just the days resulting in list like: 
['20200617.export.CSV.zip',
 '20200616.export.CSV.zip',
 '20200615.export.CSV.zip',...]
 Until 2015
'''

file_list = [x for x in link_list if str.isdigit(x[0:4])]
file_list

['20201005.export.CSV.zip',
 '20201004.export.CSV.zip',
 '20201003.export.CSV.zip',
 '20201002.export.CSV.zip',
 '20201001.export.CSV.zip',
 '20200930.export.CSV.zip',
 '20200929.export.CSV.zip',
 '20200928.export.CSV.zip',
 '20200927.export.CSV.zip',
 '20200926.export.CSV.zip',
 '20200925.export.CSV.zip',
 '20200924.export.CSV.zip',
 '20200923.export.CSV.zip',
 '20200922.export.CSV.zip',
 '20200921.export.CSV.zip',
 '20200920.export.CSV.zip',
 '20200919.export.CSV.zip',
 '20200918.export.CSV.zip',
 '20200917.export.CSV.zip',
 '20200916.export.CSV.zip',
 '20200915.export.CSV.zip',
 '20200914.export.CSV.zip',
 '20200913.export.CSV.zip',
 '20200912.export.CSV.zip',
 '20200911.export.CSV.zip',
 '20200910.export.CSV.zip',
 '20200909.export.CSV.zip',
 '20200908.export.CSV.zip',
 '20200907.export.CSV.zip',
 '20200906.export.CSV.zip',
 '20200905.export.CSV.zip',
 '20200904.export.CSV.zip',
 '20200903.export.CSV.zip',
 '20200902.export.CSV.zip',
 '20200901.export.CSV.zip',
 '20200831.export.CS

In [63]:
#Counters to help assess how many files are coming and going out
infilecounter = 0
outfilecounter = 0

In [64]:
import os.path #To help navigate the file directories
import urllib #To request from GDELT
import zipfile #TO unzip the files we downlaod
import glob #To go through multiple files in a directory
import operator 

local_path = './results/' # Will save to empy results folder to help keep file clean

fips_country_code = 'BR'  ## !!!!! PIKE: CHANGED COUNTRY CODE TO BRAZIL !!!!

#Adjust list number to get days wanted 
for compressed_file in file_list[:7]: #!!!!!Only getting index 0 to 6!!!!!!
    print(compressed_file,)
    
    # if we dont have the compressed file stored locally, go get it. Keep trying if necessary.
    while not os.path.isfile(local_path+compressed_file): 
        print('downloading,'),
        urllib.request.urlretrieve(url=gdelt_base_url+compressed_file, 
                           filename=local_path+compressed_file)
        
    # extract the contents of the compressed file to a temporary directory    
    print('extracting,'),
    z = zipfile.ZipFile(file=local_path+compressed_file, mode='r')    
    z.extractall(path=local_path+'tmp/')
    
    # parse each of the csv files in the working directory, 
    print('parsing,'),
    for infile_name in glob.glob(local_path+'tmp/*'):
        outfile_name = local_path+fips_country_code+'%04i.tsv'%outfilecounter
        
        # open the infile and outfile
        with open(infile_name, mode='r', encoding="ISO-8859-1") as infile, open(outfile_name, mode='w') as outfile:
            for line in infile:
                # extract lines with our interest country code
                if fips_country_code in operator.itemgetter(51, 37, 44)(line.split('\t')):    
                    outfile.write(line)
            outfilecounter +=1
            
        # delete the temporary file
        os.remove(infile_name)
    infilecounter +=1
    print('done', infilecounter)

20201005.export.CSV.zip
extracting,
parsing,
done 1
20201004.export.CSV.zip
extracting,
parsing,
done 2
20201003.export.CSV.zip
extracting,
parsing,
done 3
20201002.export.CSV.zip
extracting,
parsing,
done 4
20201001.export.CSV.zip
extracting,
parsing,
done 5
20200930.export.CSV.zip
extracting,
parsing,
done 6
20200929.export.CSV.zip
extracting,
parsing,
done 7


In [68]:
import pandas as pd

# Get the GDELT field names from a helper file
colnames = pd.read_csv('CSV.header.fieldids.csv')['Field Name']


# Build DataFrames from each of the intermediary files
files = glob.glob(local_path+fips_country_code+'*')
DFlist = []
for active_file in files:
    print(active_file)
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID'], encoding='iso-8859-1'))

# Merge the file-based dataframes and save a pickle
DF = pd.concat(DFlist)
DF.to_pickle(local_path+'backup'+fips_country_code+'.pickle')    
    
# once everythin is safely stored away, remove the temporary files
for active_file in files:
    os.remove(active_file)

./results/BR0000.tsv
./results/BR0001.tsv
./results/BR0002.tsv
./results/BR0003.tsv
./results/BR0004.tsv
./results/BR0005.tsv
./results/BR0006.tsv
./results/BR0007.tsv
./results/BR0008.tsv
./results/BR0009.tsv
./results/BR0010.tsv
./results/BR0011.tsv
./results/BR0012.tsv
./results/BR0013.tsv


In [69]:
import pickle

Brazil_Data = pd.read_pickle(r"./results/backupBR.pickle")

In [70]:
Brazil_Data.head()

Unnamed: 0_level_0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
950194476,20201005,202010,2020,2020.7534,,,,,,,...,BR,1,Brazil,BR,BR,-10,-55,BR,20201005,https://www.msn.com/en-us/lifestyle/parenting/...
950194842,20201005,202010,2020,2020.7534,BRA,BRAZIL,BRA,,,,...,,1,Brazil,BR,BR,-10,-55,BR,20201005,https://www.msn.com/en-us/lifestyle/parenting/...
950194843,20201005,202010,2020,2020.7534,BRA,BRAZIL,BRA,,,,...,,1,Brazil,BR,BR,-10,-55,BR,20201005,https://www.msn.com/en-us/lifestyle/parenting/...
950194844,20201005,202010,2020,2020.7534,BRA,BRAZIL,BRA,,,,...,,1,Brazil,BR,BR,-10,-55,BR,20201005,https://www.msn.com/en-us/lifestyle/parenting/...
950194845,20201005,202010,2020,2020.7534,BRA,BRAZIL,BRA,,,,...,BR,1,Brazil,BR,BR,-10,-55,BR,20201005,https://news.yahoo.com/mcdonalds-among-food-fi...


In [71]:
def ref_dict(df):
    cols = list(df)
    ref_dict = {}
    for row in df.iterrows(): 
        ref_dict[row[1][cols[0]]] = row[1][cols[1]]
    
    return ref_dict

In [72]:
#Read in event codes
eventCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.eventcodes.txt", sep='\t'))
#Read in Goldsteinscale
goldScale = ref_dict(pd.read_csv("./Ref Codes/CAMEO.goldsteinscale.txt", sep='\t'))
#Read in ethnic groups
ethnicCodes =ref_dict(pd.read_csv("./Ref Codes/CAMEO.ethnic.txt", sep='\t'))
#Read in known Groups
knownGroups = ref_dict(pd.read_csv("./Ref Codes/CAMEO.knowngroup.txt", sep='\t'))
#Read in relgion
religionCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.religion.txt", sep='\t'))
#Read in type
typeCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.type.txt", sep='\t'))

eventCodes

{1: 'MAKE PUBLIC STATEMENT',
 10: 'DEMAND',
 11: 'DISAPPROVE',
 12: 'REJECT',
 13: 'THREATEN',
 14: 'PROTEST',
 15: 'EXHIBIT FORCE POSTURE',
 16: 'REDUCE RELATIONS',
 17: 'COERCE',
 18: 'ASSAULT',
 19: 'FIGHT',
 2: 'APPEAL',
 20: 'USE UNCONVENTIONAL MASS VIOLENCE',
 21: 'Appeal for material cooperation, not specified below',
 211: 'Appeal for economic cooperation',
 212: 'Appeal for military cooperation',
 213: 'Appeal for judicial cooperation',
 214: 'Appeal for intelligence',
 22: 'Appeal for diplomatic cooperation, such as policy support',
 23: 'Appeal for aid, not specified below',
 231: 'Appeal for economic aid',
 232: 'Appeal for military aid',
 233: 'Appeal for humanitarian aid',
 234: 'Appeal for military protection or peacekeeping',
 24: 'Appeal for political reform, not specified below',
 241: 'Appeal for change in leadership',
 242: 'Appeal for policy change',
 243: 'Appeal for rights',
 244: 'Appeal for change in institutions, regime',
 25: 'Appeal to yield',
 251: 'Appeal 

In [73]:
# Turn colnames into list for ref

cross_ref = list(colnames)

In [74]:

look_up_code = {"eventCodes": [26,27,28], "goldScale":[30], "ethnicCodes":[9,19], "knownGroups":[8,18], 
                "religionCodes":[10,11,20,21], "typeCodes":[12,13,14,22,23,24]}

In [76]:
'''
Helper function to user can reorient data based on interest from codes

data: Brazil_Data - pandas dataframe
ref: key value from look_look_code - string
codebook: reference 
'''

import math

def search_dict(data,ref, codebook):
    res = {}
    look_up = look_up_code[ref]
    col_names = []
    for i in look_up: 
        col_names.append(cross_ref[i])
    
    for col in col_names: 
        for row in data.iterrows(): 
            if isinstance(row[1][col],float):
                #print (type(row[1][col]), col)
                pass
            else: 
                #print (col)
                var = codebook[row[1][col]].upper()
                #print (var, row[1][col])
                if var in res.keys(): 
                    #print(row[1][col])
                    res[var].append(dict(row[1]))
                else: 
                    res[var] = [dict(row[1])]
    return res


In [79]:
res = search_dict(Brazil_Data, "religionCodes", religionCodes)
res.keys()

##PIKE: Changed the dictionary to reflect religionCodes

dict_keys(['CHRISTIANITY', 'HINDUISM', 'MUSLIM', 'JUDAISM', 'BUDDHISM', 'LATTER DAY SAINTS', 'PROTESTANT', 'CATHOLIC'])

In [80]:
#verfication to ensure code is working properly
for k,v in res.items(): 
    print (k, ": ", len(v))

CHRISTIANITY :  84
HINDUISM :  4
MUSLIM :  6
JUDAISM :  8
BUDDHISM :  4
LATTER DAY SAINTS :  12
PROTESTANT :  12
CATHOLIC :  40


In [81]:
#Put each collection of articles in a Dataframe
list_res = []

for cat in res.values(): 
    #print(cat)
    list_res.append(pd.DataFrame(cat))

In [82]:
list_res[1] # PIKE: Listed Hindu Religion

Unnamed: 0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,20201004,202010,2020,2020.7507,HIN,HINDU,,,,HIN,...,BR,1,Brazil,BR,BR,-10,-55,BR,20201004,https://apnews.com/article/virus-outbreak-aust...
1,20201003,202010,2020,2020.7479,HIN,HINDU,,,,HIN,...,BR,1,Brazil,BR,BR,-10,-55,BR,20201003,https://www.washingtontimes.com/news/2020/oct/...
2,20201004,202010,2020,2020.7507,HIN,HINDU,,,,HIN,...,BR,1,Brazil,BR,BR,-10,-55,BR,20201004,https://apnews.com/article/virus-outbreak-aust...
3,20201003,202010,2020,2020.7479,HIN,HINDU,,,,HIN,...,BR,1,Brazil,BR,BR,-10,-55,BR,20201003,https://www.washingtontimes.com/news/2020/oct/...


In [83]:
list_res[2] #PIKE: Listed Muslim Religion

Unnamed: 0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,20201004,202010,2020,2020.7507,MOS,MUSLIM,,,,MOS,...,BR,1,Brazil,BR,BR,-10.0,-55.0,BR,20201004,https://www.mmamania.com/2020/10/4/21501205/uf...
1,20200930,202009,2020,2020.7397,MOS,ISLAM,,,,MOS,...,,4,"Benfica, Amazonas, Brazil",BR,BR04,-6.765,-70.9364,172350,20200930,https://www.teamtalk.com/news/transfer-gossip-...
2,20201004,202010,2020,2020.7507,MOS,MUSLIM,,,,MOS,...,BR,1,Brazil,BR,BR,-10.0,-55.0,BR,20201004,https://www.mmamania.com/2020/10/4/21501205/uf...
3,20200930,202009,2020,2020.7397,MOS,ISLAM,,,,MOS,...,,4,"Benfica, Amazonas, Brazil",BR,BR04,-6.765,-70.9364,172350,20200930,https://www.teamtalk.com/news/transfer-gossip-...
4,20201004,202010,2020,2020.7507,ARE,UNITED ARAB EMIRATES,ARE,,,,...,BR,1,Brazil,BR,BR,-10.0,-55.0,BR,20201004,https://www.mmamania.com/2020/10/4/21501205/uf...
5,20201004,202010,2020,2020.7507,ARE,UNITED ARAB EMIRATES,ARE,,,,...,BR,1,Brazil,BR,BR,-10.0,-55.0,BR,20201004,https://www.mmamania.com/2020/10/4/21501205/uf...


In [84]:
list_res[3] #PIKE: Listed Jewish Religion

Unnamed: 0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,20201002,202010,2020,2020.7452,JEW,JEWISH,,,,JEW,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...
1,20201002,202010,2020,2020.7452,JEW,JEWISH,,,,JEW,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...
2,20201002,202010,2020,2020.7452,JEW,JEWISH,,,,JEW,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...
3,20201002,202010,2020,2020.7452,JEW,JEWISH,,,,JEW,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...
4,20201002,202010,2020,2020.7452,JEW,JEWISH,,,,JEW,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...
5,20201002,202010,2020,2020.7452,JEW,JEWISH,,,,JEW,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...
6,20201002,202010,2020,2020.7452,CVL,COMMUNITY,,,,,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...
7,20201002,202010,2020,2020.7452,CVL,COMMUNITY,,,,,...,-667833,4,"Bahia, Bahia, Brazil",BR,BR05,-12.9833,-38.5167,-667833,20201002,https://www.heraldtribune.com/story/lifestyle/...


In [85]:
list_res[4] #PIKE: Listed Buddhist Religion

Unnamed: 0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,20200930,202009,2020,2020.7397,BUDMAH223,SOTO,,,,BUD,...,,1,Brazil,BR,BR,-10,-55,BR,20200930,https://hyperallergic.com/590543/clemente-soto...
1,20200930,202009,2020,2020.7397,BUDMAH223,SOTO,,,,BUD,...,BR,1,Brazil,BR,BR,-10,-55,BR,20200930,https://hyperallergic.com/590543/clemente-soto...
2,20200930,202009,2020,2020.7397,BUDMAH223,SOTO,,,,BUD,...,,1,Brazil,BR,BR,-10,-55,BR,20200930,https://hyperallergic.com/590543/clemente-soto...
3,20200930,202009,2020,2020.7397,BUDMAH223,SOTO,,,,BUD,...,BR,1,Brazil,BR,BR,-10,-55,BR,20200930,https://hyperallergic.com/590543/clemente-soto...


In [86]:
import pandas as pd #pandas places ones data in tables format
from bokeh.plotting import figure, output_notebook, show #builds interactive graphs for python
from bokeh.models import Range1d
import math #this is used in graphic section to use the irrational number pi
output_notebook() #Allows inline plotting for Juptyer notebook

In [87]:
from bokeh.tile_providers import get_provider, Vendors
from pyproj import Transformer
tile_provider = get_provider('STAMEN_TERRAIN')

In [88]:
transformer = Transformer.from_crs('epsg:4326','epsg:3857')

In [93]:
country = fips_country_code = 'NG'  

In [95]:
print(flips_country_code = country[["latitude", 'longitude', 'gname']])

#see the data this time first 7 rows
country_map.head(7)

#PIKE: USING GOOGLE TO HELP FIGURE OUT HOW TO DEFINE COUNTRY / MAPS 

TypeError: string indices must be integers

In [96]:
map_dict = {} # empty dictionary to track group attacks by lat long
nan_count = {} # some data doesn't have a lat/long so we need to know what we are losing


# Iterate through tables and associate group with lat/long
for idx, row in brazil_map.iterrows():    
    if row['gname'] in map_dict.keys(): 
        if math.isnan(row["ActiongGeo_Lat"]):
            #This counts no data
            if row['gname'] in nan_count.keys(): 
                nan_count[row['gname']] += 1 
            else: 
                nan_count[row['gname']] = 1
        else: 
            #This has to convert the lat/long to a mercator projection 
            point = transformer.transform(row["ActionGeo_Lat"],row["ActionGeo_Long"])
            map_dict[row['gname']].append([point[0],point[1]]) 
    #BOTH the if an else statement do the same thing but since it is a dictionary one needs to add the group name first
    else:  
        if math.isnan(row["ActionGeo_Lat"]):
          nan_count[row['gname']] = 1
        else: 
            point = transformer.transform(row["ActionGeo_Lat"],row["ActionGeo_Long"])
            map_dict[row['gname']] =[[point[0],point[1]]]
        
#This tells how many attacks we are losing
nan_count    

# PIKE: Running into errors. Attempting to plot the major religions on a map

NameError: name 'brazil_map' is not defined