In [29]:
import numpy as np
from datetime import datetime
import glob
from rtree import index
import math
import pandas as pd

In [30]:
# GDELT column names
col_names = {
    'eventID_name' : 'GLOBALEVENTID',
    'root_code_name' : 'EventRootCode',
    'quad_class_name': 'QuadClass',
    'geo_country_name' : 'SourceActorFull',
    'geo_region_name' : 'ActionGeo_CountryCode',
    'actor_name' : 'Actor1Type1Code',
    'url_name' : 'SOURCEURL',
    'goldstein_name' : 'GoldsteinScale',
    'date_name' : 'SQLDATE',
    'Actor1Code': 'Actor1Type1Code',
    'Actor2Code': 'Actor1Type2Code',
    'Actor3Code': 'Actor1Type3Code'
}

# to save files
path = '/Users/sabine.a.joseph/Desktop/'

# path to folder containing all raw GDELT data files
pathGDELT = '/Users/sabine.a.joseph/Downloads/GDELT_raw/'
all_files = glob.glob(pathGDELT + '*.csv')

df_datestring_column_name = 'SQLDATE'
dateformat = '%Y%m'

# create aggregates: intially for each individual GDELT df
aggregations = {
    'protest' : {'protest_events': 'sum'},
    'material_conflict' : {'material_conflict': 'sum'},
    'rebellion' : {'rebellion_events': 'sum'},
    'GoldsteinScale' : {
    'gs_median': 'median',
    'gs_min': lambda x: min(x),
    'gs_max': lambda x: max(x)},
    'AvgTone' : {
    'at_median': 'median',
    'at_min': lambda x: min(x),
    'at_max': lambda x: max(x)},
    'count_num_daily_events' : {'count_num_daily_events': 'sum'},
    'NumMentions' : {'NumMentions': 'sum'},
    'NumSources' : {'NumSources': 'sum'},
    'NumArticles' : {'NumArticles': 'sum'}
}

# create aggregates: finally for the combined GDELT df
full_GDELT_aggregations = {
    'protest' : {'protest_events': 'sum'},
    'material_conflict' : {'material_conflict': 'sum'},
    'rebellion' : {'rebellion_events': 'sum'},
    'gs_median' : {'gs_median': 'mean'},
    'gs_min' : {'gs_min': 'mean'},
    'gs_max' : {'gs_max': 'mean'},
    'at_median' : {'at_median': 'mean'},
    'at_min' : {'at_min': 'mean'},
    'at_max' : {'at_max': 'mean'},
    'count_num_daily_events' : {'count_num_daily_events': 'sum'},
    'NumMentions' : {'NumMentions': 'sum'},
    'NumSources' : {'NumSources': 'sum'},
    'NumArticles' : {'NumArticles': 'sum'}
}

# geo-level aggregation switch: country vs grid
country_code = 'ActionGeo_CountryCode'
bbox = 'bbox'


In [31]:
def correct_coordinate_format(df, colname_list):
    for i in range(0, len(colname_list)):
        df[colname_list[i]] = [(float(df[colname_list[i]][j][:5])) for j in range (0, len(df[colname_list[i]]))]
    return df

def rtree_index_to_bbox_column(df_lon_col, df_lat_col):    
    idx = index.Index()
    # create rtree index, contains all bounding boxes
    for i in range(0, len(df_grid.id)):
        # if interleaved is True: xmin, ymin, xmax, ymax
        idx.insert(i, (df_grid.xmin[i], df_grid.ymin[i], df_grid.xmax[i], df_grid.ymax[i]))
    
    # retrieve intersection idx for each coordinate pair
    return [(list(idx.intersection((float(df_lon_col[i]), float(df_lat_col[i]), 
                                    float(df_lon_col[i]), float(df_lat_col[i])))))[0]
            if math.isnan(df_lat_col[i]) is False and (list(idx.intersection((float(df_lon_col[i]), float(df_lat_col[i]), 
                                                                          float(df_lon_col[i]), float(df_lat_col[i])))))
            else np.nan for i in range (0, df.shape[0])]

# url and event ID duplicate removal
# create new columns for protest, material conflict, rebellion, radicalism
# cast Goldstein to float
def EoI_columns(df, col_name_dict):
    # max eventid for each url 
    if col_name_dict['url_name'] and col_name_dict['eventID_name'] is not None: 
        gdelt_max_id = df.groupby(col_name_dict['url_name'])[col_name_dict['eventID_name']].max()
        # keep only max ids to remove duplicates
        df = df[df[col_name_dict['eventID_name']].isin(gdelt_max_id)]
    if col_name_dict['root_code_name'] is not None: 
        df['protest'] = np.where(df[col_name_dict['root_code_name']]==14, 1, 0)
    if col_name_dict['quad_class_name'] is not None:
        df['material_conflict'] = np.where(df[col_name_dict['quad_class_name']]==int(4), 1, 0)   
    if col_name_dict['actor_name'] is not None: 
        df['rebellion'] = np.where(df[col_name_dict['actor_name']].isin(['REB','SEP','INS']), 1, 0)
    if col_name_dict['Actor1Code'] and col_name_dict['Actor2Code'] and col_name_dict['Actor3Code'] is not None: 
        df['radicalism'] = np.where(np.logical_or.reduce((df[col_name_dict['Actor1Code']]=='RAD',
                                                          df[col_name_dict['Actor2Code']]=='RAD',
                                                          df[col_name_dict['Actor3Code']]=='RAD')),1, 0)
    if 'goldstein_name' in col_name_dict:
        df['GoldsteinScale'] = df[col_name_dict['goldstein_name']].apply(lambda x : float(x))
    return df

# save raw df
def df_to_csv(df, path, filename):
    df.to_csv(path + filename)

def str_to_datetime(col_name, dateformat):
    return [datetime.strptime(str(df[col_name][i]), dateformat) for i in range(0, df.shape[0]) if i is not None] 

# aggregate per country / bbox and month
# index needs to be datetime
# enter country_col_name as geo-switch: takes country code or bbox
def agg_by_geo_by_month(df, agg_dict, country_col_name):
    agg_df = df.groupby([df.index, country_col_name]).agg(agg_dict)
    agg_df = agg_df.reset_index()
    agg_df.columns = agg_df.columns.get_level_values(0)
    return agg_df


In [32]:
# get and format gridcell data
df_grid = pd.read_csv('/Users/sabine.a.joseph/Desktop/Gridcells_with_countryinfo.csv', sep = ';')
df_grid = correct_coordinate_format(df_grid, ['xmin', 'xmax', 'ymin', 'ymax'])
    
for file in all_files:
    print file
    df = pd.read_csv(file)
    df['bbox'] = rtree_index_to_bbox_column(df.Actor1Geo_Long, df.Actor1Geo_Lat)
    
    df.QuadClass = [int(df.QuadClass[i]) for i in range(0, df.shape[0])]
    df.EventRootCode = [int(df.EventRootCode[i]) for i in range(0, df.shape[0])]
    df = EoI_columns(df, col_names)
    
    # save raw data enriched with bbox labels
    csv_name = 'GDELT_1Mo' + file[49:70]
    df_to_csv(df, path, csv_name) 

    # date column to datetime index
    df = df.reset_index(drop=True)
    df.SQLDATE = [str(df.SQLDATE[i])[:-2] for i in range (0, len(df.SQLDATE)) if i is not None]

    df[df_datestring_column_name] = str_to_datetime(df_datestring_column_name, dateformat)
    df.index = df[df_datestring_column_name]

    df['count_num_daily_events'] = 1 
    agg_df = agg_by_geo_by_month(df, aggregations, country_code) # or 'bbox' for grid level aggregation

    # rename columns
    agg_df.columns = ['SQLDATE', country_code, 'material_conflict', 'protest', 'gs_median', 'gs_min', 
                      'gs_max', 'NumSources', 'NumMentions', 'at_median', 'at_min', 'at_max', 
                      'count_num_daily_events', 'NumArticles', 'rebellion']

    # save df to csv
    if bbox in agg_df.columns:
        csv_name = 'GDELT_agg/GDELT_1Mo_agg_subset_BBOX' + file[49:70]
    else:
        csv_name = 'GDELT_agg/GDELT_1Mo_agg_subset_COUNTRY' + file[49:70]
    df_to_csv(agg_df, path, csv_name) 

/Users/sabine.a.joseph/Downloads/GDELT_raw/gdelt_20140101_20140131.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

/Users/sabine.a.joseph/Downloads/GDELT_raw/gdelt_20140201_20140228.csv
/Users/sabine.a.joseph/Downloads/GDELT_raw/gdelt_20140301_20140331.csv


In [91]:
# path to folder containing aggregated raw GDELT data files
pathGDELT_agg = '/Users/sabine.a.joseph/Desktop/GDELT_agg/'
all_agg_files = glob.glob(pathGDELT_agg + '*.csv')

def concat_dfs(pathGDELT_agg, all_agg_files):
    for i in range(0, len(all_agg_files)):
        if i == 0: #create initial df on first loop iteration
            df = pd.read_csv(all_agg_files[i])
        else: #concatenate df on each iteration
            df = pd.concat([df, pd.read_csv(all_agg_files[i])]) 

    return df.reset_index(drop = True)

full_GDELT_df = concat_dfs(pathGDELT_agg, all_agg_files)
del full_GDELT_df['Unnamed: 0']

full_agg_GDELT_df = agg_by_geo_by_month(full_GDELT_df, full_GDELT_aggregations, 'ActionGeo_CountryCode')
df_to_csv(full_agg_GDELT_df, path, 'full_GDELT.csv')