In [311]:
import urllib
import lxml.html
import pandas as pd
import os
import zipfile
import numpy as np
from datetime import datetime
import glob
from rtree import index
import math

In [312]:
# read excel with country names and codes
# return list with country codes relevant for UCDP
country_code_pathfile = '/Users/sabine.a.joseph/Desktop/Country_codes_NAMO.xlsx'
country_code_column_name = 'Country_3'
sheet = 'Sheet1'

def country_codes_from_excel(country_codes, sheet_num, column_name):
    countries = pd.ExcelFile(country_codes)
    countries = countries.parse(sheet_num)
    return list(countries[column_name]) # UCDP uses Gleditsch and Ward country codes

CC3 = country_codes_from_excel(country_code_pathfile, sheet, country_code_column_name)

In [313]:
url = 'http://phoenixdata.org/data'

def connect_to_url_get_links(url):
    connection = urllib.urlopen(url)
    dom =  lxml.html.fromstring(connection.read())

    links = []
    for link in dom.xpath('//a/@href'): # select the url in href for all a tags(links)
        links.append(link) #all download links in list
    del links[0:4] 
    return links

links = connect_to_url_get_links(url)

In [314]:
# downloads all files in dir #lengthy!!!
DOWNLOADS_DIR = '/Users/sabine.a.joseph/Documents/sabine.a.joseph/Documents/Phoenix_event_data'

def download_and_unzip_files(down_dir, links):
    # For every line in the file
    for url in links:
        # Split on the rightmost / and take everything on the right side of that
        name = url.rsplit('/', 1)[-1]

        # Combine the name and the downloads directory to get the local filename
        filename = os.path.join(down_dir, name)

        # Download the file if it does not exist
        if not os.path.isfile(filename):
            urllib.urlretrieve(url, filename)

    # unzip all files
    filenames = []
    for filename in os.listdir(down_dir):
        if filename.endswith(".txt.zip"): 
            filenames.append(filename)
            with zipfile.ZipFile(down_dir + '/' + filename) as zip_ref:
                zip_ref.extractall(down_dir)
    return filenames
                
filenames = download_and_unzip_files(DOWNLOADS_DIR, links)

In [315]:
# load all data to df, for relevant country #lengthy!!!
col_names = ('EventID', 'Date', 'Year', 'Month', 'Day', 'SourceActorFull', 'SourceActorEntity', 
             'SourceActorRole', 'SourceActorAttribute', 'TargetActorFull', 'TargetActorEntity', 
             'TargetActorRole', 'TargetActorAttribute', 'EventCode', 'EventRootCode', 
             'PentaClass', 'GoldsteinScore', 'Issues', 'Lat', 'Lon', 'LocationName', 
             'StateName', 'CountryCode', 'SentenceID', 'URLs', 'NewsSources')
country_code_filter_col_name = 'SourceActorFull'

def data_to_df(down_dir, col_names, country_codes, filter_col):
    for i in range(0, len(filenames)):
        if i == 0: #create initial df on first loop iteration
            df = pd.read_table(down_dir + '/' + filenames[i][:-4], delim_whitespace=False, 
                               names=col_names)
        else: #concatenate df on each iteration
            df = pd.concat([df, pd.read_table(down_dir + '/' + filenames[i][:-4], delim_whitespace=False, 
                               names = col_names)]) 
            df = df[df[filter_col].isin(country_codes)] 

    df = df[df[filter_col].isin(country_codes)]  
    df = df.reset_index(drop = True)
    return df

df = data_to_df(DOWNLOADS_DIR, col_names, CC3, country_code_filter_col_name)


In [316]:
# get and format gridcell data
df_grid = pd.read_csv('/Users/sabine.a.joseph/Desktop/Gridcells_with_countryinfo.csv', sep = ';')

def correct_coordinate_format(df, colname_list):
    for i in range(0, len(colname_list)):
        df[colname_list[i]] = [(float(df[colname_list[i]][j][:5])) for j in range (0, len(df[colname_list[i]]))]
    return df

df_grid = correct_coordinate_format(df_grid, ['xmin', 'xmax', 'ymin', 'ymax'])

In [317]:
def rtree_index_to_bbox_column(df_lon_col, df_lat_col):    
    idx = index.Index()
    # create rtree index, contains all bounding boxes
    for i in range(0, len(df_grid.id)):
        # if interleaved is True: xmin, ymin, xmax, ymax
        idx.insert(i, (df_grid.xmin[i], df_grid.ymin[i], df_grid.xmax[i], df_grid.ymax[i]))
    
    # retrieve intersection idx for each coordinate pair
    return [(list(idx.intersection((float(df_lon_col[i]), float(df_lat_col[i]), 
                                    float(df_lon_col[i]), float(df_lat_col[i])))))[0]
            if math.isnan(df_lat_col[i]) is False and (list(idx.intersection((float(df_lon_col[i]), float(df_lat_col[i]), 
                                                                          float(df_lon_col[i]), float(df_lat_col[i])))))
            else np.nan for i in range (0, df.shape[0])]

df['bbox'] = rtree_index_to_bbox_column(df.Lon, df.Lat)

In [323]:
# url and event ID duplicate removal
# create new columns for protest, material conflict, rebellion, radicalism
# cast Goldstein to float
def EoI_columns(df, col_name_dict):
    # max eventid for each url 
    if col_name_dict['url_name'] and col_name_dict['eventID_name'] is not None: 
        gdelt_max_id = df.groupby(col_name_dict['url_name'])[col_name_dict['eventID_name']].max()
        # keep only max ids to remove duplicates
        df = df[df[col_name_dict['eventID_name']].isin(gdelt_max_id)]
        #df = df.reset_index()
    if col_name_dict['root_code_name'] is not None: 
        df['protest'] = np.where(df[col_name_dict['root_code_name']]=='14', 1, 0)
    if col_name_dict['quad_class_name'] is not None:
        df['material_conflict'] = np.where(df[col_name_dict['quad_class_name']]==4, 1, 0)
    if col_name_dict['actor_name'] is not None: 
        df['rebellion'] = np.where(df[col_name_dict['actor_name']].isin(['REB','SEP','INS']), 1, 0)
    if col_name_dict['Actor1Code'] and col_name_dict['Actor2Code'] and col_name_dict['Actor3Code'] is not None: 
        df['radicalism'] = np.where(np.logical_or.reduce((df['Actor1Code']=='RAD',
                                                          df['Actor2Code']=='RAD',
                                                          df['Actor3Code']=='RAD')),1, 0)
    if 'goldstein_name' in col_name_dict:
        df['GoldsteinScale'] = df[col_name_dict['goldstein_name']].apply(lambda x : float(x))
    return df

# Phoenix column names
col_names = {
    'eventID_name' : 'EventID',
    'root_code_name' : 'EventRootCode',
    'quad_class_name': 'PentaClass',
    'geo_country_name' : 'SourceActorFull',
    'geo_region_name' : 'region',
    'actor_name' : 'TargetActorRole',
    'url_name' : 'URLs',
    'goldstein_name' : 'GoldsteinScore',
    'date_name' : 'Date',
    'Actor1Code': None,
    'Actor2Code': None,
    'Actor3Code': None
}

df = EoI_columns(df, col_names)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/i

In [324]:
# save raw df
def df_to_csv(df, path, filename):
    df.to_csv(path + filename)

# example input and call
path = '/Users/sabine.a.joseph/Desktop/'
csv_name = 'Phoenix_NaMo_subset.csv'
df_to_csv(df, path, csv_name) 

In [327]:
df = pd.read_csv(path + csv_name, low_memory=False)

In [328]:
# date column to datetime index
df = df.reset_index(drop=True)
df.Date = [str(df.Date[i])[:-2] for i in range (0, len(df.Date)) if i is not None]

def str_to_datetime(col_name, dateformat):
    return [datetime.strptime(str(df[col_name][i]), dateformat) for i in range(0, df.shape[0]) if i is not None]

df_datestring_column_name = 'Date'
dateformat = '%Y%m'
df[df_datestring_column_name] = str_to_datetime(df_datestring_column_name, dateformat)
df.index = df[df_datestring_column_name]

In [333]:
# aggregate per country / bbox and month
# index needs to be datetime
# enter country_col_name as geo-switch: takes country code or bbox
def agg_by_geo_by_month(df, agg_dict, country_col_name):
    agg_df = df.groupby([df.index, country_col_name]).agg(agg_dict)
    agg_df = agg_df.reset_index()
    agg_df.columns = agg_df.columns.get_level_values(0)
    return agg_df
    
df['count_num_daily_events'] = 1 

# create aggregates
aggregations = {
    'protest' : {'protest_events': 'sum'},
    'material_conflict' : {'material_conflict_events': 'sum'},
    'rebellion' : {'rebellion_events': 'sum'},
    'GoldsteinScale' : {
    'gs_median': 'median',
    'gs_min': lambda x: min(x),
    'gs_max': lambda x: max(x)},
    'count_num_daily_events' : {'count_num_daily_events': 'sum'}
}

agg_df = agg_by_geo_by_month(df, aggregations, 'SourceActorFull') # or 'bbox' for grid level aggregation

# geo-level aggregation switch: country vs grid
country_code = 'SourceActorFull'
bbox = 'bbox'

agg_df = agg_by_geo_by_month(df, aggregations, country_code) # or 'bbox' for grid level aggregation
# rename columns
agg_df.columns = ['Date', country_code, 'material_conflict', 'protest', 'gs_median', 'gs_min', 
                  'gs_max', 'count_num_daily_events', 'rebellion']


In [334]:
# save agg df 2x
def df_to_csv(df, path, filename):
    df.to_csv(path + filename)

path = '/Users/sabine.a.joseph/Desktop/'

if bbox in agg_df.columns:
    csv_name = 'Phoenix_NaMo_agg_subset_BBOX.csv'
else:
    csv_name = 'Phoenix_NaMo_agg_subset_COUNTRY.csv'

df_to_csv(agg_df, path, csv_name) 