In [165]:
import glob
import os
from datetime import datetime

import numpy as np
import pandas as pd
from fiona import collection
from rtree import index

In [166]:
# read excel with country names and codes
# return list with country codes relevant for UCDP
country_code_pathfile = '/Users/sabine.a.joseph/Desktop/Country_codes_NAMO.xlsx'
country_code_column_name = 'UCDP_country_codes'
country_code_column_name1 = 'Country_3'
sheet = 'Sheet1'

def country_codes_from_excel(country_codes, sheet_num, column_name):
    """Return one column of an Excel sheet as a plain Python list.

    Parameters
    ----------
    country_codes : str or path-like
        Location of the Excel workbook to open.
    sheet_num : str or int
        Sheet to parse; forwarded unchanged to ``ExcelFile.parse``.
    column_name : str
        Label of the column to extract from the parsed sheet.

    Returns
    -------
    list
        The values of ``column_name`` in sheet order.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of the parsed sheet.
    """
    # The context manager closes the workbook as soon as the sheet is parsed;
    # a bare pd.ExcelFile(...) keeps the file handle open until it is
    # garbage-collected.
    with pd.ExcelFile(country_codes) as workbook:
        countries = workbook.parse(sheet_num)
    return list(countries[column_name])  # UCDP uses Gleditsch and Ward country codes

CCS = country_codes_from_excel(country_code_pathfile, sheet, country_code_column_name)
CC3 = country_codes_from_excel(country_code_pathfile, sheet, country_code_column_name1)

In [167]:
UCDP_filename = '/Users/sabine.a.joseph/Downloads/ged50-csv/ged50.csv'
country_code_column_name = 'gwno'

def csv_to_df(filename, country_codes, csv_country_code_column):
    """Load a comma-separated file and keep the rows of the requested countries.

    Parameters
    ----------
    filename : str or file-like
        Source handed to ``pd.read_csv``.
    country_codes : list-like
        Codes to keep.
    csv_country_code_column : str
        Column of the file that is compared against ``country_codes``.

    Returns
    -------
    pandas.DataFrame
        The matching rows; they keep the row labels assigned by ``read_csv``.
    """
    events = pd.read_csv(filename, sep=',')
    is_wanted = events[csv_country_code_column].isin(country_codes)
    return events[is_wanted]

df = csv_to_df(UCDP_filename, CCS, country_code_column_name)
df = df.reset_index()

In [168]:
# get and format gridcell data
df_grid = pd.read_csv('/Users/sabine.a.joseph/Desktop/Gridcells_with_countryinfo.csv', sep = ';')

def correct_coordinate_format(df, colname_list, max_chars=5):
    """Convert text coordinate columns to floats, in place.

    Every value of each listed column is cut to its first ``max_chars``
    characters and the remainder is passed to ``float()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are rewritten. It is modified in place.
    colname_list : list of str
        Columns to convert. Their values must support slicing (e.g. strings).
    max_chars : int, optional
        Number of leading characters kept before conversion (default 5).
        The cut is by character count, so a leading '-' uses up one position.
        NOTE(review): presumably this trims surplus digits from the export;
        confirm that a fixed width is really what is wanted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Raises
    ------
    ValueError
        If a truncated value cannot be parsed as a float.
    """
    for name in colname_list:
        # Iterate over the values themselves rather than looking them up by
        # 0..n-1 labels, so the frame does not need a default integer index.
        df[name] = [float(text[:max_chars]) for text in df[name]]
    return df

df_grid = correct_coordinate_format(df_grid, ['xmin', 'xmax', 'ymin', 'ymax'])
#df_grid.head()

In [169]:
def rtree_index_to_bbox_column(df_lon_col, df_lat_col, grid=None):
    """Find, for every coordinate pair, a grid cell whose bounding box contains it.

    Parameters
    ----------
    df_lon_col, df_lat_col : iterable
        Longitudes and latitudes of the points, paired by position. Each value
        is converted with ``float()``.
    grid : pandas.DataFrame, optional
        Frame with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns, one row
        per cell. Defaults to the notebook-level ``df_grid``.

    Returns
    -------
    list
        One entry per point: the row position (0-based) of the first matching
        cell reported by the index, or ``np.nan`` when no cell matches. The
        entry is the position in ``grid``, not a value of its ``id`` column.
    """
    if grid is None:
        grid = df_grid  # grid frame loaded in an earlier notebook cell

    idx = index.Index()
    # create rtree index, contains all bounding boxes
    # if interleaved is True: xmin, ymin, xmax, ymax
    cell_bounds = zip(grid.xmin, grid.ymin, grid.xmax, grid.ymax)
    for position, bounds in enumerate(cell_bounds):
        idx.insert(position, bounds)

    # retrieve intersection idx for each coordinate pair
    # The number of lookups follows the inputs themselves instead of the
    # length of a global frame, and every point is queried only once.
    bbox_positions = []
    for lon, lat in zip(df_lon_col, df_lat_col):
        lon, lat = float(lon), float(lat)
        # a point is queried as a degenerate box (min == max on both axes)
        hits = list(idx.intersection((lon, lat, lon, lat)))
        bbox_positions.append(hits[0] if hits else np.nan)
    return bbox_positions

df['bbox'] = rtree_index_to_bbox_column(df.longitude, df.latitude)

In [170]:
# save raw df
def df_to_csv(df, path, filename):
    """Write ``df`` as CSV to the file ``filename`` inside directory ``path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save; written with ``DataFrame.to_csv`` defaults.
    path : str or path-like
        Target directory, with or without a trailing separator.
    filename : str
        Name of the CSV file to create in ``path``.
    """
    # os.path.join adds the separator only when `path` lacks one; plain string
    # concatenation would turn '/some/dir' + 'out.csv' into '/some/dirout.csv'.
    df.to_csv(os.path.join(path, filename))

# example input and call
path = '/Users/sabine.a.joseph/Desktop/'
csv_name = 'UCDP_NaMo_subset.csv'
df_to_csv(df, path, csv_name) 

In [171]:
# date column to datetime index
df = df.reset_index(drop=True)
# Cut the last three characters off the text form of every date_start value.
# NOTE(review): this assumes 'YYYY-MM-DD' strings, so that the '-DD' part is
# what gets removed and 'YYYY-MM' remains - confirm against the raw file.
# The conversion is element-wise on the values, so it does not depend on the
# row labels, and no per-row guard is needed.
df['date_start'] = df['date_start'].map(str).str[:-3]

def str_to_datetime(col_name, dateformat, frame=None):
    """Parse every value of a column into a ``datetime`` object.

    Parameters
    ----------
    col_name : str
        Column whose values are parsed.
    dateformat : str
        ``datetime.strptime`` format applied to the text form of each value.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``.

    Returns
    -------
    list of datetime.datetime
        One parsed value per row, in row order.

    Raises
    ------
    ValueError
        If a value does not match ``dateformat``.
    """
    if frame is None:
        frame = df  # event frame built in the earlier notebook cells
    # Walk over the values directly; looking rows up by 0..n-1 labels would
    # require a freshly reset index.
    return [datetime.strptime(str(value), dateformat) for value in frame[col_name]]

df_datestring_column_name = 'date_start'
dateformat = '%Y-%m'
df[df_datestring_column_name] = str_to_datetime(df_datestring_column_name, dateformat)
df.index = df[df_datestring_column_name]

In [175]:
# aggregate per country / bbox and month
# index needs to be datetime
# enter country_col_name as geo-switch: takes country code or bbox
def agg_by_geo_by_month(df, agg_dict, country_col_name):
    """Aggregate ``df`` per index value and per geographic unit.

    Parameters
    ----------
    df : pandas.DataFrame
        Event frame. Rows are grouped by its index (the notebook sets it to
        the month-level ``date_start``) together with ``country_col_name``.
    agg_dict : dict
        Aggregation spec keyed by column name. Values are anything
        ``GroupBy.agg`` accepts per column; a nested ``{label: func}`` dict is
        accepted as well (see the comment in the body).
    country_col_name : str
        Column that identifies the geographic unit (country code or bbox).

    Returns
    -------
    pandas.DataFrame
        One row per (index value, geographic unit). The grouping keys are
        regular columns, followed by the aggregated columns under their
        original column names.
    """
    # Nested {column: {label: func}} specs are rejected by pandas >= 1.0
    # ("nested renamer is not supported"). Their labels never reached the
    # output anyway, because only level 0 of the column labels is kept below,
    # so a nested spec is reduced to the list of its functions.
    spec = {
        column: list(how.values()) if isinstance(how, dict) else how
        for column, how in agg_dict.items()
    }
    # Use the spec that was passed in, not a notebook-level global.
    agg_df = df.groupby([df.index, country_col_name]).agg(spec)
    agg_df = agg_df.reset_index()
    # Collapse (column, function) label pairs back to the plain column name.
    agg_df.columns = agg_df.columns.get_level_values(0)
    return agg_df
    
# every row is one event, so summing this column gives the event count per group
df['count_num_daily_events'] = 1

# create aggregates
# Flat {column: function} mapping. The earlier nested {column: {label: func}}
# form raises "nested renamer is not supported" on pandas >= 1.0, and its
# labels were dropped by agg_by_geo_by_month (which keeps only level 0 of the
# column labels), so the output column names are the same as before.
aggregations = {
    'number_of_sources': 'sum',
    'deaths_b': 'sum',
    'deaths_a': 'sum',
    'deaths_civilians': 'sum',
    'deaths_unknown': 'sum',
    'best_est': 'sum',
    'high_est': 'sum',
    'low_est': 'sum',
    'count_num_daily_events': 'sum',
}

# geo-level aggregation switch: country vs grid
country_code = 'gwno'
bbox = 'bbox'

agg_df = agg_by_geo_by_month(df, aggregations, country_code) # or 'bbox' for grid level aggregation


In [176]:
# save agg df 2x
def df_to_csv(df, path, filename):
    """Write ``df`` as CSV to the file ``filename`` inside directory ``path``.

    This cell re-declares the ``df_to_csv`` helper that an earlier cell
    already defines; the later definition is the one in effect from here on.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save; written with ``DataFrame.to_csv`` defaults.
    path : str or path-like
        Target directory, with or without a trailing separator.
    filename : str
        Name of the CSV file to create in ``path``.
    """
    # os.path.join adds the separator only when `path` lacks one; plain string
    # concatenation would turn '/some/dir' + 'out.csv' into '/some/dirout.csv'.
    df.to_csv(os.path.join(path, filename))

path = '/Users/sabine.a.joseph/Desktop/'

if bbox in agg_df.columns:
    csv_name = 'UCDP_NaMo_agg_subset_BBOX.csv'
else:
    csv_name = 'UCDP_NaMo_agg_subset_COUNTRY.csv'

df_to_csv(agg_df, path, csv_name) 

In [None]:
# filter by type of crisis --- TODO: check with Oliver
# type_of_violence codes: 1 = state-based conflict, 2 = non-state conflict, 3 = one-sided violence
# Keeps codes 1 and 3; rows with code 2 are dropped.
# NOTE(review): this cell comes after the cells that build and save agg_df, so
# when the notebook runs top to bottom the filter does not affect those aggregates.
df = df[df['type_of_violence'].isin([1, 3])]


# df = df[df.date_prec > 1] 

# ? conflict_new_id #as filter? -- existing conflict -- ask Oliver
#df.loc[(df['conflict_new_id'] == 426) & df['type_of_violence'].isin([1, 3]))]
#df.loc[(df['type_of_violence'].isin([426, 234]) & df['type_of_violence'].isin([1, 3]))]

# filter by event_clarity?

### to use:
# date var: 
# combination of: date_start, date_end
# date_prec #duration of event, only chose date_prec = 1 (verify with Oliver?)
# df = df.loc[(df['date_prec'] == 1)]

# -> date_start = date_end - chose as event date
# or always chose date_start as date
# del df.date_end

# number_of_sources #importance - only valid for yr 2013 and 2014
# del df.event_clarity


In [117]:
# get date range for all dates
# The aggregated frame is named agg_df, and agg_by_geo_by_month returns it
# after reset_index(), so 'date_start' is a regular column, not an index level.
all_days = pd.date_range(agg_df['date_start'].min(),
                         agg_df['date_start'].max(), freq='D')


#df_agg.index.get_level_values('date_start')
#df_agg.index.get_level_values('gwno')

#df.reindex(all_days)

# fill with NaNs for missing dates (days) and country codes
# so that we can merge with data from other DBs

#df.loc[all_days]

#repeat for gwno