In [308]:
import pandas as pd
import numpy as np
from datetime import datetime
import datetime as dt

In [363]:
# global variables: country codes and grid index
# The four code lists are parallel: position k in every list refers to the same
# country, expressed in the coding scheme used by a different data source.
# The string 'NaN' is a placeholder where a scheme has no code (the 'PSE' slot).
CC3 = ['KWT', 'BHR', 'OMN', 'QAT', 'SAU', 'ARE', 'YEM', 'ISR', 'PSE', 'JOR', 'LBN', 'SYR',
       'EGY', 'IRN', 'TUR', 'IRQ'] #Phoenix
CC2 = ['KW', 'BH', 'OM', 'QA', 'SA', 'AE', 'YE', 'IL', 'PS', 'JO', 'LB', 'SY', 'EG', 'IR',
       'TR', 'IQ']
CCS = [690, 692, 698, 694, 670, 696, 680, 666, 'NaN', 663, 660, 652, 651, 630, 640, 645] #UCDP
FIPS = ['KU', 'BA', 'MU', 'QA', 'SA', 'AE', 'YM', 'IS', 'NaN', 'JO', 'LE', 'SY', 'EG', 'IR',
        'TU', 'IZ'] #GDELT
# Grid-cell ids 1..784. Materialised as a list so that it can be tiled with
# `bbox * n` on Python 3 as well (a bare range object does not support `*`).
bbox = list(range(1, 785))

#load all aggregated csvs: UCDP, GDELT, Phoenix
# NOTE(review): absolute, user-specific directory; it must end with a path
# separator because file names are appended by plain string concatenation.
path = '/Users/sabine.a.joseph/Desktop/'
geo_level = 'country' #'bbox'

def csv_to_df(path, filename):
    """Load a comma-separated file into a DataFrame with a fresh 0..n-1 index.

    Parameters
    ----------
    path : str
        Directory prefix; it is joined to ``filename`` by plain string
        concatenation, so it has to end with a path separator.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    source = path + filename
    loaded = pd.read_csv(source, sep=',', low_memory=False)
    return loaded.reset_index(drop=True)

# Load the pre-aggregated event tables for the selected geographic level.
# Compare by value (==): `is` tests object identity and only appears to work
# for strings because of interning.
if geo_level == 'country':
    df_UCDP = csv_to_df(path, 'UCDP_NaMo_agg_subset_COUNTRY.csv')
    df_Phoe = csv_to_df(path, 'Phoenix_NaMo_agg_subset_COUNTRY.csv')
    df_GDELT = csv_to_df(path, 'full_GDELT.csv')
else:
    df_UCDP = csv_to_df(path, 'UCDP_NaMo_agg_subset_BBOX.csv')
    df_Phoe = csv_to_df(path, 'Phoenix_NaMo_agg_subset_BBOX.csv')
    # NOTE(review): no GDELT table is loaded on this branch, so the later
    # statements that use df_GDELT will fail with a NameError at bbox level.
    #df_GDELT = csv_to_df(path, 'full_GDELT_BBOX.csv')

# Drop the unnamed first column of the UCDP and Phoenix files
# (presumably a saved row index -- TODO confirm).
del df_UCDP['Unnamed: 0']
del df_Phoe['Unnamed: 0']

# create empty df
# Settings consumed by create_empty_df below.
date_format = '%Y-%m-%d'            # strftime pattern for the date column
start_date = '1990-01-01 00:00:00'  # first period start passed to pd.date_range
end_date = '2017-08-01 00:00:00'    # last period start passed to pd.date_range
# Column names and the matching code lists; the two lists are parallel.
CC_col_names = ['CC3', 'CCS', 'CC2', 'FIPS']
CC_lists = [CC3, CCS, CC2, FIPS]

# geo-level switch: country vs. grid (bbox)
# NOTE: this rebinds geo_level from the string set earlier ('country') to an
# integer: the number of geographic units per date.
geo_level = len(CCS) #len(bbox)

# date-freq switch: monthly vs yearly
# pandas frequency alias handed to pd.date_range.
date_freq = 'MS' #'YS'

def create_empty_df(date_format, start_date, end_date, CC_col_names, CC_lists, geo_level, date_freq, bbox):
    """Build an empty scaffold with one row per (date, geographic unit).

    Parameters
    ----------
    date_format : str
        ``strftime`` pattern used to render every period start as a string.
    start_date, end_date : str
        Bounds handed to ``pd.date_range``.
    CC_col_names : list of str
        Column names for the country-code lists, parallel to ``CC_lists``.
    CC_lists : list of list
        Country-code lists. The country-level layout is used when every
        list has exactly ``geo_level`` entries.
    geo_level : int
        Number of geographic units (countries or grid cells) per date.
    date_freq : str
        pandas frequency alias for ``pd.date_range`` (e.g. 'MS', 'YS').
    bbox : sequence
        Grid-cell ids, used for the grid layout; needs ``geo_level`` entries.

    Returns
    -------
    pandas.DataFrame
        A 'Date' column of formatted strings (sorted as strings, each value
        repeated ``geo_level`` times) plus either one column per entry of
        ``CC_col_names`` or a single 'bbox' column.
    """
    periods = pd.date_range(start=start_date, end=end_date, freq=date_freq)
    # Format with the date_format argument (the previous code read an
    # undefined global here).
    df = pd.DataFrame({'Date': periods.strftime(date_format)})

    # One copy of the date column per geographic unit, then group equal dates.
    df = pd.concat([df] * geo_level, ignore_index=True)
    df = df.sort_values(by='Date').reset_index(drop=True)

    # Integer division: a list can only be repeated an integral number of times.
    n_periods = len(df.index) // geo_level

    # Country level when the code lists line up with geo_level; compared by
    # value instead of identity against a hard-coded 16.
    country_level = bool(CC_lists) and all(len(codes) == geo_level for codes in CC_lists)

    if country_level:
        for col_name, codes in zip(CC_col_names, CC_lists):
            df[col_name] = list(codes) * n_periods
    else: # bbox level
        # list() so that a range object can be tiled as well.
        df['bbox'] = list(bbox) * n_periods

    return df

# Build the empty date-by-geography scaffold from the settings defined above.
df = create_empty_df(date_format, start_date, end_date, CC_col_names, CC_lists, geo_level, date_freq, bbox)

# Names of the value columns contributed by each source; they are added to the
# combined frame as empty columns before any data is filled in.
cols_to_append_UCDP = [
    'UCDP_count_num_daily_events',
    'UCDP_deaths_civilians',
    'UCDP_number_of_sources',
    'UCDP_deaths_unknown',
    'UCDP_best_est',
    'UCDP_high_est',
    'UCDP_low_est',
    'UCDP_deaths_a',
    'UCDP_deaths_b',
]
cols_to_append_Phoe = [
    'Phoe_material_conflict',
    'Phoe_protest',
    'Phoe_gs_median',
    'Phoe_gs_min',
    'Phoe_gs_max',
    'Phoe_count_num_daily_events',
    'Phoe_rebellion',
]
cols_to_append_GDELT = [
    'GDELT_material_conflict',
    'GDELT_protest',
    'GDELT_gs_median',
    'GDELT_gs_min',
    'GDELT_gs_max',
    'GDELT_NumSources',
    'GDELT_NumMentions',
    'GDELT_at_median',
    'GDELT_at_min',
    'GDELT_at_max',
    'GDELT_count_num_daily_events',
    'GDELT_NumArticles',
    'GDELT_rebellion',
]

# The UCDP list is grown in place (Phoenix first, then GDELT), so from here on
# it holds every column name; cols_to_append is a second name for that list.
cols_to_append_UCDP += cols_to_append_Phoe
cols_to_append_UCDP += cols_to_append_GDELT
cols_to_append = cols_to_append_UCDP

def append_empty_cols_to_df(df, col_names):
    """Add (or reset) the given columns on ``df``, filled entirely with NaN.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    col_names : iterable of str
        Names of the columns to create.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.
    """
    for new_column in col_names:
        df[new_column] = np.nan
    return df
        
# cols_to_append_UCDP was extended in place above, so at this point it also
# contains the Phoenix and GDELT column names.
df = append_empty_cols_to_df(df, cols_to_append_UCDP)
# The scaffold's date column gets the same name as the date column assigned to
# the source tables below.
df.rename(columns={'Date': 'date_start'}, inplace=True)

# rename columns
# NOTE(review): the names are assigned by position, so each list must match the
# column count and order of the corresponding CSV -- confirm against the files.
df_UCDP.columns = ['date_start', 'CCS', 'UCDP_low_est', 'UCDP_count_num_daily_events', 'UCDP_high_est', 
                   'UCDP_deaths_civilians', 'UCDP_number_of_sources', 'UCDP_deaths_unknown', 
                  'UCDP_deaths_a', 'UCDP_deaths_b', 'UCDP_best_est']
df_Phoe.columns = ['date_start', 'CC3', 'Phoe_material_conflict', 'Phoe_protest', 'Phoe_gs_median',
                   'Phoe_gs_min', 'Phoe_gs_max', 'Phoe_count_num_daily_events', 'Phoe_rebellion']
df_GDELT.columns = ['date_start', 'FIPS', 'GDELT_material_conflict', 'GDELT_protest', 'GDELT_gs_median', 
                    'GDELT_gs_min', 'GDELT_gs_max', 'GDELT_NumSources', 'GDELT_NumMentions',
                    'GDELT_at_median', 'GDELT_at_min', 'GDELT_at_max', 'GDELT_count_num_daily_events',
                    'GDELT_NumArticles', 'GDELT_rebellion']

In [365]:
def combine_empty_and_filled_df(df, df_agg, CC_col_names, CC_lists):
    """Copy values from an aggregated source table into the scaffold frame.

    For every row of ``df`` the first row of ``df_agg`` with the same
    ``date_start`` and the same country code is looked up, and all of its
    columns are written into that row of ``df``. Rows without a match are
    left unchanged.

    The code column used for matching is chosen per row, in this order:
    ``CC_col_names[1]`` (compared as an integer), ``CC_col_names[0]``,
    ``CC_col_names[3]`` -- whichever is present in ``df_agg``. The first two
    are not used for rows whose ``CC3`` is 'PSE'; if no key applies, the row
    is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Scaffold frame with a default 0..n-1 index, a ``date_start`` column
        and the code columns. It is modified in place.
    df_agg : pandas.DataFrame
        Aggregated source table with ``date_start`` and one code column.
    CC_col_names : list of str
        Code column names; positions 0, 1 and 3 are used as described above.
    CC_lists : list
        Not used; kept so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.
    """
    column_names = df_agg.columns

    for i in range(df.shape[0]):
        if CC_col_names[1] in column_names and df.CC3[i] != 'PSE': # UCDP
            key_col = CC_col_names[1]
            key_val = int(df.at[i, key_col])
        elif CC_col_names[0] in column_names and df.CC3[i] != 'PSE': # Phoenix
            key_col = CC_col_names[0]
            key_val = df.at[i, key_col]
        elif CC_col_names[3] in column_names: # GDELT
            key_col = CC_col_names[3]
            key_val = df.at[i, key_col]
        else:
            # No usable key for this row. Skip it explicitly so that it can
            # never pick up the match found for the previous row.
            continue

        same_date = df_agg.date_start == str(df.date_start[i])
        matches = df_agg[same_date & (df_agg[key_col] == key_val)]
        if matches.empty:
            continue

        first_match = matches.iloc[0]
        for k in column_names:
            # .at is the scalar setter that replaces the removed set_value.
            df.at[i, k] = first_match[k]
    return df

# Fill the scaffold from each source in turn: UCDP, then Phoenix, then GDELT.
# combine_empty_and_filled_df returns the frame it was given, so combined_df
# and df refer to the same object.
combined_df = combine_empty_and_filled_df(df, df_UCDP, CC_col_names, CC_lists)
combined_df = combine_empty_and_filled_df(combined_df, df_Phoe, CC_col_names, CC_lists)
combined_df = combine_empty_and_filled_df(combined_df, df_GDELT, CC_col_names, CC_lists)


In [366]:
def df_to_csv(df, path, filename):
    """Write ``df`` as CSV to ``path + filename`` using pandas' defaults.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; its index is written as the first column.
    path : str
        Directory prefix, joined to ``filename`` by string concatenation
        (so it has to end with a path separator).
    filename : str
        Name of the output file.
    """
    target = path + filename
    df.to_csv(target)

# Export the combined frame into the same directory the inputs were read from.
csv_name = 'combined_df.csv'
df_to_csv(combined_df, path, csv_name) 