In [1]:
import urllib
import lxml.html
import pandas as pd
import os
import zipfile
import numpy as np
from datetime import datetime

In [3]:
# Read the Excel workbook of country names and codes and build the list of
# country codes used later to filter the event data.
# NOTE(review): the original note said these codes are "relevant for UCDP";
# the data downloaded in this notebook comes from phoenixdata.org - confirm
# which coding scheme the column actually holds.
# NOTE(review): absolute, user-specific path - must be edited to run elsewhere.
country_code_pathfile = '/Users/sabinejoseph/Downloads/kfe-sabinejo-patch-1/Input/Country_codes_NAMO.xlsx'
country_code_column_name = 'Country_3'  # column of the sheet that holds the codes
sheet = 'Sheet1'  # worksheet to parse

def country_codes_from_excel(country_codes, sheet_num, column_name):
    """Return one column of an Excel worksheet as a Python list.

    Parameters
    ----------
    country_codes : path (or file-like object) of the Excel workbook.
    sheet_num : sheet name or index, passed to ``ExcelFile.parse``.
    column_name : label of the column to return.

    Returns
    -------
    list
        The values of ``column_name`` in sheet order.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of the parsed sheet.
    """
    # The context manager closes the workbook handle even if parsing fails;
    # the previous version left it open.
    with pd.ExcelFile(country_codes) as workbook:
        table = workbook.parse(sheet_num)
    # NOTE(review): original comment - "UCDP uses Gleditsch and Ward country
    # codes"; confirm that this is the scheme stored in the column.
    return list(table[column_name])

# Country codes used further down to filter the event rows (passed to data_to_df).
CC3 = country_codes_from_excel(country_code_pathfile, sheet, country_code_column_name)

In [4]:
# Page whose <a href> targets are scraped below to obtain the data-file links.
url = 'http://phoenixdata.org/data'

def connect_to_url_get_links(url):
    """Fetch ``url`` and return the ``href`` targets of its ``<a>`` tags.

    The first four hrefs found on the page are discarded.

    Parameters
    ----------
    url : address of the HTML page to scrape.

    Returns
    -------
    list
        The remaining href values, in document order.
    """
    # Imported locally so the function works on both Python 2 and Python 3:
    # ``urlopen`` lives in ``urllib`` on 2 and in ``urllib.request`` on 3.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() releases the connection even if reading or parsing raises;
    # the previous version never closed it.
    with closing(urlopen(url)) as connection:
        dom = lxml.html.fromstring(connection.read())

    # select the url in href for all a tags (links)
    links = list(dom.xpath('//a/@href'))
    # NOTE(review): the first four links are dropped - presumably page
    # navigation rather than data files; confirm against the live page.
    del links[0:4]
    return links

# Requires network access; the resulting list is handed to download_and_unzip_files.
links = connect_to_url_get_links(url)

In [5]:
# Download every linked file into DOWNLOADS_DIR and unzip it there (slow).
# NOTE(review): absolute, user-specific path - must be edited to run elsewhere.
DOWNLOADS_DIR = '/Users/sabinejoseph/Downloads/kfe-sabinejo-patch-1/Phoenix_data'

def download_and_unzip_files(down_dir, links):
    """Download each URL in ``links`` into ``down_dir`` and unzip the archives.

    A URL is skipped when a file with the same base name already exists in
    ``down_dir``. Afterwards every file in ``down_dir`` whose name ends in
    ``.txt.zip`` is extracted into ``down_dir`` (including archives that were
    already present before this call).

    Parameters
    ----------
    down_dir : directory that receives the downloads; created if missing.
    links : iterable of URL strings.

    Returns
    -------
    None
    """
    # ``urlretrieve`` lives in ``urllib`` on Python 2 and in
    # ``urllib.request`` on Python 3; import whichever is available.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Both urlretrieve and os.listdir fail on a missing directory.
    if not os.path.isdir(down_dir):
        os.makedirs(down_dir)

    for url in links:
        # Split on the rightmost / and take everything on the right side of that
        name = url.rsplit('/', 1)[-1]
        filename = os.path.join(down_dir, name)

        # Download the file if it does not exist
        if not os.path.isfile(filename):
            urlretrieve(url, filename)

    # unzip all files
    for filename in os.listdir(down_dir):
        if filename.endswith(".txt.zip"):
            with zipfile.ZipFile(os.path.join(down_dir, filename)) as zip_ref:
                zip_ref.extractall(down_dir)
                                                                                                                                                                                                                                                                                                                                                                                                                             
# Slow on a first run: fetches every linked file not already on disk, then unzips.
download_and_unzip_files(DOWNLOADS_DIR, links)

In [10]:
# load all data to df, for relevant region #lengthy!!!
# Column labels handed to the reader as `names=`, in file column order.
col_names = ('EventID', 'Date', 'Year', 'Month', 'Day', 'SourceActorFull', 'SourceActorEntity', 
             'SourceActorRole', 'SourceActorAttribute', 'TargetActorFull', 'TargetActorEntity', 
             'TargetActorRole', 'TargetActorAttribute', 'EventCode', 'EventRootCode', 
             'PentaClass', 'GoldsteinScore', 'Issues', 'Lat', 'Lon', 'LocationName', 
             'StateName', 'CountryCode', 'SentenceID', 'URLs', 'NewsSources')
# Column whose values are matched against the country-code list when filtering rows.
country_code_filter_col_name = 'SourceActorFull'

def data_to_df(down_dir, col_names, country_codes, filter_col):
    """Load the extracted text files in ``down_dir`` into one filtered DataFrame.

    For every ``*.txt.zip`` archive name in ``down_dir`` the matching extracted
    file (same name without the trailing ``.zip``) is read as a tab-separated
    table with ``col_names`` as column labels. Only rows whose ``filter_col``
    value is in ``country_codes`` are kept.

    Parameters
    ----------
    down_dir : directory holding the archives and their extracted text files.
    col_names : sequence of column labels applied to every file.
    country_codes : values of ``filter_col`` to keep.
    filter_col : name of the column used for filtering.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all files with a fresh 0..n-1 index. An empty frame
        with the ``col_names`` columns is returned when no archive is found
        (previously this case raised UnboundLocalError).
    """
    # Sorted so the row order of the result does not depend on the arbitrary
    # ordering of os.listdir.
    archive_names = sorted(name for name in os.listdir(down_dir)
                           if name.endswith(".txt.zip"))

    pieces = []
    for archive_name in archive_names:
        # Drop the trailing ".zip" to get the extracted text file's name.
        text_path = os.path.join(down_dir, archive_name[:-4])
        # `delim_whitespace=False` was the default and is a deprecated
        # keyword in recent pandas, so it is no longer passed.
        piece = pd.read_table(text_path, names=col_names)
        # Filter each file before combining: keeps peak memory low and
        # avoids re-copying the accumulated frame on every iteration.
        pieces.append(piece[piece[filter_col].isin(country_codes)])

    if not pieces:
        return pd.DataFrame(columns=list(col_names))

    return pd.concat(pieces, ignore_index=True)

# Build the filtered event frame from the extracted files (slow: reads every file).
df = data_to_df(DOWNLOADS_DIR, col_names, CC3, country_code_filter_col_name)


In [11]:
# save df as csv
# NOTE(review): absolute, user-specific path - must be edited to run elsewhere.
# `path` is used as a string prefix (path + filename), so it must end with a separator.
path = '/Users/sabinejoseph/Downloads/kfe-sabinejo-patch-1/'
csv_name = 'Phoenix_NaMo_subset.csv'

def df_to_csv(path, filename):
    """Write the notebook-level ``df`` to the CSV file ``path + filename``.

    ``path`` is concatenated with ``filename`` as a plain string, so it has to
    end with a path separator. The frame's index is written as the first
    column (pandas default).
    """
    destination = path + filename
    df.to_csv(destination)
    
# Persist the filtered events so a later session can reload them from disk.
df_to_csv(path, csv_name)   

In [12]:
#### if all already downloaded (previous steps), use the csv
def csv_to_df(path, filename):
    """Load the CSV file ``path + filename`` into a DataFrame.

    ``path`` is concatenated with ``filename`` as a plain string. The file is
    read as comma-separated with ``low_memory=False`` and the result is given
    a fresh 0..n-1 index.
    """
    csv_location = path + filename
    loaded = pd.read_csv(csv_location, sep=',', low_memory=False)
    return loaded.reset_index(drop=True)

# Replace the in-memory `df` with the contents of the saved CSV
# (uses `path` and `csv_name` as set in the save cell).
df = csv_to_df(path, csv_name)

In [13]:
# Columns removed from `df` before the analysis.
# Original note: 'id' # only keep for data cleaning
# NOTE(review): no 'id' column appears in the list below - confirm whether the note still applies.
vars_to_del = ['EventID', 'Year', 'Month', 'Day', 'SourceActorEntity',
           'SourceActorRole', 'SourceActorAttribute', 'TargetActorEntity', 'TargetActorRole', 
           'TargetActorAttribute', 'Issues', 'Lat', 'Lon', 'LocationName', 'StateName', 'CountryCode',
           'SentenceID', 'URLs']
  
def del_columns_from_df(col_names):
    """Remove the columns listed in ``col_names`` from the notebook-level ``df``.

    The frame is modified in place, one column at a time, and the same frame
    object is returned. A name that is not a column raises KeyError; columns
    listed before it have already been removed at that point.
    """
    for unwanted in col_names:
        # pop() deletes the column from df; the returned Series is not needed.
        df.pop(unwanted)
    return df

# Rebinds `df` to the same, now narrower, frame. Running this cell a second
# time raises KeyError because the columns have already been removed.
df = del_columns_from_df(vars_to_del)

In [14]:
df = df.reset_index(drop=True)
# Render every Date value as text for the strptime step below.
# The previous version cut the last two characters off unconditionally,
# presumably because Date is loaded as a float such as 20150101.0 (TODO
# confirm). That silently removes day digits whenever the column is loaded as
# int or str instead, so only a literal trailing ".0" is stripped here.
date_strings = (str(raw) for raw in df['Date'])
df['Date'] = [text[:-2] if text.endswith('.0') else text for text in date_strings]

df_datestring_column_name = 'Date'
dateformat = '%Y%m%d'

def str_to_datetime(col_name, dateformat):
    """Parse a column of the notebook-level ``df`` into ``datetime`` objects.

    Parameters
    ----------
    col_name : name of the column of ``df`` to parse.
    dateformat : ``strptime`` format string describing the column's values.

    Returns
    -------
    list of datetime.datetime
        One entry per row, in row order.

    Raises
    ------
    ValueError
        If any value (after ``str()``) does not match ``dateformat``.
    """
    # Iterate over the values directly. The previous `df[col][i]` was a
    # label lookup that only worked on a 0..n-1 index and was slow per row;
    # its `if i is not None` filter tested the loop counter and never
    # excluded anything.
    return [datetime.strptime(str(value), dateformat) for value in df[col_name]]

# Overwrite the date strings with the datetime objects parsed using `dateformat`.
df[df_datestring_column_name] = str_to_datetime(df_datestring_column_name, dateformat)


In [None]:
# filter type of crisis
# NOTE(review): the legend originally written here ("TargetActorFull # 1: state-based
# conflict # 2: non-state conflict # 3: one-sided violence") names a different column
# and lists codes 1-3, while the code keeps PentaClass values 1 and 4 - confirm the
# intended classes against the Phoenix codebook.
# .copy() detaches the kept rows from the unfiltered frame, so that adding a column to
# `df` afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df['PentaClass'].isin([1, 4])].copy()

# TargetActorEntity
# EventCode
# EventRootCode
# NewsSources #sum individual newspapers 1 or two or more, count semicolons


In [68]:
# grouping by date per country code
new_format = '%Y-%m-%d'  # strftime pattern for the formatted date strings
col_name_date = 'Date'  # first grouping key
col_name_country_codes = 'SourceActorFull'  # second grouping key
agg_col_names = ['GoldsteinScore']  # columns to aggregate per group
event_count_col_name = 'count_num_daily_events'  # name of the per-row counter column

def group_by_country_code_date_agg_sum(date, CC, col_name_list, ct_col_name, funct):
    """Aggregate the notebook-level ``df`` per (date, country code) group.

    A column ``ct_col_name`` filled with 1 is added to (or overwritten on)
    ``df``; aggregating it with a sum gives the number of events per group.
    ``funct`` is then applied to every column in ``col_name_list`` and to
    ``ct_col_name``.

    Parameters
    ----------
    date : name of the date column (first grouping key).
    CC : name of the country-code column (second grouping key).
    col_name_list : names of the columns to aggregate; not modified.
    ct_col_name : name of the counter column to create on ``df``.
    funct : aggregation passed to ``agg`` for every column, e.g. ``sum``
        or ``np.nanmedian``.

    Returns
    -------
    pandas.DataFrame
        One row per (date, CC) pair, indexed by those two keys.
    """
    df[ct_col_name] = 1
    # Build a new list instead of appending to the caller's: the previous
    # in-place append grew the shared list by one entry on every call.
    columns_to_aggregate = list(col_name_list) + [ct_col_name]
    return df.groupby([date, CC]).agg(dict.fromkeys(columns_to_aggregate, funct))
    
# Per (date, country code) sums; the summed counter column is the number of events per day.
df_agg = group_by_country_code_date_agg_sum(col_name_date, col_name_country_codes, agg_col_names, event_count_col_name, sum).reset_index()
# Item assignment is needed to create a column: `df_agg.date_start = ...` only set a
# Python attribute on the frame, so the formatted dates never reached the saved CSV.
df_agg['date_start'] = [stamp.strftime(new_format) for stamp in df_agg[col_name_date]]

# Same grouping with the median, used to replace the summed Goldstein score.
df_agg_GS = group_by_country_code_date_agg_sum(col_name_date, col_name_country_codes, agg_col_names, event_count_col_name, np.nanmedian).reset_index()
# Both frames come from the same group keys and were reset to a 0..n-1 index,
# so the index-aligned assignment pairs up matching groups.
df_agg['GoldsteinScore'] = df_agg_GS['GoldsteinScore']

In [70]:
# Write the aggregated frame under `path`.
# NOTE(review): this rebinds `csv_name`; re-running the earlier csv_to_df(path, csv_name)
# cell afterwards would load this aggregated file instead of the event-level one.
csv_name = 'Phoenix_NaMo_Agg_subset.csv'
df_agg.to_csv(path + csv_name)

In [109]:
# add column: event 0 - 1
# relations between columns

#df['TargetActorRole2'] = pd.factorize(df['TargetActorRole'])[0]
#df['SourceActorFull2'] = pd.factorize(df['SourceActorFull'])[0]

#df[['EventCode','EventRootCode']] = df[['EventCode','EventRootCode']].apply(pd.to_numeric, errors='ignore')
