In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import csv
import copy
import sklearn.linear_model
import json
import re
from sklearn import preprocessing
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
COLOR_TREAT = "#2ecc71"
COLOR_NO_TREAT = "#e74c3c"
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw UCDP peace agreement workbook into a DataFrame.
pa = pd.read_excel(io='data/ucdp-peace-agreements.xls')

To prepare the table used to display the peace agreements on our map, we clean the peace agreement dataset. First, we keep only the following features of the table.

**Peace agreement identification :**
- PAID : Peace Agreement ID
- pa_name : Peace Agreement name
- CID : Conflict ID
- Name : Conflict Name
- GWNO : Country code

**Peace agreement characteristics :**
- Inc : incompatibility (Territory, Government, Government/Territory)
- pa_date : PA signature date
- ended : flag indicating whether the PA has ended
- Duration : date on which the PA ended

**Comments on the peace agreement :**
- pa_comment : comments on the peace agreement
- c_duration :  comment on agreement duration
- Link to fulltext agreement : pdf link

**Peace agreement type characteristics :**
- Mil_prov : Behavioral conduct of the warring parties
- Pol_prov : Regulation of governmental incompatibility
- Terr_prov : Regulation of territorial incompatibility
- Justice_prov : Justice issues




**References:**
- Högbladh, Stina, 2011. “Peace agreements 1975-2011 - Updating the UCDP Peace Agreement dataset”, in Pettersson Therése & Lotta Themnér (eds.), 2012, States in Armed Conflict 2011, Uppsala University: Department of Peace and Conflict Research Report 99.
- Harbom, Lotta, Stina Högbladh and Peter Wallensteen. 2006. “Armed Conflict and Peace Agreements.” Journal of Peace Research 43(5).

In [None]:
# Columns kept from the raw peace agreement table, in output order.
cols = [
    'PAID',                        # peace agreement ID
    'GWNO',                        # country code(s)
    'CID',                         # conflict ID
    'Name',                        # conflict name
    'Inc',                         # incompatibility
    'pa_name',                     # peace agreement name
    'pa_date ',                    # signature date; the key carries a trailing space
    'pa_comment',                  # comments on the agreement
    'ended',
    'Duration',
    'c_duration',                  # comment on agreement duration
    'Link to fulltext agreement',  # pdf link
    'Mil_prov',
    'Pol_prov',
    'Terr_prov',
    'Justice_prov',
]

In [None]:
# Keep only the selected columns. The explicit copy makes pa_clean an
# independent frame, so the column assignments performed in later cells do not
# trigger pandas' SettingWithCopyWarning (chained assignment on a slice of `pa`).
pa_clean = pa[cols].copy()
# Preview a few rows instead of rendering the whole table.
pa_clean.head()

In [None]:
# Build the conflict-ID translation table from the two-column CSV file:
# the second column of each row is the key, the first column is the value.
# NOTE(review): which column holds the "old" and which the "new" ID is not
# visible from this notebook -- confirm against data/translate_conf.csv.
d = {}
# The context manager guarantees the file handle is closed; newline='' is the
# opening mode recommended for csv.reader.
with open('data/translate_conf.csv', 'r', newline='') as conf_file:
    reader = csv.reader(conf_file)
    for row in reader:
        k, v = row
        d[v] = k

# csv.reader yields strings, so the CID column is converted to str before the
# lookup. An ID missing from the table raises KeyError.
pa_clean['CID'] = pa_clean['CID'].astype(str).apply(lambda cid: d[cid])

In [None]:
# Build the GWNO lookup from the two-column CSV file: first column -> second
# column. clean2sides() reads this dictionary to translate the GWNO codes.
# Note that this rebinds `d`, replacing the conflict-ID table built earlier.
d = {}
# The context manager guarantees the file handle is closed; newline='' is the
# opening mode recommended for csv.reader.
with open('data/GWNO.csv', 'r', newline='') as gwno_file:
    reader = csv.reader(gwno_file)
    for row in reader:
        k, v = row
        d[k] = v

In [None]:
def clean2sides(sides):
    """Translate a ', '-separated string of codes through the lookup table `d`.

    Each code is replaced by ``d[code]`` and the results are joined with ','
    (no space). Leading whitespace of the joined string is removed. A code
    that is missing from `d` raises KeyError.
    """
    translated = [d[code] for code in sides.split(', ')]
    return ','.join(translated).lstrip()

In [None]:
# Translate the GWNO code(s) of every row with clean2sides and store the
# result in a new 'location' column.
pa_clean['location'] = pa_clean['GWNO'].astype(str).apply(clean2sides)
pa_clean

In [None]:
# The GWNO codes have been translated into 'location', so the raw column is
# dropped. Rebinding (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError.
pa_clean = pa_clean.drop(columns='GWNO', errors='ignore')

In [None]:
#pa_clean.location

In [None]:
# Note: Yugoslavia is not rewritten here; it is mapped to
# 'Republic of Serbia,Croatia' through the manual_matchings table.

def rmv_last_space(sides):
    """Return a cleaned copy of a comma-separated location string.

    - parenthesised parts such as ' (North Yemen)' are removed;
    - any whitespace around a comma is removed, whatever its length
      (the previous literal ', ' / ' ,' replacements left a space behind
      when several spaces surrounded a comma);
    - leading and trailing whitespace of the whole string is stripped.
    """
    sides = re.sub(r'\([^)]*\)', '', sides)
    sides = re.sub(r'\s*,\s*', ',', sides)
    return sides.strip()
# Clean every location string (parentheses and spaces around commas).
pa_clean['location'] = pa_clean['location'].apply(rmv_last_space)

In [None]:
# Manual replacements applied to location names. A replacement value may list
# several names separated by ',' (see 'Yugoslavia').
# NOTE(review): 'Grenada' -> 'Spain' and 'Comoros' -> 'Madagascar' look like
# approximations rather than renamings -- confirm they are intended.
manual_matchings = {
    'Serbia' : 'Republic of Serbia',
    'Yugoslavia' : 'Republic of Serbia,Croatia',
    'Cote D’Ivoire' : 'Ivory Coast',
    'Bosnia-Herzegovina' : 'Bosnia and Herzegovina',
    'Tanzania' : 'United Republic of Tanzania',
    'Comoros' : 'Madagascar',
    'South Yemen' : 'Yemen',
    'DR Congo' : 'Democratic Republic of the Congo',
    'Hyderabad' : 'India',
    'South Vietnam' : 'Vietnam',
    'FYR' : 'Macedonia',
    'Grenada' : 'Spain',
    'Rumania' : 'Romania',
    'Congo' : 'Republic of the Congo',
    'Guinea-Bissau' : 'Guinea Bissau',
}
def replace_names(location):
    """Apply manual_matchings to a comma-separated string of location names.

    Every name found in manual_matchings is replaced by its mapped value;
    other names are kept as they are. Replacement values that contain several
    names are split again, so duplicates are detected per individual name.
    Duplicates are removed and the order of first appearance is preserved,
    which makes the result deterministic from one run to the next.

    Returns the resulting names joined with ','.
    """
    names = []
    for raw_name in location.split(','):
        for name in manual_matchings.get(raw_name, raw_name).split(','):
            if name not in names:
                names.append(name)
    return ','.join(names)
# Apply the manual name replacements to every row.
pa_clean['location'] = pa_clean['location'].apply(replace_names)

In [None]:
# Load the GeoJSON file. The context manager closes the file handle, and the
# explicit UTF-8 encoding avoids depending on the platform's default encoding.
with open('./data/countries.geo.json', encoding='utf-8') as geo_file:
    data = json.load(geo_file)

# Build the location name -> location ID dictionary from the GeoJSON features.
loc_id_dict = dict()
for country in data['features']:
    loc_id_dict[country['properties']['name']] = country['id']
    # Print the names of the features whose id is '-99' so they can be
    # inspected (presumably a placeholder id -- TODO confirm).
    if country['id'] == '-99':
        print(country['properties']['name'])
        
def getLocationIds(locations):
    """Map a comma-separated string of location names to their ids.

    Each name is looked up in `loc_id_dict`; the ids are returned joined with
    ',' in the same order as the names. An unknown name raises KeyError.
    """
    return ','.join([loc_id_dict[name] for name in locations.split(',')])
        
# Store the ids matching each row's location names in a 'locationID' column,
# then preview the first 30 rows.
pa_clean['locationID'] = pa_clean['location'].apply(getLocationIds)
pa_clean.head(30)

In [None]:
# Clean 'Duration' column
def clean_date(date):
    """Normalise a raw 'Duration' value into a date string.

    Rules, applied in order:
    - a missing value (NaN, None, NaT or an empty/blank string) becomes
      '2099-01-01' (a missing end date means the PA is still on today);
    - anything after the first space (the time of day) is dropped;
    - a value ending with '-' (day missing) gets '01' appended;
    - a value without any '-' (year only) gets '-01-01' appended.

    Returns the cleaned string.
    """
    # A year read as a float (e.g. 1990.0) must not turn into '1990.0-01-01'.
    if isinstance(date, float) and date.is_integer():
        date = int(date)
    # Strip first so leading whitespace cannot produce an empty first token.
    date = str(date).strip()
    # fill missing values (a missing value means the PA is still on today)
    if date in ('', 'nan', 'NaT', 'None'):
        return '2099-01-01'
    # remove time
    date = date.split(' ', 1)[0]
    # add day if it's missing
    if date.endswith('-'):
        date = date + '01'
    # if there is only the year
    if '-' not in date:
        date = date + '-01-01'
    return date

# Normalise every value of the 'Duration' column.
pa_clean['Duration'] = pa_clean['Duration'].apply(clean_date)

In [None]:
# Keep a row when it has a real end date OR when the agreement has not ended;
# rows flagged as ended but without an end date are dropped.
# clean_date() replaces missing end dates with the sentinel '2099-01-01', so
# comparing against 'nan' alone never matched anything and the filter was a
# no-op. Both spellings of "missing" are tested so the cell works whether or
# not the column has already been cleaned.
print(pa_clean.shape)
pa_clean = pa_clean[
    ~pa_clean['Duration'].isin(['nan', '2099-01-01']) | (pa_clean['ended'] == 0)
].copy()  # copy: later cells modify this frame in place
pa_clean.shape

In [None]:
# Give the exported columns shorter names; 'pa_date ' loses its trailing space.
pa_clean.rename(
    columns={
        'Duration': 'enddate',
        'Link to fulltext agreement': 'link',
        'pa_date ': 'pa_date',
    },
    inplace=True,
)
pa_clean

In [None]:
# Export the cleaned table as a tab-separated file, without the index.
pa_clean.to_csv(
    './frontend/data/peace_agreements.csv',
    index=False,
    sep='\t',
)