In [1]:
import os
import re
import csv
from operator import itemgetter
from itertools import groupby

## Some helpful functions

In [2]:
def mergeLines(l):
    '''
    Merge broken paragraph lines back into the paragraph they belong to.

    A line that starts with a digit is treated as the start of a new paragraph.
    Every other line is treated as a continuation and is appended, separated by
    a single space, to the line before it.

    l -- list of strings. The list is modified in place.

    Returns the same list object, now holding one string per paragraph.
    A continuation line that appears before any numbered line has nothing to be
    merged into, so it is kept as an element of its own.
    '''
    merged = []
    for line in l:
        # line[:1] is '' for an empty string, so an empty line counts as a
        # continuation instead of raising IndexError like line[0] would.
        if line[:1].isdigit() or not merged:
            merged.append(line)
        else:
            merged[-1] = ' '.join([merged[-1], line])
    # write back into the caller's list so in-place behaviour is preserved
    l[:] = merged
    return(l)

In [3]:
# function to find main paragraphs numbers in each upr
def mainParagraphs(upr):
    '''
    Return the main paragraph numbers found in the 'recommendations' section of a upr.
    There are usually 2-4 main paragraphs. Sometimes I refer to these main paragraph
    sections as "chunks".

    upr -- dictionary whose 'text' value is a list of lines, each starting with a
           paragraph number such as "136.", "136" or "136.1".

    Every line contributes the integer part of its leading number ("136.1" -> 136).
    Numbers smaller than the one on the first line are discarded.

    Returns a set of ints.
    Raises IndexError if upr['text'] is empty and ValueError if a line does not
    start with a number.
    '''
    def leadingNumber(line):
        token = line.partition(" ")[0]
        if token.endswith('.'):
            token = token[:-1]
        # going through float() lets "136.1" be read as main paragraph 136
        return int(float(token))

    numbers = [leadingNumber(line) for line in upr['text']]

    # The first line is parsed exactly like every other line. Deleting all dots
    # instead would turn a first token such as "136.1" into 1361 and filter out
    # every real paragraph number.
    firstParagraph = numbers[0]

    # make a set of the main paragraph numbers
    return set(n for n in numbers if n >= firstParagraph)

## Read in Data

This for loop reads in each UPR from a given year (e.g. 2013) and parses it into a dictionary with *country*, *year*, and *text* keys. The *text* value is a list of the paragraphs in the recommendations section of the UPR, one string per paragraph.

In [4]:
l=[]
# Failures are collected once for the whole run. Creating this list inside the
# loop would wipe earlier failures every time a new file is visited.
broken = []
dir = 'raw-data/txts' # NOTE: shadows the builtin dir(); name kept in case other cells rely on it
for file_name in os.listdir(dir):
    # NOTE(review): the recorded output below lists 2014 files while this filter
    # selects 2013 -- confirm which year is intended before re-running.
    if file_name.endswith(".txt") and '2013' in file_name:
        print('processing ' + file_name + '...')
        try:
            dic = {}
            # file names look like <country><yyyy>.txt
            dic['country'] = file_name[:-8]
            dic['year'] = file_name[-8:-4]
            # the with-block guarantees the file handle is closed
            with open(dir + '/' + file_name,'rU') as f:
                x = f.read() # read in text
            x = x.split('\n') # make a list
            x = [s for s in x if s] # get rid of empty string items

            # take only the conclusions and/or recommendations section.
            # Positions are collected directly so that two identical heading
            # lines cannot be confused with each other. The section runs from
            # the line after the second match up to and including the third
            # match (the disclaimer, which is filtered out further down).
            headings = [i for i, s in enumerate(x) if "conclusions and/or recommendations" in s.lower()]
            ConclusionsStart = headings[1]
            ConclusionsEnd = headings[2]
            x = x[ConclusionsStart+1:ConclusionsEnd+1]

            # get rid of the weird lines
            x = [s for s in x if '**' not in s]
            x = [s for s in x if 'recommendations have not been edited.' not in s]
            x = [s for s in x if 'recommendations will not be edited.' not in s]
            x = [s.replace('\xd2','') for s in x]
            x = [s.replace('\t','') for s in x]
            x = [s.lstrip(" ") for s in x]

            # get rid of footnotes: drop up to two trailing digits from each line.
            # Each line is handled on its own, so a line can never be mistaken
            # for an earlier identical one, and blank lines cannot raise.
            cleaned = []
            for s in x:
                for attempt in range(2):
                    stripped = s.rstrip()
                    if stripped and stripped[-1].isdigit():
                        s = stripped[:-1]
                cleaned.append(s)

            # lines left without any text would break the merge step below
            x = [s for s in cleaned if s.strip()]

            # merge lines so that each line is its own paragraph, starting with a paragraph number
            x = mergeLines(x)

            # get rid of that disclaimer paragraph
            x = [s for s in x if 'endorsed by the working group' not in s.lower()]

            # make sure everything is a string
            dic['text'] = [str(s) for s in x]

            # append to list
            l.append(dic)

        except Exception as e:
            # best effort: remember the failure and carry on with the next file
            broken.append(file_name +str(e))

processing afghanistan2014.txt...
processing albania2014.txt...
processing angola2014.txt...
processing bhutan2014.txt...
processing bolivia2014.txt...
processing bosniaandherzegovina2014.txt...
processing bruneidarussalam2014.txt...
processing cambodia2014.txt...
processing chile2014.txt...
processing comoros2014.txt...
processing costarica2014.txt...
processing cotedivoire2014.txt...
processing cyprus2014.txt...
processing democraticpeoplesrepublicofkorea2014.txt...
processing democraticrepublicofthecongo2014.txt...
processing dominica2014.txt...
processing dominicanrepublic2014.txt...
processing egypt2014.txt...
processing elsalvador2014.txt...
processing eritrea2014.txt...
processing ethiopia2014.txt...
processing fiji2014.txt...
processing gambia2014.txt...
processing guinea2014.txt...
processing iran2014.txt...
processing iraq2014.txt...
processing italy2014.txt...
processing kazakhstan2014.txt...
processing Macedonia2014.txt...
processing madagascar2014.txt...
processing newzeal

In [5]:
# check if anything broke: the loading loop appends file_name + str(e) to
# `broken` when a file raises, so an empty list means no failure is recorded here
broken

[]

## If the document does not follow the 67.1, 67.2 format, fix it:

In [6]:
def getnumber(line):
    '''
    Return the leading paragraph number of a line as an int.

    The text before the first "." is tried first ("12. text" -> 12,
    "67.1 text" -> 67). If that is not an integer, the text before the
    first space is used instead ("12 text. More" -> 12).

    Raises ValueError if neither candidate is an integer.
    '''
    try:
        number = int(line.partition(".")[0])
    except ValueError:
        # only a failed conversion triggers the fallback; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate
        number = int(line.partition(" ")[0])
    return(number)

In [7]:
# Documents that number their recommendations 1, 2, 3, ... under each main
# paragraph are rewritten so that every line starts with "<main>.<sub>".
for upr in range(len(l)):
    x = l[upr]['text']
    try:
        # When the first two lines share the text before their first ".",
        # the document is left untouched.
        firstParagraph = x[0].partition(".")[0]
        secondParagraph = x[1].partition(".")[0]
        if firstParagraph == secondParagraph:
            continue

        # make a list of paragraph numbers as they appear in document
        numbers = [getnumber(line) for line in x]

        # group that list by sequential numbers: inside a run of consecutive
        # integers, position minus value stays constant
        grouped = []
        for offset, run in groupby(enumerate(numbers), lambda pair: pair[0] - pair[1]):
            grouped.append([number for position, number in run])

        # tack on main paragraph number to subparagraphs
        mainNumber = None # reset per document so nothing leaks from the previous one
        groupedNumbers = []
        for group in grouped:
            if group[0] != 1:
                # a run that does not restart at 1 holds main paragraph numbers
                mainNumber = group[-1]
                groupedNumbers.append([str(number) for number in group])
            else:
                if mainNumber is None:
                    raise ValueError('subparagraphs appear before any main paragraph number')
                groupedNumbers.append([str(mainNumber)+'.'+str(number) for number in group])

        # make the list presentable
        groupedCollapsed = [item for sublist in groupedNumbers for item in sublist]

        # make new statements: drop the old number plus the one character
        # that follows it, then prefix the new label
        numberedStatements = []
        for text, number, label in zip(x, numbers, groupedCollapsed):
            woParagraph = text[len(str(number))+1:]
            numberedStatements.append(label + ' ' + woParagraph)

        l[upr]['text'] = numberedStatements

    except Exception as e:
        # best effort: report the document and leave its text unchanged
        print(str(e))
        broken.append(l[upr]['country'] + l[upr]['year']+str(e))

In [8]:
# print the second entry of 'text' for every upr in l
for x in l:
    second_line = x['text'][1]
    print(second_line)

136.1 To further build up on its effort to fully protect human rights in the country (Ethiopia); 
104.1 Sign and ratify the Optional Protocol to the ICESCR (Spain); 
134.1 Promptly ratify and implement the human rights conventions signed in September 2013 (United Kingdom of Great Britain and Northern Ireland);  
118.1 Ratify the Optional Protocol to the Convention on the Rights of the Child on the sale of children, child prostitution and child pornography and the Optional Protocol to the Convention on the Rights of the Child on the involvement of children in armed conflict (Chad); 
113.1 Incorporate the Rome Statute into national law (Mexico); 
107.1 Ratify the Optional Protocol to the Convention on the Rights of the Child on a communications procedure (Portugal); 
113.1 Ratify the other international instruments on human rights that the country is not yet a party to (Argentina); 
118.1 Ratify the first Optional Protocol to the International Covenant on Civil and Political Rights and s

In [9]:
def _leadingParagraphNumber(line):
    '''Return the integer part of the paragraph number that starts a line ("115.7 ..." -> 115).'''
    token = line.partition(" ")[0]
    if token.endswith('.'):
        token = token[:-1]
    return int(float(token))


def _classifyDecision(text):
    '''Map the text of a main paragraph to a decision label. The first matching rule wins.'''
    if 'implemented' in text or 'process of implementation' in text:
        return 'implemented'
    considerPhrases = (
        'will be examined',
        'will examine',
        "further examined",
        "Responses to the following recommendations will be provided",
        "will be included in the outcome report",
        "will be provided in due course",
        "course of the discussion",
    )
    if any(phrase in text for phrase in considerPhrases):
        return 'consider'
    if 'not enjoy the support' in text or 'reject' in text or 'cannot be accepted' in text:
        return 'reject'
    # "did not enjoy the support" was already caught by the rule above
    if 'support' in text:
        return 'support'
    if 'have been noted by' in text or 'were noted by' in text:
        return 'noted'
    if 'do not reflect the current situation' in text:
        return 'reject'
    return 'unknown'


def _splitItems(line):
    '''
    Split a line that holds several recommendations, each closed by ");", into one
    string per recommendation. The first keeps the line's own number; the others
    are labelled <number>.1, <number>.2, ...
    '''
    d = ');' # delimiter
    n = line.partition(" ")[0] # number of the line being split
    parts = line.split(d)
    # Every part except the last was followed by the delimiter, so it gets it back.
    fragments = [part + d for part in parts[:-1] if part.strip()]
    # Text after the last delimiter never had one; keep it as it is.
    if parts[-1].strip():
        fragments.append(parts[-1])
    if not fragments:
        return []
    items = [fragments[0]]
    # enumerate gives each fragment its own position even when two are identical
    for position, fragment in enumerate(fragments[1:], 1):
        items.append(str(n)+'.'+str(position)+fragment)
    return items


def parse2014(document):
    '''
    This function takes a UPR and parses its 'text' value into a well structured format,
    i.e. a nested (array-like) dictionary of 2-4 main paragraphs, with each main paragraph containing
    the text of the paragraph, the decision it represents, and a list of specific
    recommendations or subparagraphs ('items') under it.

    For example, in one upr the returned 'paragraphs' value will contain something like:
    {128: {'decision': 'consider', 'text': string-of-paragraph, 'items': [rec 1, rec 2, ...]}
    {129: {'decision': 'support', 'text': string-of-paragraph, 'items': [rec 1, rec 2, ...]}
    {130: {'decision': 'reject', 'text': string-of-paragraph, 'items': [rec 1, rec 2, ...]}

    'decision' is one of 'implemented', 'consider', 'reject', 'support', 'noted' or 'unknown'.
    'items' is None for a main paragraph that has no subparagraphs.

    document -- dictionary with a 'text' list whose lines start with a paragraph number.

    The dictionary is modified in place ('mainP' and 'paragraphs' are added and 'text'
    is removed) and is also returned. It is only touched once parsing has succeeded,
    so a failing document is left as it was.
    '''
    upr = document

    # find the main paragraph numbers, i.e. Paragraphs 123, 124, 125.
    mainP = mainParagraphs(upr)

    # separate upr out into chunks by mainP, i.e. the support, reject, consider chunks.
    # Lines are matched on their parsed number, so paragraph 13 cannot pick up
    # the lines of paragraph 136.
    chunks = dict((n, []) for n in mainP)
    for line in upr['text']:
        n = _leadingParagraphNumber(line)
        if n in chunks:
            chunks[n].append(line)

    # parse each chunk into paragraph text, decision and subitems
    paragraphs = {}
    for key in chunks:
        statements = chunks[key]
        dic = {}
        dic['text'] = statements[0] # the main text is always the first line of the chunk
        dic['decision'] = _classifyDecision(dic['text'])
        if len(statements) > 1:
            # break down lines that hold several recommendations
            items = []
            for line in statements[1:]:
                items.extend(_splitItems(line))
            dic['items'] = items
        else:
            dic['items'] = None # some paragraphs have no subitems.
        paragraphs[key] = dic

    upr['mainP'] = mainP
    upr['paragraphs'] = paragraphs
    del upr['text']

    return(upr)

In [10]:
# apply function to the uprs.
newlist = []
# enumerate gives the position directly; l.index(doc) would rescan the list and
# return the first equal document, which may not be the one that failed
for index, doc in enumerate(l):
    try:
        newlist.append(parse2014(doc))
    except Exception as e:
        # best effort: the document is skipped, reported, and recorded in
        # `broken` in the same country + year + message form used above
        print(str(index) + str(e))
        broken.append(doc['country'] + doc['year'] + str(e))

In [11]:
# check it out: display the last entry of newlist (the last upr that parsed successfully)
newlist[-1]

{'country': 'yemen',
 'mainP': {115, 116, 117},
 'paragraphs': {115: {'decision': 'support',
   'items': ['115.1 Ratify the Rome Statute of the ICC (Botswana );',
    '115.2 Accelerate its process to ratify the Rome Statute (Republic of Korea);',
    '115.3 Ratify the Rome Statute that Yemen signed in 2000 and align legislation with all the obligations related to this text (France);',
    '115.4 Ratify the Rome Statute and take necessary measures to ensure its implementation in the national legislation (Switzerland);',
    '115.5 Ratify/accede to the Rome Statute of the International Criminal Court and to implement it fully at national level and to accede to the Agreement on Privileges and Immunities of the Court (Slovakia);',
    '115.6 Accelerate the legislative process for the rapid accession to the Rome Statute of the ICC and the Agreement on Privileges and Immunities of the ICC (Uruguay);',
    '115.7 Ratify the Rome Statute of the International Criminal Court and fully align its 

## Testing Testing 123

In [12]:
# Print the decision label of every main paragraph, followed by its text
for x in newlist:
    paragraphs = x['paragraphs']
    for paragraph in paragraphs:
        entry = paragraphs[paragraph]
        print(entry['decision'])
        print(entry['text'] + '\n')

### TODO ####
# export this data somehow

support
136. The recommendations formulated during the interactive dialogue and listed below have been examined by Afghanistan and enjoy its support: 

consider
137. The following recommendations will be examined by Afghanistan, which will provide responses in due time, but no later than the twenty-sixth session of the Human Rights Council in June 2014: 

reject
138. The recommendations below did not enjoy the support of Afghanistan: 

support
104. The recommendations formulated during the interactive dialogue and listed below have been examined by Albania and enjoy the support of Albania: 

implemented
105.  The following recommendations enjoy the support of Albania, which considers that they are in the process of implementation: 

consider
106.  The following recommendations will be examined by Albania, which will provide responses in due time, but no later than the twenty-seventh session of the Human Rights Council, in September 2014: 

support
134. The recommendations formulated du

In [13]:
# list every main paragraph whose decision could not be classified ('unknown')
for x in newlist:
    for paragraph in x['paragraphs']:
        entry = x['paragraphs'][paragraph]
        if entry['decision'] != 'unknown':
            continue
        print(x['country'] + x['year'] + '\n')
        print(entry['text'] + '\n')

In [14]:
# write paragraphs with no subitems
# NOTE: the code below is wrapped in a triple-quoted string, so it is NOT executed;
# the cell only evaluates to that string. Without the quotes it would append, for
# every main paragraph whose 'items' is None, the country + year and the paragraph
# text to specialpars.txt.
'''
with open('specialpars.txt', 'a') as f:
    for upr in newlist:
        for paragraph in upr['paragraphs']:
            if upr['paragraphs'][paragraph]['items'] is None:
                f.write(upr['country'] + upr['year'] + ('\n'))
                f.write(upr['paragraphs'][paragraph]['text'] + ('\n\n'))
'''

"\nwith open('specialpars.txt', 'a') as f:\n    for upr in newlist:\n        for paragraph in upr['paragraphs']:\n            if upr['paragraphs'][paragraph]['items'] is None:\n                f.write(upr['country'] + upr['year'] + ('\n'))\n                f.write(upr['paragraphs'][paragraph]['text'] + ('\n\n'))\n"

In [15]:
# make dictionaries for each individual recommendation item, containing the text, to country, from country, and decision.
# The per-upr list is called `recs`: naming it `l` would overwrite the
# notebook-wide list of loaded documents and break re-running earlier cells.
for upr in newlist:
    recs = []
    for key in upr['paragraphs'].keys():
        section = upr['paragraphs'][key]
        if section['items'] is not None:
            for item in section['items']:
                dic = {}
                dic['to'] = upr['country']
                dic['year'] = upr['year']
                dic['decision'] = section['decision']
                # text between the last "(" and the next ")" of the item
                dic['from'] = item.split('(')[-1].split(')')[0]
                dic['text'] = item
                recs.append(dic)
    upr['recs'] = recs

In [16]:
# number of recommendation dictionaries built for the last upr in newlist
len(newlist[-1]['recs'])

191

In [17]:
# flatten the per-upr 'recs' lists into one list of recommendation dictionaries
recslist = [rec for doc in newlist for rec in doc['recs']]
len(recslist)

7616

## write csv

In [18]:
# collect the column headings: the keys of the first recommendation dictionary.
# Nothing is written yet; `keys` is passed to the csv writer in the next cell.
keys = recslist[0].keys()
keys

['text', 'to', 'decision', 'from', 'year']

In [19]:
# write the header row, then one csv row per recommendation dictionary
with open('2013data.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    for rec in recslist:
        dict_writer.writerow(rec)