# Parsing

This script parses raw texts into a spreadsheet.

In [2]:
import os
import re
import csv
from operator import itemgetter
from itertools import groupby

## Some helpful functions

In [3]:
def mergeLines(l):
    '''
    This function takes in a list of lines and merge broken paragraph lines 
    (merge all lines if they don't start with a number)
    '''
    i = 0
    while i < len(l):
        if not l[i][0].isdigit():
            l[i-1:i+1] = [' '.join(l[i-1:i+1])]
        else:
            i = i+1
    return(l)

In [4]:
# function to find main paragraphs numbers in each upr
def mainParagraphs(upr):
    '''
    This function takes in a upr and returns the main paragraph numbers in the 'recommendations' section.
    There are usually 2-4 main paragraphs. Sometimes I refer to these main paragraph sections as "chunks".
    '''
    firstParagraph = upr['text'][0].partition(" ")[0]
    if '.' in firstParagraph:
            firstParagraph = firstParagraph.replace(".","")
    mainParagraphs = []

    for line in upr['text']:
        paragraph = line.partition(" ")[0]
        if paragraph[-1] == '.':
            paragraph = paragraph[:-1]
                
        mainParagraphs.append(float(paragraph))
        
    # make a list of the main paragraph numbers
    mainParagraphs = set([int(n) for n in mainParagraphs if int(n)>= int(firstParagraph)])
    return mainParagraphs   

## Read in Data

This for loop will read in each UPR in a certain date (e.g. 2013) and parse it into a dictionary with `country`, `year`, and `text` keys. The `text` value will be a list of sub-paragraphs in the recommendations section of the UPR.

In [62]:
l=[]
dir = '../Data/raw-data/txts'
for file_name in os.listdir(dir):
    broken = []
    if file_name.endswith(".txt") and '2008' in file_name:
        print 'processing ' + file_name + '...'
        try:
            dic = {}
            dic['country'] = file_name[:-8]
            dic['year'] = file_name[-8:-4]
            f = open(dir + '/' + file_name,'rU')
            x = f.read() # read in text
            f.close
            x = x.split('\n') # make a list
            x = filter(None, x) # get rid of empty string items       
             
            # take only the conclusions and/or recommendations section
            ConclusionsStart = x.index([s for s in x if "conclusions and/or recommendations" in s.lower()][1]) #startin from bottom
            ConclusionsEnd = x.index([s for s in x if "conclusions and/or recommendations" in s.lower()][2]) # the last one is the disclaimer
            x = x[ConclusionsStart+1:ConclusionsEnd+1] 
            
            # get rid of the weird lines
            x = [s for s in x if '**' not in s]
            x = [s for s in x if 'recommendations have not been edited.' not in s]
            x = [s for s in x if 'recommendations will not be edited.' not in s]
            x = [s.replace('\xd2','') for s in x]
            x = [s.replace('\t','') for s in x]
            x = [s.lstrip(" ") for s in x]
            
            # get rid of footnotes:
            for s in x:
                if s.rstrip()[-1].isdigit():
                    x[x.index(s)] = s.rstrip()[:-1]
            for s in x:
                if s.rstrip()[-1].isdigit():
                    x[x.index(s)] = s.rstrip()[:-1]

            # merge lines so that each line is its own paragraph, starting with a paragraph number
            x = mergeLines(x)
                       
            # get rid of that disclaimer paragraph
            x = [s for s in x if 'endorsed by the working group' not in s.lower()]
            
            # make sure everything is a string
            dic['text'] = [str(s) for s in x]
            
            # append to list
            l.append(dic)
              
        except Exception,e:
            broken.append(file_name +str(e)) 

processing algeria2008.txt...
processing argentina2008.txt...
processing bahrain2008.txt...
processing benin2008.txt...
processing brazil2008.txt...
processing czechrepublic2008.txt...
processing ecuador2008.txt...
processing finland2008.txt...
processing france2008.txt...
processing gabon2008.txt...
processing ghana2008.txt...
processing guatemala2008.txt...
processing india2008.txt...
processing indonesia2008.txt...
processing japan2008.txt...
processing korea2008.txt...
processing mali2008.txt...
processing morocco2008.txt...
processing netherlands2008.txt...
processing pakistan2008.txt...
processing peru2008.txt...
processing philippines2008.txt...
processing poland2008.txt...
processing romania2008.txt...
processing southafrica2008.txt...
processing srilanka2008.txt...
processing switzerland2008.txt...
processing tonga2008.txt...
processing tunisia2008.txt...
processing ukraine2008.txt...
processing unitedkingdom2008.txt...
processing zambia2008.txt...


In [63]:
# check if anything broke
broken

[]

## If the document does not follow the 67.1, 67.2 format, fix it:

In [64]:
def getnumber(line):
            try:
                number = int(line.partition(".")[0])
            except:
                number= int(line.partition(" ")[0])
            return(number)

In [65]:
for upr in range(len(l)):
    x = l[upr]['text'] 
    firstParagraph = x[0].partition(".")[0]
    secondParagraph = x[1].partition(".")[0]

    if firstParagraph == secondParagraph:
        pass

    else:
        try:
            # make a list of paragraph numbers as they appear in document
            numbers = []
            for line in x:
                numbers.append(getnumber(line))   
        
            # group that list by sequential numbers
            grouped = []
            for k, g in groupby(enumerate(numbers), lambda (i,x):i-x):
                grouped.append(map(itemgetter(1), g))
            
            # tack on main paragraph number to subparagraphs
            groupedNumbers = []
            for list in grouped:
                if list[0] != 1:
                    n = list[-1]
                    groupedNumbers.append([str(i) for i in list])
                if list[0] == 1:
                    groupedNumbers.append([str(n)+'.'+str(number) for number in list])
            
            # make the list presentable
            groupedCollapsed = [item for sublist in groupedNumbers for item in sublist]
        
            # make new statements
            numberedStatements = []
            for i in range(len(x)):
                text = x[i]
                woParagraph = text[len(str(numbers[i]))+1:]
                withnewParagraph = groupedCollapsed[i] + ' ' + woParagraph
                numberedStatements.append(withnewParagraph)
    
            l[upr]['text'] = numberedStatements
        
        except Exception,e:
            print str(e)
            broken.append(l[upr]['country'] + l[upr]['year']+str(e))

In [66]:
# take a look at the first rec of the first 10 uprs
for x in l[:10]:
    print x['text'][1]

69.1   The establishment of an international round table to discuss the interrelation between security and fundamental freedoms (Kuwait). 
64.1   To pursue its efforts to counter discrimination in whatever form, in particular towards the most vulnerable sectors of the population in the follow-up to the Durban Conference and the recommendations of the Committee on the Elimination of Racial Discrimination. (Algeria, Republic of Korea, Nigeria, Mexico); 
56.1   Recommended that Benin strengthen its cooperation with special procedures of the Human Rights Council, by accepting visits, replying to communications, urgent measures as well as to questions by special procedures (Mexico); 
83.1   Continue and intensify its efforts to reduce poverty and social inequality (Belgium); 
44.1   To take all measures to prevent any kind of reappearance of Nazism and not to let any such acts go unpunished (Russian Federation); 
50.1   To increase the focus, targeted efforts and effective measures regardin

In [67]:
def parse2014(document):
    '''
    This function passes a UPR and parse the 'text' value into well structured format, 
    i.e. a nested (array-like) dictionary of 2-4 main paragraphs, with each main paragraph containing 
    the text of the paragraph, the decision it represents (consider, accept, reject, unknown), 
    and a list of specific recommendations or subparagraphs ('items') under it.
    
    For example, in one upr the returned 'paragraphs' value will contain something like:
    {128: {'decision': 'consider', 'text': string-of-paragraph, 'items': [rec 1, rec 2, ...]}
    {129: {'decision': 'accept', 'text': string-of-paragraph, 'items': [rec 1, rec 2, ...]}
    {130: {'decision': 'reject', 'text': string-of-paragraph, 'items': [rec 1, rec 2, ...]}
    '''
def parse(document):
    
    upr = document
    
    # separate upr out into sections by mainP, i.e. the support, reject, consider chunks 
    # by identifying subparagraphs that start with the main paragraph numbers.
    sections = []
    for n in mainParagraphs(upr):
        dic = {}
        dic['paragraph'] = n
        dic['text'] = [s for s in upr['text'] if s.startswith(str(n))]
        sections.append(dic)
    
    # delete all the sections with only 1 paragraph
    for dic in sections:
        if len(dic['text']) == 1:
            sections.remove(dic)
    
    # parse into main-text and items
    for dic in sections:
        dic['header'] = dic['text'][0]
        dic['items'] = dic['text'][1:]
    
    # assign a decision
    for dic in sections: 
        text = dic['header'] 
        decision = ''
        if 'implemented' in text or 'process of implementation' in text:
            decision = 'implemented'
        elif 'will be examined' in text or 'will examine' in text or "further examined" in text or "Responses to the following recommendations will be provided" in text or "will be included in the outcome report" in text or "will be provided in due course" in text or "course of the discussion" in text:
            decision = 'consider'
        elif 'not enjoy the support' in text or 'reject' in text or 'cannot be accepted' in text:
            decision = 'reject'
        elif 'support' in text and 'did not enjoy the support' not in text:
            decision = 'support'
        elif 'have been noted by' in text or 'were noted by' in text:
            decision = 'noted'
        elif 'do not reflect the current situation' in text:
            decision = 'reject'
        else:
            decision = 'unknown'
        dic['decision'] = decision
    
    # break down multiple paragraph items
    for dic in sections:
        l = []
        if dic['items'] is not None:
            for line in dic['items']:
                n = line.partition(" ")[0] # main par. (chunk) number
                d = ');' # delimiter 
                s = [e+d for e in line.split(d) if e != ""] # split by delimiter, but keep the delimiter.
                s = [e for e in s if e.strip()!=');'] # remove lines containing just the delimiter
                f = [s[0]] # start a new list
                for item in s[1:]:
                    f.append(str(n)+'.'+str(s.index(item))+item) # add items to new list, looking as it should
                l.append(f) # add newly parsed list to main list of UPR recommendations
            l = [i for subitem in l for i in subitem] # flatten list
            dic['items'] = l # assign
        
    return(sections)

In [68]:
# apply to all docs
for i in l:
    try:
        i['sections'] = parse(i)
    except Exception,e:
        print str(l.index(doc)) + str(e)

In [69]:
# check it out
print l[0]['sections'][0]['decision']
print l[0]['sections'][0]['items'][:5]

support
['69.1   The establishment of an international round table to discuss the interrelation between security and fundamental freedoms (Kuwait). );', '69.2   That Algeria share its experience in the fight against terrorism and pursue its efforts to combat poverty and promote economic, social and cultural rights with a view to achieving the Millennium Development Goals by 2015 (Sudan). );', '69.3   That Algeria take appropriate measures to address violence against children; recommends the continuation of the moratorium on death penalty (Italy). );', '69.4   That the State strengthen its efforts to protect women\xd5s rights, including by addressing cultural and social barriers, as well as legal ones; recommends that Algeria cooperate with special procedures mandate holders; recommends that steps be taken to guarantee the rights of detainees, including immediate access to a lawyer, information to families on detentions, and ensuring that judicial authorities are informed of all detenti

## Testing Testing 123

In [70]:
# Take a look at the main paragraphs
for i in l:
    for section in i['sections']:
        print section['decision']
        print section['header'] + ('\n')

### TODO ####
# export this data somehow

support
69  The recommendations formulated during the interactive dialogue have been examined by Algeria and the recommendations listed below enjoy the support of Algeria: 

support
64   The recommendations formulated during the interactive dialogue have been examined by Argentina and the recommendations listed below enjoy the support of Argentina:  

consider
56  In the course of the discussion, the following recommendations were made to Benin: 

support
83   The recommendations formulated during the interactive dialogue have been examined by Brazil and enjoy the support of Brazil: 

consider
44  In the course of the discussion, the following recommendations were made to the Czech  Republic: 

unknown
50   Finland considered the recommendations made during the interactive dialogue and listed below, and agrees to follow up on them: 

consider
60  In the course of the discussion, the following recommendations were made to France: 

consider
60   In the course of the discussion, the foll

In [71]:
# which decisions are 'unknown'?
for i in l:
    for section in i['sections']:
        if section['decision'] == 'unknown':
            print i['country'] +i['year'] + ('\n')
            print section['header'] + ('\n')

finland2008

50   Finland considered the recommendations made during the interactive dialogue and listed below, and agrees to follow up on them: 

india2008

86   In the course of the interactive dialogue the following recommendations were made:  

pakistan2008

108   Pakistan considers that recommendations contained in paragraphs 23(b) and (f), 27(b), 30(b) and (d), 43(c),  and 62(b) and (e) in chapter II above are neither universally recognized  human rights nor conform to its existing laws, pledges and commitments, and cannot accept them. 

philippines2008

60   During the first session of the Working Group, on 11 April 2008, the Philippines, taking into consideration the recommendations listed above, announced the following voluntary commitments:  (a)  To continue to develop a gender-responsive approach to issues on women and children, including in the judicial system and on violence against women and children  (b)  To continue to develop domestic legislation for further protection

In [58]:
# write paragraphs with no subitems

with open('specialpars.txt', 'a') as f:
    for i in l:
        for section in i['sections']:
            if not section['items']:
                f.write(i['country'] + i['year'] + ('\n'))
                f.write(section['header'] + ('\n\n'))


In [72]:
# make dictionaries for each individual recommendation item
# containing the text, to country, from country, and decision
# concatenate into a list.
reclist = []
for upr in l:
    for section in upr['sections']:
        for item in section['items']:
            dic = {}
            dic['to'] = upr['country']
            dic['year'] = upr['year']
            dic['decision'] = section['decision']
            dic['from'] = item.split('(')[-1].split(')')[0]
            dic['text'] = item
            reclist.append(dic)  

# uncomment to test
print len(reclist)
reclist[1]      

858


{'decision': 'support',
 'from': 'Sudan',
 'text': '69.2   That Algeria share its experience in the fight against terrorism and pursue its efforts to combat poverty and promote economic, social and cultural rights with a view to achieving the Millennium Development Goals by 2015 (Sudan). );',
 'to': 'algeria',
 'year': '2008'}

## write csv

In [60]:
#writing column headings
keys = reclist[0].keys()
keys

['text', 'to', 'decision', 'from', 'year']

In [61]:
#writing the rest
with open('../Data/year-csvs/2008.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(reclist)