In [1]:
import re

### Grab the following fields from the codebook(s) for each question:
- Question Number
- Question
- Variable Label
- Values
- Value Labels
- Source
- Note

In [2]:
# Load the Round 6 merged codebook as a list of lines; each entry keeps
# its trailing newline.
with open('../data/Afrobarometer/merged_round_6_codebook_20161.txt', 'r') as codebook_file:
    filelines = list(codebook_file)

In [3]:
# Report how many lines were read from the codebook file.
print('Num of lines in file: {:,}'.format(len(filelines)))

Num of lines in file: 4,850


Questions to spot-check to make sure they are parsed correctly (index into `questions` in parentheses):
- Q16 (66)
- Q21 (73)
- Q24E (83)
- Q27D (96)
- Q28D_ARB (101)
- Q29a (102)
- Q29b (103)
- Q29g (108)
- Q42C (127)
- Q52A (150)
- withinwt (327)

In [4]:
# Count the lines that begin with 'Question Number:'; each one opens a
# codebook entry that has to be parsed.
num_questions = sum(1 for row in filelines if re.search('^Question Number:', row))
print('Num of questions to parse: {:,}'.format(num_questions))

Num of questions to parse: 328


In [5]:
# Split the codebook into questions: every 'Question Number:' line opens a
# new question, and the first 'Appendix 1:' line after that closes the
# question section.
questions = []    # one list of raw lines per question
started = False   # True while we are inside the question section
question = []     # lines of the question currently being collected

for line in filelines:
    if started and line.startswith('Appendix 1:'):
        # The appendix ends the question section: store the question in
        # progress and keep the appendix line itself out of any question.
        questions.append(question)
        question = []
        started = False
    elif line.startswith('Question Number:'):
        if started:
            # we are starting a new question, so wrap up the last one
            questions.append(question)
        # start a new question with its header line
        question = [line]
        started = True
    elif started:
        # add this line to the current question
        question.append(line)

if started:
    # The file ended without an 'Appendix 1:' line; without this flush the
    # final question would be silently dropped.
    questions.append(question)
    question = []
    started = False

print('Num questions gathered: {:,}'.format(len(questions)))

Num questions gathered: 328


In [8]:
# Indices of the questions singled out in the markdown list above for a
# manual parsing check.
sanity_check = [
    66,   # Q16
    73,   # Q21
    83,   # Q24E
    96,   # Q27D
    101,  # Q28D_ARB
    102,  # Q29a
    103,  # Q29b
    108,  # Q29g
    127,  # Q42C
    150,  # Q52A
    327,  # withinwt
]

# Matches a field label at the very start of a line, followed by optional
# whitespace and a colon. group(1) is the label plus any whitespace that
# precedes the colon; group(2) is the bare label.
labels = re.compile(r'^((Question Number|Question|Variable Label|Values|Value Labels|Source|Note)\s*):')

In [7]:
# Inspect the raw lines of the question at index 327 (withinwt in the
# spot-check list), which has no 'Question' or 'Values' fields.
questions[327]

['Question Number: withinwt \n',
 'Variable Label: Within country weighting factor \n',
 'Note: The weighting variable adjusts the distribution of the sample based on individual selection probabilities (i.e. based on region, gender, urban-rural distribution, and size of household and enumeration area). \n']

In [99]:
# Prototype of the per-question parser, run on a single question (index 38)
# so the resulting fields can be checked by eye.
q_dict = {}
last_label = None
current_line = ''

for line in questions[38]:
    label_match = labels.search(line)
    if label_match:
        if last_label is not None:
            # add the last label to the dictionary
            q_dict[last_label] = current_line.strip()
        last_label = label_match.group(1).strip()
        # Everything after the matched "<label>:" prefix is the value. Using
        # the end of the match (instead of len(label) + 1) stays correct when
        # whitespace separates the label from the colon, which the regex allows.
        current_line = line[label_match.end():].strip()
    else:
        # continuation line: glue it onto the value being built
        current_line = '{} {}'.format(current_line, line.strip())
# add the last label (only if the question contained a label at all, so a
# None key is never written)
if last_label is not None:
    q_dict[last_label] = current_line.strip()
print(q_dict.items())

dict_items([('Question Number', 'Q2'), ('Question', 'Which language is your home language?'), ('Variable Label', 'Q2. Language of respondent'), ('Values', '1-35, 101- 107, 141-149, 180- 197, 220-221, 260- 278, 300-315, 340-342, 381-396, 420-421,460-471, 502-518, 540-553, 581-591, 621- 653, 660-668, 702-710, 740-800, 820- 872, 900, 930- 943, 1100-1105, 1141- 1160, 1180, 1220 -1282, 1300-1305, 1420, 1460,1501,1540,1541,1620,1621,1660,1661,1662, 1700-1707, 2200-2222, 2740-2748, 9998-9999'), ('Value Labels', "-1 =Missing, 1 =English, 2 =French, 3 =Portuguese, 4 =Swahili, 5 =Arabic, 6 =Adja, 7 =Afrikaans, 8 =Arabe, 9 =Bambara, 10 =Bassa, 11 =Berber Language, 12 =Bissa, 13 =Bobo, 14 =Chewa, 15 =Dioula, 16 =Ewe, 17 =Fon, 18 ,Fulfuldé, 19 =Haoussa, 20 =Hausa, 21 =Kanuri, 22 =Kissi, 23 =Lobi, 24 =Mano, 25 =Mende, 26 =Ndau, 27 =Ndebele, 28 =Nyanja, 29 =Peulh, 30 =Sénoufo, 31 =Setswana, 32 =Tamasheq, 33 =Tonga, 34 =Venda, 35 =Yoruba, 102 =Bariba, 103 =Dendi, 105 =Otamari, 107 =Lopka, 141 =Sesarwa

In [16]:
def parse_question(question, label_pattern=None):
    """Parse the raw lines of one codebook question into a dict of fields.

    A line that starts with a recognised label followed by a colon opens a
    new field; every following line that does not start with a label is a
    continuation and is joined onto that field's value with a single space.
    If a label occurs more than once, the last occurrence wins.

    Parameters
    ----------
    question : iterable of str
        The raw lines belonging to a single question.
    label_pattern : compiled regex, optional
        Pattern that matches "<label>:" at the start of a line, with the
        label (plus any whitespace before the colon) in group 1. Defaults
        to the notebook-level ``labels`` pattern.

    Returns
    -------
    dict
        Maps each label found to its whitespace-stripped value. Lines that
        appear before the first label are ignored, and input with no label
        at all yields an empty dict.
    """
    if label_pattern is None:
        # Looked up at call time so the function uses whatever `labels`
        # is currently defined in the notebook.
        label_pattern = labels

    q_dict = {}
    last_label = None
    parts = []  # pieces of the value for `last_label`, joined when stored

    for line in question:
        label_match = label_pattern.search(line)
        if label_match:
            if last_label is not None:
                # add the last label to the dictionary
                q_dict[last_label] = ' '.join(parts).strip()
            last_label = label_match.group(1).strip()
            # Slice from the end of the match rather than len(label) + 1:
            # the pattern may allow whitespace before the colon, and the
            # colon must not leak into the value.
            parts = [line[label_match.end():].strip()]
        else:
            parts.append(line.strip())

    # add the last label, unless no label was ever seen
    if last_label is not None:
        q_dict[last_label] = ' '.join(parts).strip()
    return q_dict

In [17]:
# Run the parser over every gathered question, keeping the original order.
parsed_questions = [parse_question(raw_question) for raw_question in questions]
print('Num parsed questions: {:,}'.format(len(parsed_questions)))

Num parsed questions: 328


In [19]:
# Spot-check one parsed question: sanity_check[2] is index 83 (Q24E).
parsed_questions[sanity_check[2]]

{'Question': 'During the past year, how often have you contacted any of the following persons about some important problem or to give them your views: Traditional Leaders?',
 'Question Number': 'Q24E',
 'Source': 'Adapted from Zambia96 *Not asked in STP, MAU, CVE',
 'Value Labels': '0=Never, 1=Only once, 2=A few times, 3=Often, 9=Don’t know, 98=Refused to answer, -1=Missing',
 'Values': '0-3, 9, 98, -1',
 'Variable Label': 'Q24e. Contact traditional leader'}

In [13]:
# Collect every distinct field name that appears in any parsed question.
{field for parsed in parsed_questions for field in parsed}

{'Note',
 'Question',
 'Question Number',
 'Source',
 'Value Labels',
 'Values',
 'Variable Label'}