# Codebook

In [1]:
import os, glob, re
import pandas as pd

In [2]:
ptrn = os.path.join('.', 'data', '**', '*.txt')
data = pd.DataFrame(columns=['org', 'article', 'analysis', 'level', 'cell', 'code'])
rep = lambda s, n: [ s for i in range(n) ]
for fn in glob.iglob(ptrn, recursive=True):
    org, article, analysis = fn.split('/')[2:5]
    with open(fn, encoding='latin-1') as f:
        txt = f.read()
    txt = re.sub(r'([HL]L)', r'\n\1', txt)
    codes = re.findall(r'([HL]L)\s\[([^\]]+)\]\s([^\n]+)\n', txt)
    df = pd.DataFrame({
        'org': rep(org, len(codes)),
        'article': rep(article, len(codes)),
        'analysis': rep(analysis[22:-3] + 'pdf', len(codes)),
        'level': [ c[0] for c in codes ],
        'cell': [ c[1] for c in codes ],
        'code': [ c[2].strip() for c in codes ]
    })
    data = data.append(df)

data.head()

Unnamed: 0,org,article,analysis,level,cell,code
0,buzzfeednews,2019-04-democratic-candidate-codonors,analyze-campaign-codonors.html.pdf,HL,paragraph 1,Analyze government data
1,buzzfeednews,2019-04-democratic-candidate-codonors,analyze-campaign-codonors.html.pdf,HL,paragraph 1,Look for trends in quarterly data
2,buzzfeednews,2019-04-democratic-candidate-codonors,analyze-campaign-codonors.html.pdf,LL,2,Load political candidate data
3,buzzfeednews,2019-04-democratic-candidate-codonors,analyze-campaign-codonors.html.pdf,LL,4,Create reusable data wrangling subroutine
4,buzzfeednews,2019-04-democratic-candidate-codonors,analyze-campaign-codonors.html.pdf,LL,4,Import .fec (ASCII) file


Count the number of individual codes per article

In [3]:
data.groupby(['org', 'article'])['article'].count() \
    .to_frame('code counts') \
    .reset_index() \
    .sort_values('code counts', ascending=False)

Unnamed: 0,org,article,code counts
0,TheOregonian,long-term-care-db,59
2,la_times,california-ccscore-analysis,41
1,buzzfeednews,2019-04-democratic-candidate-codonors,36
3,la_times,california-crop-production-wages-analysis,34
4,la_times,census-hard-to-map-analysis,11


Calculate the frequency of codes across articles

In [4]:
data.groupby(['level', 'code']).count()['org'] \
    .to_frame('count') \
    .sort_values(['level', 'count'], ascending=False) \
    .reset_index()

Unnamed: 0,level,code,count
0,LL,Peek at data,6
1,LL,Count total rows,3
2,LL,Output intermediate results,3
3,LL,Count total records,2
4,LL,"Create plot, histogram with annotations",2
5,LL,Export intermediate results,2
6,LL,Find the maximum values,2
7,LL,Subset dataset by explicit columns,2
8,LL,Adjust for inflation,1
9,LL,Adjust table display,1
