This notebook includes analysis of the iterative coding process.

In [1]:
import pandas as pd
import altair as alt
import numpy as np
from lib.util import displayMarkdown, getCitations

alt.renderers.enable('notebook')

pd.set_option("display.max_rows", None)  # Don't truncate rows when printing a Pandas DataFrame instance

# Import data 

## Code tree

This CSV file is computed in `codebooky.ipynb`.

In [2]:
codes = pd.read_csv('data/codes.csv')
codes.head()

Unnamed: 0,parent,name,desc,level,is_leaf,analysis
0,root,actions,Codes that describe actions journalists take t...,0,False,50
1,actions,import,How raw data is introduced into the programmin...,1,False,39
2,import,fetch,Data is retrieved from some external sources t...,2,False,6
3,fetch,pull tables out of pdf,"Use a table extraction tool, such as Tabula, t...",3,True,1
4,fetch,api request,Make a request to a web service,3,True,1


## Citations

In [3]:
citations = getCitations()
citations.head()

Unnamed: 0,journalist,year,month,date,analysis,organization,path
0,"Aisch, Gregor; Keller, Josh; Eddelbuettel, Dirk",2016,June,13,Analysis of NICS gun purchase background checks.,New YorkTimes,gunsales
1,"Aldhous, Peter",2016,September,16,"""Shy Trumpers"" polling analysis.",BuzzFeed News,2016-09-shy-trumpers
2,"Arthur, Rob",2015,July,30,Buster Posey MVP.,FiveThirtyEight,buster-posey-mvp
3,"Bi, Frank",2016,Jan,13,Uber launch cities and date.,Vox,verge-uber-launch-dates
4,"Bradshaw, Paul",2019,April,6,Lack of electric car charging points 'putting ...,BBC,electric-car-charging-points


## Code-analysis-notebook network

This notebook also uses the data exported by `codebook.ipynb` to `data/code-analysis-network.csv`, which maps codes to notebooks to analyses. I merge this data frame with the `citations` data frame to associate codes used with organizations and journalists.

In [4]:
analysisCodes = pd.read_csv('data/code-analysis-network.csv')
analysisCodes = pd.merge(analysisCodes, 
         citations[['organization', 'path', 'journalist']].drop_duplicates(),
         how='left',
         left_on='analysis',
         right_on='path')[['name', 'analysis', 'notebook', 'level', 'is_leaf', 'organization', 'journalist']]

analysisCodes.head()

Unnamed: 0,name,analysis,notebook,level,is_leaf,organization,journalist
0,pull tables out of pdf,2018-voter-registration,01_processing.ipynb,3.0,True,Baltimore Sun,"Zhang, Christine"
1,api request,california-h2a-visas-analysis,03_geocode.ipynb,3.0,True,Los Angeles Times,"Welsh, Ben"
2,query database,201901-achievementgap,build_data.R,3.0,True,Star Tribune,"Webster, MaryJo"
3,scrape web for data,us-weather-history,wunderground_scraper.py,3.0,True,FiveThirtyEight,"Olson, Randy"
4,scrape web for data,long-term-care-db,mung-3-25-scrape,3.0,True,The Oregonian,"Zarkhin, Fedor"


# Code Counts

Now group the code-analyses pairs in the `codes` data frame by code and count the number of analyses per code. Remember that the frequency column, called `freq`, is the number of analyses that contain at least one instance of that particular code.

In [5]:
nuniq = {
    'analysis': analysisCodes.analysis.nunique(),
    'journalist': analysisCodes.journalist.nunique(),
    'organization': analysisCodes.organization.nunique(),
}

codeCounts = analysisCodes.groupby(['name', 'level', 'is_leaf']) \
    .agg({
        'analysis': lambda x: round((len(set(x)) / nuniq.get('analysis')) * 100, 2),
        'journalist': lambda x: round((len(set(x)) / nuniq.get('journalist') * 100), 2),
        'organization': lambda x: round((len(set(x)) / nuniq.get('organization')) * 100, 2),
    }) \
    .rename(columns={
        'analysis': 'analysis_percent',
        'journalist': 'journalist_percent',
        'organization': 'organization_percent'
    }) \
    .reset_index()

codeCounts = pd.merge(codeCounts, codes[['name', 'desc']], how='left')

def recurse(root, pNode, func):
    root = root if root != None else pNode
    func(root, pNode)
    children = codes.loc[codes.parent == pNode]
    for child in children.name:
        recurse(root, child, func)

def markBranch(root, pnode):
    codeCounts.loc[codeCounts.name == pnode, 'is_' + root] = True

for branch in ['actions', 'strategies', 'observations', 'analysis', 'integrate']:
    codeCounts['is_' + branch] = False
    recurse(None, branch, markBranch) 

# Populate is leaf parent
codeCounts['is_leaf_parent'] = False
for parent in codes[codes.is_leaf].parent.unique():
    codeCounts.loc[codeCounts.name == parent, 'is_leaf_parent'] = True

# Bin coverage into discrete values
bins = ['abundently', 'frequently', 'occasionally', 'significantly', 'seldom', 'rarely']
codeCounts['commonness'] = pd.cut(codeCounts.journalist_percent, len(bins), labels=bins[::-1])

# # Peek at results
codeCounts.head()

Unnamed: 0,name,level,is_leaf,analysis_percent,journalist_percent,organization_percent,desc,is_actions,is_strategies,is_observations,is_analysis,is_integrate,is_leaf_parent,commonness
0,API request,3.0,True,2.0,3.03,3.85,Make a request to a web service,True,False,False,False,False,False,rarely
1,Create child table,3.0,True,2.0,3.03,3.85,A child table is a subset of the parent table ...,False,True,True,False,False,False,rarely
2,Freedom of Information data,2.0,True,2.0,3.03,3.85,Data that was obtained via FOI/FOIA requests,False,False,True,False,False,False,rarely
3,Validate data quality with domain-specific rules,3.0,True,2.0,3.03,3.85,Such as if the average temperature is higher t...,True,False,False,False,False,False,rarely
4,actions,0.0,False,102.0,103.03,103.85,Codes that describe actions journalists take t...,True,False,False,False,False,True,abundently


In [10]:
uniqCodes = analysisCodes[['name', 'is_leaf']].drop_duplicates()

displayMarkdown('There are {open} open codes and {axial} axial codes.'.format(**{
    'open': len(uniqCodes[uniqCodes.is_leaf == True]),
    'axial': len(uniqCodes[uniqCodes.is_leaf == False])
}))

There are 131 open codes and 44 axial codes

# Code Prevalence

The table below provides a way to qualitatively evaluate the prevalence of certain codes in the corpus, by the number of analyses, the number of journalists, or the number of organizations.

In [6]:
codeLevel = codeCounts[codeCounts.level > 0]

codeLevel.loc[~codeLevel.is_actions, 'category'] = 'Observation'
codeLevel.loc[codeLevel.is_actions, 'category'] = 'Action'

codeLevel = codeLevel.sort_values(['category', 'level', 'commonness', 'analysis_percent'], 
                      ascending=[True, True, False, False])

for col in ['analysis_percent', 'journalist_percent', 'organization_percent']:
    codeLevel[col] = codeLevel[col].apply(str) + '%'

codeLevel[['category', 'level', 'name', 'commonness' , 'analysis_percent', 'journalist_percent', 'organization_percent']] \
    .sort_values('level', ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,category,level,name,commonness,analysis_percent,journalist_percent,organization_percent
88,Action,1.0,import,frequently,80.0%,84.85%,88.46%
49,Observation,1.0,data acquisition,occasionally,46.0%,57.58%,57.69%
103,Observation,1.0,pain points,significantly,34.0%,42.42%,50.0%
141,Observation,1.0,strategies,significantly,32.0%,45.45%,53.85%
171,Observation,1.0,workflow building,significantly,32.0%,39.39%,42.31%
174,Observation,1.0,wrangling purpose,frequently,58.0%,69.7%,84.62%
64,Action,1.0,export,significantly,38.0%,45.45%,50.0%
8,Observation,1.0,analysis,abundently,100.0%,100.0%,100.0%
153,Action,1.0,transform,occasionally,56.0%,66.67%,69.23%
25,Action,1.0,check sanity,frequently,62.0%,72.73%,76.92%


## Analysis of integration codes

In [7]:
integrationCodes = ['union tables', 'inner join tables', 'outer join tables', 'full join tables', 'concat tables', 'use lookup table', 'cartesian product', 'self join table']
generateKeyCodes = ['create soft key', 'create a unique key']

intKeyAnalysis = analysisCodes[analysisCodes.name.isin(integrationCodes + generateKeyCodes)].copy()
intKeyAnalysis['code_group'] = 'integration'
intKeyAnalysis.loc[intKeyAnalysis.name.isin(generateKeyCodes), 'code_group'] = 'key generation'
intKeyAnalysis = intKeyAnalysis[['code_group', 'name', 'analysis']]

intKeyAnalysis

Unnamed: 0,code_group,name,analysis
121,key generation,create soft key,2016-04-republican-donor-movements
122,key generation,create soft key,2019-04-democratic-candidate-codonors
123,key generation,create soft key,california-h2a-visas-analysis
124,key generation,create soft key,2018-05-31-crime-and-heat-analysis
125,key generation,create soft key,babyname_politics
126,key generation,create a unique key,new-york-schools-assessment
127,key generation,create a unique key,2015-11-refugees-in-the-united-states
128,key generation,create a unique key,infrastructure-jobs
129,key generation,create a unique key,california-h2a-visas-analysis
130,key generation,create a unique key,prison-admissions


In [9]:
intKeyGrp = intKeyAnalysis.groupby('analysis').agg({ 
    'name': lambda names: 'create soft key' in list(names),
    'code_group': lambda grp: 'integration' in list(grp)
}).rename(columns={'name': 'has_softkey', 'code_group': 'has_integration'})

softandint = len(intKeyGrp[(intKeyGrp['has_softkey'] == True) & (intKeyGrp['has_integration'] == True)])
inttotal = len(intKeyGrp[intKeyGrp['has_integration'] == True])

print('Analysis that used integration and a softkey: {a} / {b}'.format(**{
    'a': softandint,
    'b': inttotal
}))

Analysis that used integration and a softkey: 4 / 34


In [10]:
intVsKeygenGrp = intKeyAnalysis.groupby('analysis').agg({
    'code_group': lambda x: len(x) == 2
})

print('{0:.2f}% of analyses that used integration also performed key generation.'.format((intVsKeygenGrp['code_group'].sum() / len(intVsKeygenGrp['code_group'])) * 100))

25.71% of analyses that used integration also performed key generation.
