In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math



In [2]:
# Define FixedWidthVariables: holds the column layout taken from a Stata dictionary and uses it to read fixed-width data files into pandas

class FixedWidthVariables(object):
    """Column layout of a fixed-width file, used to load it with pandas.

    Attributes:
        variables: the DataFrame passed to the constructor, kept as-is.
        colspecs: list of [start, end] integer pairs, one per variable,
            obtained from the 'start' and 'end' columns minus index_base.
        names: the 'name' column of variables, used as column names.
    """

    def __init__(self, variables, index_base = 0):
        """Build the column specs from a table of variables.

        Args:
            variables: DataFrame with 'start', 'end' and 'name' columns.
            index_base: value subtracted from every start/end position
                (e.g. 1 when the positions are 1-based).
        """
        self.variables = variables
        self.colspecs = variables[['start','end']] - index_base
        # Use the builtin int: the np.int alias (which was just builtin int)
        # no longer exists in current NumPy releases.
        self.colspecs = self.colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Read a fixed-width file into a DataFrame.

        Args:
            filename: passed straight to pd.read_fwf.
            **options: extra keyword arguments forwarded to pd.read_fwf.

        Returns:
            The DataFrame produced by pd.read_fwf using self.colspecs and
            self.names.
        """
        df = pd.read_fwf(filename, colspecs = self.colspecs, names = self.names, **options)

        return df


In [3]:
# Define StatData: parse a Stata .dct dictionary file and extract each variable's start column, type, name, format string and description

def StatData(dct_file, **options):
    """Parse a Stata dictionary (.dct) file into a FixedWidthVariables object.

    Every line containing a ``_column(N)`` marker is treated as one variable
    definition: N is its start column, and the whitespace-separated tokens
    after the marker give its type, name, format string and description.

    Args:
        dct_file: path of the dictionary file.
        **options: extra keyword arguments forwarded to open().

    Returns:
        A FixedWidthVariables built with index_base=1 from a DataFrame with
        the columns 'start', 'type', 'name', 'fstring', 'desc' and 'end'.

    Raises:
        ValueError: if the file contains no ``_column(...)`` line.
        KeyError: if a variable has a type that is neither ``str*`` nor one
            of the known numeric type names.
    """
    # Stata 'float' is a floating-point type, so it maps to Python float.
    types = dict(byte=int, int=int, long=int, float=float, double=float)

    var_info = []
    # Context manager so the file handle is closed even if parsing fails.
    with open(dct_file, **options) as dct_lines:
        for line in dct_lines:
            row = re.search(r'_column\(([^)]*)\)', line)
            if not row:
                continue
            start = int(row.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = types[vtype]
            # Everything after the format string is the quoted description.
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    if not var_info:
        # Without this guard the .loc assignment below would create a row of
        # NaNs and the failure would only surface later as an obscure cast error.
        raise ValueError('no _column(...) definitions found in %s' % dct_file)

    columns = ['start', 'type','name','fstring','desc']
    variables = pd.DataFrame(var_info, columns = columns)

    # Each variable ends where the next one starts.
    variables['end'] = variables.start.shift(-1)
    # The last variable has no successor; its end is set to 0, which becomes
    # -1 after the index_base shift below.
    # NOTE(review): presumably -1 acts as "up to the last character of the
    # line" when slicing -- confirm against pd.read_fwf behaviour.
    variables.loc[len(variables)-1, 'end'] = 0
    dct = FixedWidthVariables(variables, index_base = 1)
    return dct


In [4]:
labels = '2002FemResp.dct'
datafile = '2002FemResp.dat.gz'

# Parse the column layout from the dictionary, then load the gzipped
# fixed-width data file with it.
dct = StatData(labels)
raw_data = dct.ReadFixedWidth(datafile, compression = 'gzip')

# Series.as_matrix() no longer exists in current pandas; .values returns the
# same underlying NumPy array and also works on old versions.
kids_in_hh = raw_data.numkdhh.values

# Size-biased sample: every value k is repeated k times, so zeros drop out
# and larger values are over-represented in proportion to k.
# Assumes numkdhh is a non-negative integer column -- TODO confirm.
biased_kids_in_hh = np.repeat(kids_in_hh, kids_in_hh)

kihh = pd.Series(kids_in_hh)
bkihh = pd.Series(biased_kids_in_hh)



In [6]:
# PMF of the observed values: relative frequency of each value in kihh.
pmf = kihh.value_counts().sort_index()/len(kihh)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(pmf)
fig = plt.figure()
ax = fig.add_subplot(111)
# Pass the axes explicitly instead of relying on the "current axes" state.
pmf.plot(kind = 'bar', ax = ax, facecolor = 'None', edgecolor = 'r', position = 0)
ax.set_title('Unbiased PMF')
ax.set_xlabel('numkdhh')
ax.set_ylabel('probability')
plt.show()

# PMF of the size-biased sample, drawn on its own figure.
biased_pmf = bkihh.value_counts().sort_index()/len(bkihh)
print(biased_pmf)
biased_fig = plt.figure()
biased_ax = biased_fig.add_subplot(111)
biased_pmf.plot(kind = 'bar', ax = biased_ax, facecolor = 'None', edgecolor = 'b', position = 0)
biased_ax.set_title('Biased PMF')
biased_ax.set_xlabel('numkdhh')
biased_ax.set_ylabel('probability')
plt.show()


print("unbiased mean {}".format(kihh.mean()))
print("biased mean {}".format(bkihh.mean()))

0    0.466178
1    0.214052
2    0.196258
3    0.087139
4    0.025644
5    0.010729
dtype: float64
1    0.208993
2    0.383240
3    0.255238
4    0.100153
5    0.052376
dtype: float64
unbiased mean 1.02420515504
biased mean 2.40367910066
