In [1]:
import sys
import numpy as np

import bisect
import copy
import logging
import math
import random
import re

from collections import Counter
from operator import itemgetter
import numpy as np
import pandas
import scipy
from scipy import stats
from scipy import special
from scipy import ndimage

from scipy.special import gamma

from io import open

In [2]:
def ReadFemPreg(dct_file='data/2002FemPreg.dct',
                dat_file='data/2002FemPreg.dat.gz'):
    """Reads the NSFG pregnancy data.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    dct = ReadStataDct(dct_file)   
    df = dct.ReadFixedWidth(dat_file, compression='gzip')
    CleanFemPreg(df)
    return df

def ReadStataDct(dct_file, **options):
    """Reads a Stata dictionary file.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    type_map = dict(byte=int, int=int, long=int, float=float, 
                    double=float, numeric=float)

    var_info = []
    with open(dct_file, **options) as f:
        for line in f:
            match = re.search( r'_column\(([^)]*)\)', line)
            if not match:
                continue
            start = int(match.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))
            
    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pandas.DataFrame(var_info, columns=columns)

    # fill in the end column by shifting the start column
    variables['end'] = variables.start.shift(-1)
    variables.loc[len(variables)-1, 'end'] = -1

    dct = FixedWidthVariables(variables, index_base=1)
    return dct

def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    df: DataFrame
    """
    # mother's age is encoded in centiyears; convert to years
    df.agepreg /= 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan
    
    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
    df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)
    df.hpagelb.replace(na_vals, np.nan, inplace=True)

    df.babysex.replace([7, 9], np.nan, inplace=True)
    df.nbrnaliv.replace([9], np.nan, inplace=True)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0    

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df.cmintvw = np.nan
    
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame
        index_base: are the indices 0 or 1 based?

        Attributes:
        colspecs: list of (start, end) index tuples
        names: list of string variable names
        """
        self.variables = variables

        # note: by default, subtract 1 from colspecs
        self.colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pair of int
        self.colspecs = self.colspecs.astype(np.int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename

        returns: DataFrame
        """
        df = pandas.read_fwf(filename,
                             colspecs=self.colspecs, 
                             names=self.names,
                             **options)
        return df

In [3]:
df = ReadFemPreg()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.colspecs = self.colspecs.astype(np.int).values.tolist()


In [4]:
df

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.8750
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,9.1250
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,7.0000
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,6.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13588,12571,1,,,,,6.0,,1.0,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,6.1875
13589,12571,2,,,,,3.0,,,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,
13590,12571,3,,,,,3.0,,,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,
13591,12571,4,,,,,6.0,,1.0,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,7.5000


In [5]:
df.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

In [6]:
df['caseid']

0            1
1            1
2            2
3            2
4            2
         ...  
13588    12571
13589    12571
13590    12571
13591    12571
13592    12571
Name: caseid, Length: 13593, dtype: int64

In [11]:
df.caseid

0            1
1            1
2            2
3            2
4            2
         ...  
13588    12571
13589    12571
13590    12571
13591    12571
13592    12571
Name: caseid, Length: 13593, dtype: int64

In [12]:
type(df.caseid)

pandas.core.series.Series

In [8]:
preorder = df.pregordr

In [9]:
preorder[0:10]

0    1
1    2
2    1
3    2
4    3
5    1
6    2
7    3
8    1
9    2
Name: pregordr, dtype: int64

In [10]:
preorder.head(10)

0    1
1    2
2    1
3    2
4    3
5    1
6    2
7    3
8    1
9    2
Name: pregordr, dtype: int64

In [20]:
preorder.tail()

13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, dtype: int64

In [21]:
df.agepreg

0        33.16
1        39.25
2        14.33
3        17.83
4        18.33
         ...  
13588    17.91
13589    18.50
13590    19.75
13591    21.58
13592    21.58
Name: agepreg, Length: 13593, dtype: float64

In [11]:
df.agepreg

0        33.16
1        39.25
2        14.33
3        17.83
4        18.33
         ...  
13588    17.91
13589    18.50
13590    19.75
13591    21.58
13592    21.58
Name: agepreg, Length: 13593, dtype: float64

In [12]:
df.agepreg /=100

In [13]:
df.agepreg

0        0.3316
1        0.3925
2        0.1433
3        0.1783
4        0.1833
          ...  
13588    0.1791
13589    0.1850
13590    0.1975
13591    0.2158
13592    0.2158
Name: agepreg, Length: 13593, dtype: float64

In [29]:
check_for_nan = df.agepreg.isnull().values.any()

In [30]:
check_for_nan

True

In [35]:
df.birthwgt_oz

0        13.0
1        14.0
2         2.0
3         0.0
4         3.0
         ... 
13588     3.0
13589     NaN
13590     NaN
13591     8.0
13592     8.0
Name: birthwgt_oz, Length: 13593, dtype: float64

In [38]:
df.birthwgt_lb.

True

In [41]:
df.birthwgt_lb

4509

In [51]:
df[df['agepreg'].isnull()].agepreg.tail()

13425   NaN
13440   NaN
13441   NaN
13530   NaN
13577   NaN
Name: agepreg, dtype: float64

In [53]:
birthwgt_lb = df.birthwgt_lb

In [54]:
birthwgt_lb

0        8.0
1        7.0
2        9.0
3        7.0
4        6.0
        ... 
13588    6.0
13589    NaN
13590    NaN
13591    7.0
13592    7.0
Name: birthwgt_lb, Length: 13593, dtype: float64

In [63]:
df[birthwgt_lb.isin([8])]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
5,6,1,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,8.5625
7,6,3,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,8.3750
19,21,1,,,,,6.0,,1.0,,...,0,0,0,3408.342437,3965.763949,7237.122630,1,48,,8.7500
20,21,2,,,,,6.0,,1.0,,...,0,0,0,3408.342437,3965.763949,7237.122630,1,48,,8.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13539,12528,2,,,,,6.0,,1.0,,...,0,0,0,3410.699028,3801.513533,6937.382047,2,38,,8.5625
13540,12528,3,,,,,6.0,,1.0,,...,0,0,0,3410.699028,3801.513533,6937.382047,2,38,,8.8750
13546,12535,3,,,,,5.0,,1.0,,...,0,0,0,3612.891741,3860.578307,7045.169353,2,73,,8.9375
13562,12547,1,,,,,6.0,,1.0,,...,0,0,0,3453.545517,6628.022524,11499.619080,1,52,,8.1250
