# In [15]:
"""This file contains code to read the NSFG respondent file,
2002FemResp.dat.gz, and to validate it against the pregnancy file.

Author: Saurabh Biswas
"""

from __future__ import print_function, division

import sys
import numpy as np
import thinkstats2
import nsfg

from collections import defaultdict


def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Loads the NSFG respondent file into a DataFrame.

    dct_file: string name of the dictionary file handed to
              thinkstats2.ReadStataDct
    dat_file: string name of the gzip-compressed data file
    nrows: forwarded unchanged to the fixed-width reader (default None)

    returns: DataFrame, after CleanFemResp has been applied to it
    """
    dictionary = thinkstats2.ReadStataDct(dct_file)
    respondents = dictionary.ReadFixedWidth(
        dat_file,
        compression='gzip',
        nrows=nrows,
    )

    # recode in place before handing the frame back
    CleanFemResp(respondents)
    return respondents


def CleanFemResp(df):
    """Placeholder for recoding variables in the respondent frame.

    No recoding is performed at present; the frame is left untouched.

    df: DataFrame

    returns: None
    """
    return None



def ValidatePregnum(resp_df):
    """Validates pregnum in the respondent file against the pregnancy file.

    For every respondent, the pregnum value must equal the number of
    records the pregnancy file holds for that respondent's caseid.

    resp_df: respondent DataFrame with `caseid` and `pregnum` columns

    returns: True if every respondent matches, False at the first mismatch
    """
    # read pregnancy file
    preg_df = nsfg.ReadFemPreg()

    # make the map from caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(preg_df)

    # Walk the two columns in lockstep.  This avoids Series.iteritems(),
    # which was removed in pandas 2.0, and the per-row label lookup
    # resp_df.caseid[index] that the old loop needed.
    for caseid, pregnum in zip(resp_df.caseid, resp_df.pregnum):
        # NOTE(review): respondents with no pregnancies rely on preg_map
        # yielding an empty list for an unseen caseid (presumably a
        # defaultdict(list)) -- confirm against nsfg.MakePregMap.
        indices = preg_map[caseid]

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            return False

    return True


def main():
    """Runs sanity checks on the respondent file.

    Reads the respondent frame, checks its length and the frequency
    table of pregnum, prints that table, and then cross-checks pregnum
    against the pregnancy file.
    """
    respondents = ReadFemResp()

    assert len(respondents) == 7643

    # frequency of each pregnum value, computed once and reused below
    pregnum_counts = respondents.pregnum.value_counts()
    assert pregnum_counts.max() == 2610
    assert pregnum_counts.min() == 1
    print(pregnum_counts.sort_index())

    # pregnum for each respondent must match the number of records
    # for that respondent in the pregnancy file
    assert ValidatePregnum(respondents)

    print('All tests passed.')


# Run the module's self-checks only when executed as a script, not on import.
if __name__ == '__main__':
    main()


# Output of running this module:
#
# 0     2610
# 1     1267
# 2     1432
# 3     1110
# 4      611
# 5      305
# 6      150
# 7       80
# 8       40
# 9       21
# 10       9
# 11       3
# 12       2
# 14       2
# 19       1
# Name: pregnum, dtype: int64
# All tests passed.
