## Title: Exercise 1.2 Read a fixed-width file using a dictionary
## Sanjay Jaras

### Import Libraries

In [2]:
import pandas
import numpy as np

### Define a function to read the dictionary file and return it as a pandas DataFrame

In [3]:
def readDictionaryFile(fileName):
    """Read a Stata dictionary (.dct) file describing a fixed-width layout.

    Only lines containing the text "_column" are parsed; every other line is
    skipped. A parsed line is split on whitespace and is expected to look like

        _column(<start>) <type> <name> <format> "<description>"

    fileName: string filename of the dictionary file

    returns: pandas DataFrame with one row per variable and the columns
        start   - integer offset taken from "_column(<start>)", as written in the file
        type    - Python type: str for any "str..." type, otherwise the mapped type
        name    - variable name
        fstring - format string
        desc    - remaining text of the line with enclosing double quotes stripped
        end     - start of the following variable; 0 for the last variable
      If the file has no "_column" lines the frame is empty (same columns, no rows).

    raises: ValueError if a "_column" line has a non-integer offset or fewer
        than four whitespace-separated fields; KeyError if a variable type is
        neither "str..." nor a key of the type mapping.
    """
    # Stata type name -> Python type (string types are handled separately below)
    type_map = dict(byte=int,
                    int=int,
                    long=int,
                    float=float,
                    double=float,
                    numeric=float)
    # one (start, type, name, fstring, desc) tuple per variable
    var_info = []
    with open(fileName) as f:
        for line in f:
            # skip lines that do not describe a column
            if "_column" not in line:
                continue

            splits = line.split()
            # "_column(12)" -> 12
            start = int(splits[0].replace("_column(", "").replace(")", ""))
            # variable type, column name and format string
            vtype, name, fstring = splits[1:4]
            # types such as str12 or str23 all map to str
            vtype = str if vtype.startswith('str') else type_map[vtype]

            # everything after the format string is the description
            desc = ' '.join(splits[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, desc))

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variablesDf = pandas.DataFrame(var_info, columns=columns)
    # each variable ends where the next one starts
    variablesDf['end'] = variablesDf['start'].shift(-1)
    # The last variable has no successor; mark its end with 0. Guard against an
    # empty frame: assigning to label -1 would otherwise append a spurious row.
    if len(variablesDf) > 0:
        variablesDf.loc[len(variablesDf) - 1, 'end'] = 0

    return variablesDf

### Read the fixed-width file into a pandas DataFrame using the dictionary data

In [4]:
# Parse the dictionary file into one row per variable (start, end, name, ...).
variableInfoDf = readDictionaryFile('2002FemResp.dct')
# Shift the start and end columns to 0-based positions by subtracting 1.
colspecs = variableInfoDf[['start', 'end']] - 1
# Convert to a list of [start, end] integer pairs. The builtin int is used
# because the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
colspecs = colspecs.astype(int).values.tolist()
# Read the fixed-width data file using the column positions and names from
# the dictionary.
df = pandas.read_fwf("2002FemResp.dat.gz",
                     colspecs=colspecs,
                     names=variableInfoDf.name)
# Print the number of records read from the file.
print("Number of records read:", len(df))

Number of records read: 7643


### Show the pregnancy number counts in sorted order.

In [5]:
# Count respondents per pregnum value, ordered by the value rather than by frequency.
df["pregnum"].value_counts().sort_index()

0     2610
1     1267
2     1432
3     1110
4      611
5      305
6      150
7       80
8       40
9       21
10       9
11       3
12       2
14       2
19       1
Name: pregnum, dtype: int64