# DataFrames
The result of ReadFixedWidth is a DataFrame

In [17]:
import pandas as pd
import numpy as np

In [32]:
# Read the dataset through the nsfg helper module; every later cell works on this df.
import nsfg
df = nsfg.ReadFemPreg()
# print() renders the frame as plain text; wide frames are wrapped into
# column blocks, each continued line ending with a backslash.
print(df)

       caseid  pregordr  howpreg_n  howpreg_p  moscurrp  nowprgdk  pregend1  \
0           1         1        NaN        NaN       NaN       NaN       6.0   
1           1         2        NaN        NaN       NaN       NaN       6.0   
2           2         1        NaN        NaN       NaN       NaN       5.0   
3           2         2        NaN        NaN       NaN       NaN       6.0   
4           2         3        NaN        NaN       NaN       NaN       6.0   
5           6         1        NaN        NaN       NaN       NaN       6.0   
6           6         2        NaN        NaN       NaN       NaN       6.0   
7           6         3        NaN        NaN       NaN       NaN       6.0   
8           7         1        NaN        NaN       NaN       NaN       5.0   
9           7         2        NaN        NaN       NaN       NaN       5.0   
10         12         1        NaN        NaN       NaN       NaN       5.0   
11         14         1        NaN        NaN       

In [5]:
# The column labels of the DataFrame.
df.columns
# Returns an Index data structure (244 labels for this frame)

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

In [6]:
# An Index supports positional lookup; position 1 is the second column label.
df.columns[1]

'pregordr'

In [14]:
# Can use a column name as a key to access a column in the DataFrame
pregordr = df['pregordr']
type(pregordr)
# Returns a Series

pandas.core.series.Series

In [8]:
# Evaluating a Series displays its index labels alongside its values;
# long output is elided in the middle.
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

In [11]:
# Can access a Series using indices and slices.
# This slice returns the rows labelled 2, 3 and 4 (the end position 5 is excluded).
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

In [13]:
# Can also use dot notation to access columns.
# This only works if the column name is a valid Python identifier
# (begins with a letter or underscore; no spaces; etc.).
pregordr = df.pregordr

## Transformation

Special values are dangerous and can produce bogus results (see the cell below).
If we change those values to NaN, most pandas functions will handle them appropriately.

# Arithmetic on NaN yields NaN rather than a misleading number, which is why
# special codes are converted to NaN before analysis.
# (numpy is already imported as np in the imports cell at the top of the notebook.)
np.nan / 100.0

nan

In [35]:
# Transformation
# Transform special codes (97, 98, 99) into nan, so they won't be counted as 97-99 lb babies.
# NOTE: the return value is not used; the call appears to modify df in place,
# so this cell changes what every later cell sees in df.

nsfg.CleanFemPreg(df)
df.birthwgt_lb

0        8.0
1        7.0
2        9.0
3        7.0
4        6.0
5        8.0
6        9.0
7        8.0
8        7.0
9        6.0
10       7.0
11       7.0
12       4.0
13       NaN
14       NaN
15       7.0
16       7.0
17       6.0
18       NaN
19       8.0
20       8.0
21       5.0
22       NaN
23       6.0
24       7.0
25       6.0
26       8.0
27       7.0
28       6.0
29       7.0
        ... 
13563    7.0
13564    7.0
13565    8.0
13566    7.0
13567    NaN
13568    NaN
13569    5.0
13570    6.0
13571    6.0
13572    5.0
13573    6.0
13574    6.0
13575    NaN
13576    6.0
13577    NaN
13578    6.0
13579    7.0
13580    NaN
13581    6.0
13582    NaN
13583    NaN
13584    6.0
13585    NaN
13586    NaN
13587    NaN
13588    6.0
13589    NaN
13590    NaN
13591    7.0
13592    7.0
Name: birthwgt_lb, Length: 13593, dtype: float64

## Validation

We can validate the data by computing basic statistics and comparing them with known results.

In [36]:
# Tally how many records fall under each birth-outcome code (not sorted by frequency).
df['outcome'].value_counts(sort=False)

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

In [37]:
# Tally how many records report each whole-pound birth weight (not sorted by frequency).
df['birthwgt_lb'].value_counts(sort=False)

8.0     1889
7.0     3049
6.0     2223
4.0      229
5.0      697
10.0     132
12.0      10
14.0       3
3.0       98
1.0       40
2.0       53
0.0        8
9.0      623
11.0      26
13.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

In the example (not shown here), we have an entry for:
51 -> 1

That's a 51 lb baby! Or a data error.
To deal with this, we added a line to CleanFemPreg (see below).
Note: instead of 51 lbs, we used 15 lbs as an example.

In [45]:
# .loc takes two indexers separated by a comma:
#   1. a row indexer -- here a boolean mask that is True where the weight is 15 lb or more
#   2. a column indexer -- here the single column label 'birthwgt_lb'

df.loc[df['birthwgt_lb'] >= 15, 'birthwgt_lb']

7308    15.0
Name: birthwgt_lb, dtype: float64

Series([], Name: birthwgt_lb, dtype: float64)