# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [3]:
from __future__ import print_function, division

import nsfg

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [2]:
preg = nsfg.ReadFemPreg()
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [3]:
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

Select a single column name.

In [4]:
preg.columns[1]

'pregordr'

Select a column and check what type it is.

In [5]:
pregordr = preg['pregordr']
type(pregordr)

pandas.core.series.Series

Print a column.

In [6]:
pregordr

0        1
1        2
2        1
3        2
4        3
        ..
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

Select a single element from a column.

In [7]:
pregordr[0]

1

Select a slice from a column.

In [8]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [9]:
pregordr = preg.pregordr

Count the number of times each value occurs.

In [10]:
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [11]:
preg.birthwgt_lb.value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [12]:
caseid = 10229
preg_map = nsfg.MakePregMap(preg)
indices = preg_map[caseid]
preg.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [14]:
# Solution goes here
preg['birthord'].value_counts().sort_index()

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

We can also use `isnull` to count the number of nans.

In [15]:
preg.birthord.isnull().sum()

4445

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [39]:
# agg pregnancies duration of completed pregnacies 13 weeks or less
mask = preg['prglngth']<= 13
lt_13 = preg.loc[mask].count()
# agg pregnancies duration of completed pregnacies less than 14 and 26 weeks
mask = (preg['prglngth'] > 13) & (preg['prglngth'] <= 26)
bt_14_26 = preg.loc[mask].count()
# agg pregnancies duration of completed pregnacies 27 wees or longer
mask = preg['prglngth']>26
gt_26 = preg.loc[mask].count()

print(' less than 13 weeks:',lt_13[0], '\n between 14 and 16', bt_14_26[0],
     '\n 27 weeks or longer', gt_26[0])


 less than 13 weeks: 3522 
 between 14 and 16 793 
 27 weeks or longer 9278


To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [40]:
preg.totalwgt_lb.mean()

7.265628457623368

Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [48]:
# Solution goes here
preg['totalwgt_kg'] = preg['totalwgt_lb']*0.4535924
preg['totalwgt_kg'].mean()

3.2956338496017112

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [5]:
resp = nsfg.ReadFemResp()


Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,totincr_i,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,intvlngth
count,7643.0,7643.0,7643.0,7643.0,7643.0,7636.0,7643.0,7643.0,7643.0,7643.0,...,7643.0,7643.0,7643.0,7643.0,7643.0,7643.0,7643.0,7643.0,7643.0,7643.0
mean,6272.107811,2.589952,4.901086,3.210258,4.207641,4.54374,29.501374,29.503729,872.764098,29.503729,...,0.082428,0.020411,4165.454807,5285.022176,8054.52241,1.48986,44.574905,1232.7832,1220.7832,82.755558
std,3638.824134,1.957642,0.621227,1.453237,1.594345,0.915728,8.431868,8.433187,101.161818,8.433187,...,0.275034,0.14141,3210.203026,4506.081386,7342.734883,0.49993,24.239897,2.978545,2.978545,32.618328
min,1.0,1.0,1.0,1.0,1.0,1.0,15.0,15.0,688.0,15.0,...,0.0,0.0,64.577101,71.201194,118.65679,1.0,1.0,1225.0,1213.0,13.512667
25%,3105.5,1.0,5.0,2.0,5.0,4.0,22.0,22.0,787.0,22.0,...,0.0,0.0,2418.149898,2900.050198,4175.880599,1.0,25.0,1231.0,1219.0,60.527833
50%,6275.0,1.0,5.0,3.0,5.0,5.0,30.0,30.0,873.0,30.0,...,0.0,0.0,3410.500996,4212.06952,6553.169964,1.0,45.0,1233.0,1221.0,78.180833
75%,9442.0,5.0,5.0,4.0,5.0,5.0,37.0,37.0,959.5,37.0,...,0.0,0.0,4870.774756,6033.357685,9590.027246,2.0,65.0,1235.0,1223.0,99.281
max,12571.0,5.0,5.0,8.0,5.0,5.0,44.0,44.0,1058.0,44.0,...,1.0,1.0,99707.832014,157143.686687,261879.953864,2.0,84.0,1239.0,1227.0,306.238


`DataFrame` provides a method `head` that displays the first five rows:

AssertionError: 

Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [54]:
# Solution goes here
# youngest is 15, oldest is 44
resp['age_r'].value_counts().sort_index()

15    217
16    223
17    234
18    235
19    241
20    258
21    267
22    287
23    282
24    269
25    267
26    260
27    255
28    252
29    262
30    292
31    278
32    273
33    257
34    255
35    262
36    266
37    271
38    256
39    215
40    256
41    250
42    215
43    253
44    235
Name: age_r, dtype: int64

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [20]:
resp[resp.caseid==2298]


Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.799490,4744.191350,2,18,1233,1221,16:30:59,64.294000
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.799490,4744.191350,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7638,11018,1,5,2,5,3.0,34,34,811,34,...,0,3247.445399,3784.333145,6565.818007,2,76,1228,1216,15:57:38,82.907333
7639,6075,5,5,3,5,3.0,17,17,1014,17,...,0,2273.211779,2497.234491,4392.385746,2,76,1228,1216,18:23:53,54.044833
7640,5649,1,5,2,5,5.0,29,29,873,29,...,0,3247.445399,3569.313710,6003.228729,2,76,1228,1216,18:42:41,68.168000
7641,501,5,5,3,5,2.0,16,16,1034,16,...,0,5304.160818,5954.644352,10473.623950,2,76,1228,1216,16:02:45,32.717333


And we can get the corresponding rows from `preg` like this:

In [56]:
preg[preg.caseid==2298]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
2610,2298,1,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118448
2611,2298,2,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,5.5,2.494758
2612,2298,3,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,4.1875,1.899418
2613,2298,4,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118448


How old is the respondent with `caseid` 1?

In [19]:
# Solution goes here
resp[resp['caseid']==2298]['pregnum']

0    4
Name: pregnum, dtype: int64

What are the pregnancy lengths for the respondent with `caseid` 2298?

In [60]:
# Solution goes here
preg[preg['caseid']==2298]['prglngth']

2610    40
2611    36
2612    30
2613    40
Name: prglngth, dtype: int64

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [65]:
# Solution goes here
preg[preg['caseid']==5012]['totalwgt_lb']


5515    6.0
Name: totalwgt_lb, dtype: float64

In [1]:
def ReadFemResp( dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz'):
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip')
    return df