# Chapter 1

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_rows",500)

In [71]:
df = pd.read_csv("/home/rohit/Desktop/study/ai-projects/national_survey_of_family_growth/dataset/2002FemalePregnancy.csv")
df.shape

(13593, 243)

There are 244 variables in total but only the following variables are used: 

- **caseid** is the integer ID of the respondent
- **prglngth** is the integer duration of the pregnancy in weeks.
- **outcome** is an integer code for the outcome of the pregnancy. The code 1 indicates a live birth.
- **pregordr** is a pregnancy serial number; for example, the code for a respondent’s first pregnancy is 1, for the second pregnancy is 2, and so on.
- **birthord** is a serial number for live births; the code for a respondent’s first child is 1, and so on. For outcomes other than live birth, this field is blank.
- **birthwgt_lb** and **birthwgt_oz** contain the pounds and ounces parts of the birth weight of the baby.
- **agepreg** is the mother’s age at the end of the pregnancy.
- **finalwgt** is the statistical weight associated with the respondent. It is a floating-point value that indicates the number of people in the U.S. population this respondent represents.

In addition it uses several special codes:
- 97 NOT ASCERTAINED
- 98 REFUSED
- 99 DON'T KNOW

In [72]:
# Data Cleaning

# mother's age is encoded in centiyears; convert to years
df.agepreg /= 100.0

# birthwgt_lb contains at least one bogus value (51 lbs)
# replace with NaN
df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan

# replace 'not ascertained', 'refused', 'don't know' with NaN
na_vals = [97, 98, 99]
df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)
df.hpagelb.replace(na_vals, np.nan, inplace=True)

df.babysex.replace([7, 9], np.nan, inplace=True)
df.nbrnaliv.replace([9], np.nan, inplace=True)

# birthweight is stored in two columns, lbs and oz.
# convert to a single column in lb
df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0    

# due to a bug in ReadStataDct, the last variable gets clipped;
# so for now set it to NaN
df.cmintvw = np.nan

print(df.shape)

(13593, 244)


In [77]:
df.to_csv('./dataset/2002FemalePregnancy_clean.csv', index=False)

In [74]:
'''
value label Total
1 LIVE BIRTH 9148
2 INDUCED ABORTION 1862
3 STILLBIRTH 120
4 MISCARRIAGE 1921
5 ECTOPIC PREGNANCY 190
6 CURRENT PREGNANCY 352
'''
# Validate the cleaning is done accurately
df.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

In [75]:
'''
value label Total
. INAPPLICABLE 4449
0-5 UNDER 6 POUNDS 1125
6 6 POUNDS 2223
7 7 POUNDS 3049
8 8 POUNDS 1889
9-95 9 POUNDS OR MORE 799
'''
bins = [0,5,6,7,8,9,99]
pd.cut(df.birthwgt_lb,bins).value_counts(dropna=False).sort_index()


(0.0, 5.0]     1117
(5.0, 6.0]     2223
(6.0, 7.0]     3049
(7.0, 8.0]     1889
(8.0, 9.0]      623
(9.0, 99.0]     175
NaN            4517
Name: birthwgt_lb, dtype: int64

In [81]:
from collections import defaultdict
def MakePregMap(df):
    d = defaultdict(list)
    for index, caseid in df.caseid.iteritems():
        d[caseid].append(index)
    return d

In [82]:
caseid = 10229
preg_map = MakePregMap(df)
indices = preg_map[caseid]
df.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1])

# Chapter 2