In [31]:
import pandas as pd
import numpy as np

# Subsetting the data to features of interest

## Load the full dataset

In [170]:
# Replace the path with your path to the data, or drop the csv to the data_source directory
original_full_df = pd.read_csv('data_source/nuMoM2b_Dataset_NICHD Data Challenge.csv')
original_full_df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(9289, 11717)

## Create a dataframe with only pregnancy outcome data

In [171]:
Preg_Outcomes = pd.DataFrame()
Preg_Outcomes['PublicID'] = original_full_df.PublicID
Preg_Outcomes["Stroke/Cerebrovascular_Accident"] = original_full_df.VXXB01ar_FA
Preg_Outcomes["Blood_Transfusion"] = (original_full_df.CMAE11==1)*1.0
Preg_Outcomes["Blood_Transfusion"][Preg_Outcomes.Blood_Transfusion.isna()] = original_full_df.CMAE11[original_full_df.CMAE11.isna()]
Preg_Outcomes[ "Myocardial_Infarction/Heart_Attack"] = original_full_df.VXXB01aj_FA
Preg_Outcomes["Renal_Failure"] = original_full_df.VXXB01al_FA
Preg_Outcomes["Eclampsia"] = (original_full_df.PEgHTN==1)*1.0
Preg_Outcomes["Maternal_Sepsis"] = (original_full_df.CMEA02==1)*1.0
Preg_Outcomes["GDM"] = (original_full_df.oDM==2)*1.0
Preg_Outcomes["Pre-DM"] = (original_full_df.oDM==1)*1.0
Preg_Outcomes["Pre_Eclampsia"] = ((original_full_df.PEgHTN==2) |(original_full_df.PEgHTN==3)| (original_full_df.PEgHTN==4) )*1.0
Preg_Outcomes["Hypertension"] = ((original_full_df.PEgHTN==5) |(original_full_df.PEgHTN==6))*1.0
Preg_Outcomes["Pre-term_Birth"] = (original_full_df.TYPE_CA_A09=="iPTLB")*1.0
Preg_Outcomes["Still Birth"] = (original_full_df.A09A02==2)*1.0                                 
Preg_Outcomes.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Preg_Outcomes["Blood_Transfusion"][Preg_Outcomes.Blood_Transfusion.isna()] = original_full_df.CMAE11[original_full_df.CMAE11.isna()]


(9289, 13)

## Writes it to data_source directory to use it later on

In [172]:
Preg_Outcomes.to_csv('data_source/preg_outcomes.csv',index=False)

## Create a dataframe with baseline features that we think we be useful for GDM

In [173]:
def parse_to_float(x):
    try: 
        return np.float64(x)
    except ValueError:
        return np.nan
    
BL = pd.DataFrame()

# Participant IDs
BL['PublicID'] = original_full_df.PublicID

# Age and Age category
BL["Age"] = original_full_df.Age_at_V1
BL["AgeCat"] = original_full_df.AgeCat_V1

# BMI and BMI category
BL["BMI_Cat"] = original_full_df.BMI_Cat
BL["BMI"] = original_full_df.BMI

# Self-reported Race
BL["Race"] = original_full_df.Race

# Family History (first-degree relative) of diabetes
BL["Diabetes_History"] = (original_full_df["V2AE04"].apply(parse_to_float))

# Previous diagbosis of PCOS
BL["PCOS"]= (original_full_df.VXXB01bc3_FA.apply(parse_to_float))

# Gravidity
BL["Gravidity"] = original_full_df.GravCat

# Weight
mean_weight = np.nanmean([original_full_df["WEIGHT"].apply(parse_to_float), original_full_df["V1BA01_LB"].apply(parse_to_float), (2.20462* original_full_df["V1BA01_KG"].apply(parse_to_float))], axis = 0)
BL["Weight"] = mean_weight

# Blood Pressure
BP_syst = np.nanmean([original_full_df["V1BA06a1"].apply(parse_to_float), original_full_df["V1BA06a2"].apply(parse_to_float)], axis = 0)
BP_dias = np.nanmean([original_full_df["V1BA06b1"].apply(parse_to_float), original_full_df["V1BA06b2"].apply(parse_to_float)], axis = 0)
BL["BP_Systolic"] = BP_syst
BL["BP_Diastolic"] = BP_dias

### The following features are added on 10/13/21

# Exercise (Using Visit 1 METs as measure)
Agg_METs = 1.0*(original_full_df.Visit == 1) * original_full_df.TimesPerWeek.apply(parse_to_float) * original_full_df.MinsPerTime.apply(parse_to_float)*original_full_df.METs.apply(parse_to_float)
Agg_METs_1 = 1.0*(original_full_df.Visit_1 == 1) * original_full_df.TimesPerWeek_1.apply(parse_to_float) * original_full_df.MinsPerTime_1.apply(parse_to_float)*original_full_df.METs_1.apply(parse_to_float)
Agg_METs_2 = 1.0*(original_full_df.Visit_2 == 1) * original_full_df.TimesPerWeek_2.apply(parse_to_float) * original_full_df.MinsPerTime_2.apply(parse_to_float)*original_full_df.METs_2.apply(parse_to_float)
Total_METs = Agg_METs + Agg_METs_1 + Agg_METs_2
BL["Total_METs"] = Total_METs

# Smoking Status and counts
Smoking_status = original_full_df["V1AG05"].apply(parse_to_float)# Did you smoke any tobacco products in the three months prior to this pregnancy?
Smoking_counts = original_full_df["V1AG05a"].apply(parse_to_float) # How many cigarettes did you smoke per day in the three months prior to this pregnancy? - # per day
BL['Smoking_Status'] = Smoking_status
BL['Smoking_Counts'] = Smoking_counts

# Drinking Status
Drinking_status = original_full_df["V1AG02"].apply(parse_to_float) #Did you drink any alcoholic beverages in the three months prior to this pregnancy?

BL['Drinking_Status'] = Drinking_status # Some oddities exist, need to process later

# Drugs use (Ever)
# Marijuana, Cocaine, Prescription narcotics not prescribed for you, Heroin, Methadone, Amphetamines, Inhalants, Hallucinogens
# Drugs_in_preg_any = (original_full_df.V1AG12a == 1) | (original_full_df.V1AG12b ==1) | (original_full_df.V1AG12c ==1) | (original_full_df.V1AG12d == 1) | (original_full_df.V1AG12d1 == 1) | (original_full_df.V1AG12d2 == 1) | (original_full_df.V1AG12d3 == 1) | (original_full_df.V1AG12d4 == 1) | (original_full_df.V1AG12e == 1) | (original_full_df.V1AG12f == 1) | (original_full_df.V1AG12g == 1)| (original_full_df.V1AG12h == 1) |  (original_full_df.V1AG12i == 1)

# Drug use (During pregnancy)
# Any drug use around time of pregnancy
Drugs_in_pregnancy_V2 = (original_full_df.V2AH07a ==1) | (original_full_df.V2AH07b ==1) | (original_full_df.V2AH07c ==1) | (original_full_df.V2AH07d ==1) | (original_full_df.V2AH07e==1) | (original_full_df.V2AH07f==1) | (original_full_df.V2AH07g==1)| (original_full_df.V2AH07h==1) | (original_full_df.V2AH07i==1)
BL['Drug_use_near_pregnancy'] = Drugs_in_pregnancy_V2

#Poverty
#1	> 200% of fed poverty level
#2	100-200% of fed poverty level
#3	< 100% of fed poverty level
Poverty_level = original_full_df["poverty"] # Poverty category based on income (V1AF14) and household size (V1AF13) relative to 2013 federal poverty guidelines
BL['Poverty_level'] = Poverty_level

#Education
#Education_v1	1	Less than HS grad
#Education_v1	2	HS grad or GED
#Education_v1	3	Some college
#Education_v1	4	Assoc/Tech degree
#Education_v1	5	Completed college
#Education_v1	6	Degree work beyond college
BL['Education_level'] = original_full_df["Education"]

#Nutrition
Nutrion_columns = ["AHEI_VEGS", "AHEI_FRUITS", "AHEI_WGRAINS", "AHEI_SUGBEVS", "AHEI_NUTLEGS", "AHEI_RMEATS","AHEI_TRFATPCT", "AHEI_DHAEPA", "AHEI_PUFAPCT", "AHEI_SODIUM", "AHEI_ALCDRKS", "AHEI2010"]
BL[Nutrion_columns] = original_full_df[Nutrion_columns]

BL.shape

  mean_weight = np.nanmean([original_full_df["WEIGHT"].apply(parse_to_float), original_full_df["V1BA01_LB"].apply(parse_to_float), (2.20462* original_full_df["V1BA01_KG"].apply(parse_to_float))], axis = 0)
  BP_syst = np.nanmean([original_full_df["V1BA06a1"].apply(parse_to_float), original_full_df["V1BA06a2"].apply(parse_to_float)], axis = 0)
  BP_dias = np.nanmean([original_full_df["V1BA06b1"].apply(parse_to_float), original_full_df["V1BA06b2"].apply(parse_to_float)], axis = 0)


(9289, 31)

## Writes it to data_source directory to use it later on

In [174]:
BL.to_csv("data_source/baseline_data_2021_10_13.csv",index=False)

# Inspect the two dataframe

In [175]:
og_baseline_data = pd.read_csv("data_source/baseline_data_2021_10_13.csv")
og_baseline_data.shape

(9289, 31)

In [176]:
og_outcome_data = pd.read_csv("data_source/preg_outcomes.csv")
og_outcome_data.shape

(9289, 13)

# Merge outcome with baseline data

In [177]:
big_df = pd.merge(og_baseline_data,og_outcome_data,on='PublicID')
big_df.shape


(9289, 43)

# Preprocessing - Age
Missing age are put into category 0

In [178]:
big_df.loc[big_df['AgeCat'].isna(),'AgeCat'] = 0
big_df['AgeCat'].value_counts()

2.0    8186
3.0     730
1.0     226
4.0     137
0.0      10
Name: AgeCat, dtype: int64

## Preprocessing - BMI
Missing BMI are put into category 0

In [179]:
big_df.loc[big_df['BMI_Cat'].isna(),'BMI_Cat'] = 0
big_df['BMI_Cat'].value_counts()

2.0    4585
3.0    2266
4.0    1083
5.0     933
1.0     212
0.0     210
Name: BMI_Cat, dtype: int64

# Preprocessing - Diabetes_History

Missing diabetes History - > 0

No first degree relative has diabetes -> 1

Any first degree relative has diabetes -> 2

In [180]:
big_df['Diabetes_History'].value_counts()

2.0    6708
1.0    1918
Name: Diabetes_History, dtype: int64

In [181]:
big_df['Diabetes_History_Cat'] = 0
big_df.loc[big_df['Diabetes_History'] == 2,'Diabetes_History_Cat'] = 1
big_df.loc[big_df['Diabetes_History'] == 1,'Diabetes_History_Cat'] = 2
big_df['Diabetes_History_Cat'].value_counts()

1    6708
2    1918
0     663
Name: Diabetes_History_Cat, dtype: int64

## Preprocessing - PCOS
Missing PCOS -> 0 (None)

No personal history of PCOS -> 1

Previously diagnosed with PCOS -> 2

In [182]:
big_df['PCOS_Cat'] = 0
big_df.loc[big_df['PCOS'] == 0,'PCOS_Cat'] = 1
big_df.loc[big_df['PCOS'] == 1,'PCOS_Cat'] = 1
big_df['PCOS_Cat'].value_counts()

1    9019
0     270
Name: PCOS_Cat, dtype: int64

# Preprocessing - Gravidity
Missing Gravidity are put into category 0

Gravidity of 1 is put into category 1

Gravidity of more than 1 is put into category 2

In [183]:
big_df['Gravidity_Cat'] = 0
big_df.loc[big_df['Gravidity'] == 1,'Gravidity_Cat'] = 1
big_df.loc[big_df['Gravidity'] > 1,'Gravidity_Cat'] = 2
big_df['Gravidity_Cat'].value_counts()

1    6882
2    2397
0      10
Name: Gravidity_Cat, dtype: int64

# Preprocessing - Weight
Missing weight is put into category 0
Non-missing weight is put into category 1

In [184]:
big_df['Weight_Cat'] = 1
big_df.loc[big_df['Weight'].isna(),'Weight_Cat'] = 0
big_df['Weight_Cat'].value_counts()

1    9224
0      65
Name: Weight_Cat, dtype: int64

# Preprocessing - HiBP
Using https://www.health.harvard.edu/heart-health/reading-the-new-blood-pressure-guidelines as guideline
One is considered HiBP if BP fits criteria Hypertension 1, that is

Systolic at least 130 OR Diastolic at least 80

Missing BP is category 0

Normal BP is category 1

High BP is category 2

In [185]:
big_df['High_BP_Cat'] = 1
big_df.loc[(big_df['BP_Systolic'] >= 130)|(big_df['BP_Diastolic']>=80),'High_BP_Cat'] = 2
big_df.loc[(big_df['BP_Systolic'].isna())|big_df['BP_Diastolic'].isna(),'High_BP_Cat'] = 0
big_df['High_BP_Cat'].value_counts()

1    8026
2    1061
0     202
Name: High_BP_Cat, dtype: int64

# Preprocessing - Exercise

The US federal guideline recommend at least 150 minutes per week of moderate-intensity physical exercise (or 450 METs)

Participants who have missing METs data are put in category 0

Participants who have below 450 METs are categorized as physically inactive (cateogry 1)

Participants who have above or equal 450 METs are categorized as physically active (category 2)


In [186]:
big_df['Exercise_Cat'] = 0
big_df.loc[(big_df['Total_METs'] < 450),'Exercise_Cat'] = 1
big_df.loc[(big_df['Total_METs'] >= 450),'Exercise_Cat'] = 2
big_df['Exercise_Cat'].value_counts()

0    4324
2    3391
1    1574
Name: Exercise_Cat, dtype: int64

# Preprocessing - Smoking

Participants who have missing smoking data are put in category 0 (None is put into this category)

Participants who did not smoke within 3 months of pregnancy are put in cateogry 1

Participants who did smoke within 3 months of pregnancy are put in cateogry 2

In [187]:
big_df['Smoking_Cat'] = 0
big_df.loc[(big_df['Smoking_Status'])==2,'Smoking_Cat'] = 1
big_df.loc[(big_df['Smoking_Status'])==1,'Smoking_Cat'] = 2
big_df['Smoking_Cat'].value_counts()

0    5418
1    2200
2    1671
Name: Smoking_Cat, dtype: int64

# Preprocessing - Drinking

Participants who have missing drinking data are put in category 0

Participants who did not consume alcoholic drink within 3 months of pregnancy are put in cateogry 1

Participants who did consume alcoholic drink within 3 months of pregnancy are put in cateogry 2

In [188]:
big_df['Drinking_Cat'] = 0
big_df.loc[(big_df['Drinking_Status']==1),'Drinking_Cat'] = 1
big_df.loc[(big_df['Drinking_Status']==2),'Drinking_Cat'] = 2
big_df['Drinking_Cat'].value_counts()

1    5983
2    1985
0    1321
Name: Drinking_Cat, dtype: int64

# Preprocessing - Drug Use
Was any drugs used around the time of pregnancy (1 month prior to visit 2) ?

Participants who have missing drug use data are put in category 0 (None)

Participants who did not use any drugs are put in cateogry 1

Participants who did use drugs are put in cateogry 2

In [189]:
big_df['Drug_Cat'] = 0
big_df.loc[(~big_df['Drug_use_near_pregnancy']),'Drug_Cat'] = 1
big_df.loc[(big_df['Drug_use_near_pregnancy']),'Drug_Cat'] = 2
big_df['Drug_Cat'].value_counts()

1    9187
2     102
Name: Drug_Cat, dtype: int64

# Preprocessing - Poverty Level

Participants who have missing data are put in category 0

Participants who are below federal poverty level are category 1

Participants who are above federal poverty level are cateogry 2

In [190]:
big_df['Poverty_Cat'] = 0
big_df.loc[(big_df['Poverty_level'] == 3),'Poverty_Cat'] = 1
big_df.loc[(big_df['Poverty_level'] == 1)|(big_df['Poverty_level'] == 2),'Poverty_Cat'] = 2
big_df['Poverty_Cat'].value_counts()

2    6321
0    1751
1    1217
Name: Poverty_Cat, dtype: int64

# Preprocessing - Education_level
Participants who have missing Education data are put in category 0

Participants who did not obtain any form of college degree are put in category 1

Participants who obtained any form of college degree are put in category 2

In [191]:
big_df['Education_Cat'] = 0
big_df.loc[(big_df['Education_level'] < 3),'Education_Cat'] = 1
big_df.loc[(big_df['Education_level'] >= 3),'Education_Cat'] = 2
big_df['Education_Cat'].value_counts()

2    7421
1    1850
0      18
Name: Education_Cat, dtype: int64

# Preprocessing -Outcome
Missing outcomes are out into category 0 (treated as the outcome did not happen)

In [192]:
outcome_columns = og_outcome_data.columns
outcome_columns

Index(['PublicID', 'Stroke/Cerebrovascular_Accident', 'Blood_Transfusion',
       'Myocardial_Infarction/Heart_Attack', 'Renal_Failure', 'Eclampsia',
       'Maternal_Sepsis', 'GDM', 'Pre-DM', 'Pre_Eclampsia', 'Hypertension',
       'Pre-term_Birth', 'Still Birth'],
      dtype='object')

In [193]:
big_df.loc[:,outcome_columns] = big_df.loc[:,outcome_columns].fillna(0)
big_df.shape

(9289, 54)

# Preprocessing - Exclude Pre-DM

Participants whose are already diagnosed with diabetes are excluded

In [194]:
big_df = big_df[big_df['Pre-DM'] == 0].copy()
big_df.shape

(9151, 54)

# Preprocessing Done

In [195]:
big_df.to_csv('data_source/baseline_data_2021_10_13_AR_Ready.csv',index=False)