# Load Data

In [2091]:
import pandas as pd
from datetime import datetime
from datetime import date
import numpy as np

HOME_PATH = '/Users/shaynaanderson-hill/medic_mobile/20190402-Data Scientist Interview Exercises Datasets/'

#Load Datasets
assessment_follow_up_df = pd.read_csv(
    HOME_PATH + 'assessment_follow_up.csv',
    header=0, 
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)

assessments_df = pd.read_csv(
    HOME_PATH + 'assessments.csv',
    header=0, 
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)

delivery_df = pd.read_csv(
    HOME_PATH + 'delivery.csv',
    header=0, 
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)

family_survey_df = pd.read_csv(
    HOME_PATH + 'family_survey.csv',
    header=0, 
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)

person_df = pd.read_csv(
    HOME_PATH + 'person.csv',
    header=0, 
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)

pregnancy_df = pd.read_csv(
    HOME_PATH + 'pregnancy.csv',
    header=0, 
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)

pregnancy_visit_df = pd.read_csv(
    HOME_PATH + 'pregnancy_visit.csv',
    header=0, 
    dtype={'width': int, 'column_name': str, 'variable_type': str}
)


# Rename Columns 

In [2092]:
#Add table prefixes to avoid confusion
#person_df = person_df.add_prefix('person_')
#family_survey_df = family_survey_df.add_prefix('family_survey_')
assessments_df = assessments_df.add_prefix('assessment_')
assessment_follow_up_df = assessment_follow_up_df.add_prefix('assessment_follow_up_')
pregnancy_df = pregnancy_df.add_prefix('pregnancy_')
pregnancy_visit_df = pregnancy_visit_df.add_prefix('pregnancy_visit_')
delivery_df = delivery_df.add_prefix('delivery_')


In [2093]:
%%capture
#Remove redundant prefixes

assessments_df.rename(index=str, columns={'assessment_assessment_id': 'assessment_id', 'assessment_assessment_date_time_submitted': 'assessment_date_time_submitted'}, inplace=True) 
assessment_follow_up_df.rename(index=str, columns={'assessment_follow_up_assessment_follow_up_id': 'assessment_follow_up_id', 'assessment_follow_up_assessment_follow_up_date_time_submitted': 'assessment_follow_up_date_time_submitted'}, inplace=True) 
pregnancy_df.rename(index=str, columns={'pregnancy_pregnancy_form_id': 'pregnancy_form_id'}, inplace=True)
pregnancy_visit_df.rename(index=str, columns={'pregnancy_visit_pregnancy_visit_form_id': 'pregnancy_visit_form_id'}, inplace=True) 
delivery_df.rename(index=str, columns={'delivery_delivery_id': 'delivery_id', 'delivery_delivery_date_time_submitted': 'delivery_date_time_submitted', 'delivery_facility_delivery': 'facility_delivery', 'delivery_danger_signs_at_delivery': 'danger_signs_at_delivery'}, inplace=True) 



# Converting Data Types

In [2094]:
#Date of birth

#Fix broken dates
#Assumption: date_of_birth = '1007-10-08' is supposed to be '2007-10-08'
person_df.loc[person_df.date_of_birth == '1007-10-08', 'date_of_birth'] = '2007-10-08'

#Assumption: date_of_birth = '1563-11-04' is supposed to be '1963-11-04'
person_df.loc[person_df.date_of_birth == '1563-11-04', 'date_of_birth'] = '1963-11-04'

#Assumption: date_of_birth = '1263-12-05 is supposed to be '1963-12-05'
person_df.loc[person_df.date_of_birth == '1263-12-05', 'date_of_birth'] = '1963-12-05'

#convert date strings to date objects
person_df['date_of_birth'] = pd.to_datetime(person_df['date_of_birth'])


In [2095]:
#Last menstrual period
pregnancy_df['pregnancy_last_menstrual_period_date'] = pd.to_datetime(pregnancy_df['pregnancy_last_menstrual_period_date'])



In [2096]:
#Expected due date
pregnancy_df['pregnancy_expected_due_date'] = pd.to_datetime(pregnancy_df['pregnancy_expected_due_date'])          


In [2097]:
#Pregnancy registration submission
#First remove timezone
def remove_tz(date_str):
    return str(date_str[:(date_str.find("T"))])

pregnancy_df['pregnancy_date_time_submitted'] = pregnancy_df['pregnancy_date_time_submitted'].apply(remove_tz)

pregnancy_df['pregnancy_date_time_submitted'] = pd.to_datetime(pregnancy_df['pregnancy_date_time_submitted'])


In [2098]:
#Pregnancy visit submission
pregnancy_visit_df['pregnancy_visit_date_time_submitted'] = pregnancy_visit_df['pregnancy_visit_date_time_submitted'].apply(remove_tz)

pregnancy_visit_df['pregnancy_visit_date_time_submitted'] = pd.to_datetime(pregnancy_visit_df['pregnancy_visit_date_time_submitted'])


In [2099]:
#Delivery submission
#Proxy for delivery date
delivery_df['delivery_date_time_submitted'] = pd.to_datetime(delivery_df['delivery_date_time_submitted'])        


# Parse Data

In [2100]:
#placeholder new data frame with split value columns 
new = pregnancy_df["pregnancy_risk_factors"].str.split(" ", n=6, expand = True) 

new["r1"] = ""
new["r2"] = ""
new["r3"] = ""
new["r4"] = ""
new["r5"] = ""
new["r6"] = ""

for index, row in new.iterrows():
    if row[0] == 'r1' or row[1] == 'r1' or row[2] == 'r1' or row[3] == 'r1' or row[4] == 'r1' or row[5]=='r1':
        row['r1'] = True
    else: row['r1'] = False
    if row[0] == 'r2' or row[1] == 'r2' or row[2] == 'r2' or row[3] == 'r2' or row[4] == 'r2' or row[5]=='r2':
        row['r2'] = True
    else: row['r2'] = False
    if row[0] == 'r3' or row[1] == 'r3' or row[2] == 'r3' or row[3] == 'r3' or row[4] == 'r3' or row[5]=='r3':
        row['r3'] = True
    else: row['r3'] = False
    if row[0] == 'r4' or row[1] == 'r4' or row[2] == 'r4' or row[3] == 'r4' or row[4] == 'r4' or row[5]=='r4':
        row['r4'] = True
    else: row['r4'] = False
    if row[0] == 'r5' or row[1] == 'r5' or row[2] == 'r5' or row[3] == 'r5' or row[4] == 'r5' or row[5]=='r5':
        row['r5'] = True
    else: row['r5'] = False
    if row[0] == 'r6' or row[1] == 'r6' or row[2] == 'r6' or row[3] == 'r6' or row[4] == 'r6' or row[5]=='r6':
        row['r6'] = True
    else: row['r6'] = False

# making seperate risk factor columns from new data frame 
pregnancy_df["pregnancy_risk_factor_1"]= new['r1'] 
pregnancy_df["pregnancy_risk_factor_2"]= new['r2'] 
pregnancy_df["pregnancy_risk_factor_3"]= new['r3'] 
pregnancy_df["pregnancy_risk_factor_4"]= new['r4'] 
pregnancy_df["pregnancy_risk_factor_5"]= new['r5']  
pregnancy_df["pregnancy_risk_factor_6"]= new['r6'] 
  
# Dropping old Name columns 
#pregnancy_df.drop(columns =["pregnancy_risk_factors"], inplace = True) 
  
# df display 
#pregnancy_df

In [2101]:
#placeholder new data frame with split value columns 
new = pregnancy_df["pregnancy_danger_signs"].str.split(" ", n=9, expand = True) 

new["d1"] = ""
new["d2"] = ""
new["d3"] = ""
new["d4"] = ""
new["d5"] = ""
new["d6"] = ""
new["d7"] = ""
new["d8"] = ""
new["d9"] = ""

for index, row in new.iterrows():
    if row[0] == 'd1' or row[1] == 'd1' or row[2] == 'd1' or row[3] == 'd1' or row[4] == 'd1' or row[5]=='d1':
        row['d1'] = True
    else: row['d1'] = False
    if row[0] == 'd2' or row[1] == 'd2' or row[2] == 'd2' or row[3] == 'd2' or row[4] == 'd2' or row[5]=='d2':
        row['d2'] = True
    else: row['d2'] = False
    if row[0] == 'd3' or row[1] == 'd3' or row[2] == 'd3' or row[3] == 'd3' or row[4] == 'd3' or row[5]=='d3':
        row['d3'] = True
    else: row['d3'] = False
    if row[0] == 'd4' or row[1] == 'd4' or row[2] == 'd4' or row[3] == 'd4' or row[4] == 'd4' or row[5]=='d4':
        row['d4'] = True
    else: row['d4'] = False
    if row[0] == 'd5' or row[1] == 'd5' or row[2] == 'd5' or row[3] == 'd5' or row[4] == 'd5' or row[5]=='d5':
        row['d5'] = True
    else: row['d5'] = False
    if row[0] == 'd6' or row[1] == 'd6' or row[2] == 'd6' or row[3] == 'd6' or row[4] == 'd6' or row[5]=='d6':
        row['d6'] = True
    else: row['d6'] = False
    if row[0] == 'd7' or row[1] == 'd7' or row[2] == 'd7' or row[3] == 'd7' or row[4] == 'd7' or row[5]=='d7':
        row['d7'] = True
    else: row['d7'] = False
    if row[0] == 'd8' or row[1] == 'd8' or row[2] == 'd8' or row[3] == 'd8' or row[4] == 'd8' or row[5]=='d8':
        row['d8'] = True
    else: row['d8'] = False
    if row[0] == 'd9' or row[1] == 'd9' or row[2] == 'd9' or row[3] == 'd9' or row[4] == 'd9' or row[5]=='d9':
        row['d9'] = True
    else: row['d9'] = False

# making seperate danger signs from new data frame 
pregnancy_df["pregnancy_danger_signs_1"]= new['d1'] 
pregnancy_df["pregnancy_danger_signs_2"]= new['d2'] 
pregnancy_df["pregnancy_danger_signs_3"]= new['d3'] 
pregnancy_df["pregnancy_danger_signs_4"]= new['d4'] 
pregnancy_df["pregnancy_danger_signs_5"]= new['d5']  
pregnancy_df["pregnancy_danger_signs_6"]= new['d6']  
pregnancy_df["pregnancy_danger_signs_7"]= new['d7']  
pregnancy_df["pregnancy_danger_signs_8"]= new['d8'] 
pregnancy_df["pregnancy_danger_signs_9"]= new['d9'] 

# Dropping old Name columns 
#pregnancy_df.drop(columns =["pregnancy_danger_signs"], inplace = True) 
  
# df display 
#pregnancy_df

In [2102]:
#placeholder new data frame with split value columns 
new = pregnancy_visit_df["pregnancy_visit_danger_signs"].str.split(" ", n=9, expand = True) 

new["d1"] = ""
new["d2"] = ""
new["d3"] = ""
new["d4"] = ""
new["d5"] = ""
new["d6"] = ""
new["d7"] = ""
new["d8"] = ""
new["d9"] = ""

for index, row in new.iterrows():
    if row[0] == 'd1' or row[1] == 'd1' or row[2] == 'd1' or row[3] == 'd1' or row[4] == 'd1' or row[5]=='d1':
        row['d1'] = True
    else: row['d1'] = False
    if row[0] == 'd2' or row[1] == 'd2' or row[2] == 'd2' or row[3] == 'd2' or row[4] == 'd2' or row[5]=='d2':
        row['d2'] = True
    else: row['d2'] = False
    if row[0] == 'd3' or row[1] == 'd3' or row[2] == 'd3' or row[3] == 'd3' or row[4] == 'd3' or row[5]=='d3':
        row['d3'] = True
    else: row['d3'] = False
    if row[0] == 'd4' or row[1] == 'd4' or row[2] == 'd4' or row[3] == 'd4' or row[4] == 'd4' or row[5]=='d4':
        row['d4'] = True
    else: row['d4'] = False
    if row[0] == 'd5' or row[1] == 'd5' or row[2] == 'd5' or row[3] == 'd5' or row[4] == 'd5' or row[5]=='d5':
        row['d5'] = True
    else: row['d5'] = False
    if row[0] == 'd6' or row[1] == 'd6' or row[2] == 'd6' or row[3] == 'd6' or row[4] == 'd6' or row[5]=='d6':
        row['d6'] = True
    else: row['d6'] = False
    if row[0] == 'd7' or row[1] == 'd7' or row[2] == 'd7' or row[3] == 'd7' or row[4] == 'd7' or row[5]=='d7':
        row['d7'] = True
    else: row['d7'] = False
    if row[0] == 'd8' or row[1] == 'd8' or row[2] == 'd8' or row[3] == 'd8' or row[4] == 'd8' or row[5]=='d8':
        row['d8'] = True
    else: row['d8'] = False
    if row[0] == 'd9' or row[1] == 'd9' or row[2] == 'd9' or row[3] == 'd9' or row[4] == 'd9' or row[5]=='d9':
        row['d9'] = True
    else: row['d9'] = False

# making seperate danger signs from new data frame 
pregnancy_visit_df["pregnancy_visit_danger_signs_1"]= new['d1'] 
pregnancy_visit_df["pregnancy_visit_danger_signs_2"]= new['d2'] 
pregnancy_visit_df["pregnancy_visit_danger_signs_3"]= new['d3'] 
pregnancy_visit_df["pregnancy_visit_danger_signs_4"]= new['d4'] 
pregnancy_visit_df["pregnancy_visit_danger_signs_5"]= new['d5']  
pregnancy_visit_df["pregnancy_visit_danger_signs_6"]= new['d6']  
pregnancy_visit_df["pregnancy_visit_danger_signs_7"]= new['d7']  
pregnancy_visit_df["pregnancy_visit_danger_signs_8"]= new['d8'] 
pregnancy_visit_df["pregnancy_visit_danger_signs_9"]= new['d9'] 

# Dropping old Name columns 
#pregnancy_visit_df.drop(columns =["pregnancy_visit_danger_signs"], inplace = True) 
  
# df display 
#pregnancy_visit_df

# Missing Data

In [2103]:
#person_df
person_df.describe(include = 'all')
# 12 people have missing households
# 4,440 people are missing sex data
# 1,050 are missing their date of birth


Unnamed: 0,person_id,sex,date_of_birth,hh_id
count,909991,905551,908941,909979
unique,909991,5,31345,227445
top,a60b88a232ae2fe440659af9c12ef775,female,2014-08-28 00:00:00,d4906d1f2ebe160656766b5f1ecdca98
freq,1,495600,260,75
first,,,1684-12-08 00:00:00,
last,,,2066-02-06 00:00:00,


12 people have missing households,
4,440 people are missing sex data,
1,050 are missing their date of birth

In [2104]:
#family_survey_df
family_survey_df.describe(include = 'all')

Unnamed: 0,hh_id,how_water_treated,has_mosquito_net,latrine,hand_washing_facilities,electricity,television,cupboard,dvd,radio,clock,floor,walls,roof,fuel,toilet,highest_education_achieved,wealth_quintile
count,207086,143080,207086,207086,207086,207086,207086,207086,207086,207086,207086,207086,207086,207086,207086,207086,203210,207081.0
unique,207086,4,2,2,2,2,2,2,2,2,2,3,2,2,3,2,6,
top,a3af1e697af3a7bc04ec53be12d6f5e3,aqua_tabs,yes,yes,yes,no,no,no,no,yes,no,earth_sand,dung_mud_sod,roof_other,wood,toilet_other,some_primary,
freq,1,99043,182664,192964,103649,158335,156987,104129,175019,160636,172086,119968,129404,167786,180156,195529,71878,
mean,,,,,,,,,,,,,,,,,,2.323593
std,,,,,,,,,,,,,,,,,,1.32243
min,,,,,,,,,,,,,,,,,,1.0
25%,,,,,,,,,,,,,,,,,,1.0
50%,,,,,,,,,,,,,,,,,,2.0
75%,,,,,,,,,,,,,,,,,,3.0


We have missing data for 'highest_education_achieved' and 'how_water_treated'.

In [2105]:
#pregnancy_df
pregnancy_df.describe(include = 'all')


Unnamed: 0,pregnancy_form_id,pregnancy_chw_id,pregnancy_date_time_submitted,pregnancy_patient_id,pregnancy_hh_id,pregnancy_patient_age_in_years,pregnancy_last_menstrual_period_date,pregnancy_expected_due_date,pregnancy_danger_signs,pregnancy_risk_factors,...,pregnancy_risk_factor_6,pregnancy_danger_signs_1,pregnancy_danger_signs_2,pregnancy_danger_signs_3,pregnancy_danger_signs_4,pregnancy_danger_signs_5,pregnancy_danger_signs_6,pregnancy_danger_signs_7,pregnancy_danger_signs_8,pregnancy_danger_signs_9
count,49245,49245,49245,49245,49245,46636.0,49245,49245,6129,9709,...,49245,49245,49245,49245,49245,49245,49245,49245,49245,49245
unique,49245,2121,1150,46840,45145,,1251,1249,223,63,...,2,2,2,2,2,2,2,2,2,2
top,b2462d463e7e243642c1ba22e712a0f9,ab07d474a2de074098e74e6f0aab529f,2017-03-08 00:00:00,edf62a43a415eb5ef1ffb75f43146b16,3aa3a19d166ab44e905ab1e86e068b34,,2018-02-28 00:00:00,2018-12-04 00:00:00,d1,r1,...,False,False,False,False,False,False,False,False,False,False
freq,1,244,161,10,13,,172,171,1817,4465,...,48854,46488,48778,48227,48907,47709,48757,49129,48907,49119
first,,,2009-01-01 00:00:00,,,,2008-05-02 00:00:00,2009-02-05 00:00:00,,,...,,,,,,,,,,
last,,,2019-04-01 00:00:00,,,,2019-03-19 00:00:00,2019-12-23 00:00:00,,,...,,,,,,,,,,
mean,,,,,,25.05247,,,,,...,,,,,,,,,,
std,,,,,,6.484784,,,,,...,,,,,,,,,,
min,,,,,,-1.0,,,,,...,,,,,,,,,,
25%,,,,,,20.0,,,,,...,,,,,,,,,,


We have missing data for pregnancy_patient_age_in_years. Also for pregnancy_danger_signs and pregnancy_risk factors, but these will be dropped later in place of the dummy variables.

In [2106]:
#pregnancy_visit_df
pregnancy_visit_df.describe(include = 'all')

Unnamed: 0,pregnancy_visit_form_id,pregnancy_visit_chw_id,pregnancy_visit_date_time_submitted,pregnancy_visit_patient_id,pregnancy_visit_hh_id,pregnancy_visit_how_visit_conducted,pregnancy_visit_referred_to_health_facility,pregnancy_visit_danger_signs,pregnancy_visit_danger_signs_1,pregnancy_visit_danger_signs_2,pregnancy_visit_danger_signs_3,pregnancy_visit_danger_signs_4,pregnancy_visit_danger_signs_5,pregnancy_visit_danger_signs_6,pregnancy_visit_danger_signs_7,pregnancy_visit_danger_signs_8,pregnancy_visit_danger_signs_9
count,138638,138638,138638,138638,138638,138638,138638,4536,138638,138638,138638,138638,138638,138638,138638,138638,138638
unique,138638,2082,919,39803,38510,2,2,124,2,2,2,2,2,2,2,2,2
top,0eab25a18c5e2509cce6e6d40c0eb78c,ab07d474a2de074098e74e6f0aab529f,2018-10-17 00:00:00,d3e4e26e3b579617aad151038f5857b7,abdc224a6508f1d83dce037224626a8a,in_person,False,d1,False,False,False,False,False,False,False,False,False
freq,1,950,330,28,43,120656,134102,1390,136795,138184,138099,138291,137635,138072,138232,138406,138487
first,,,2009-01-01 00:00:00,,,,,,,,,,,,,,
last,,,2019-05-10 00:00:00,,,,,,,,,,,,,,


Missing pregnancy_visit_danger_signs has been converted to dummy variables and will be dropped later. A missing value for a danger sign means no danger sign was recorded.

In [2107]:
#delivery_df
delivery_df.describe(include = 'all')


Unnamed: 0,delivery_id,delivery_chw_id,delivery_date_time_submitted,delivery_patient_id,delivery_hh_id,facility_delivery,danger_signs_at_delivery,delivery_first_visit_on_time
count,32465,32465,32465,32465,32465,32465,24110,32465
unique,32465,2039,32454,32465,31120,2,2,2
top,fc527bbec73383221e645be1233d33fb,ab07d474a2de074098e74e6f0aab529f,2018-09-07 11:28:57+00:00,21b1203b03806f3b2f11e8cb4f558004,abdc224a6508f1d83dce037224626a8a,True,False,True
freq,1,125,2,1,6,28200,23531,30635
first,,,2009-01-07 05:40:31+00:00,,,,,
last,,,2019-04-01 18:21:21+00:00,,,,,


We have missing values for danger_signs_at_delivery which is a potential outcome variable. Could it be the case that missing danger signs at delivery means there were not danger signs at delivery? Can we follow up with someone in the field? Otherwise, we need to investigate whether or not missing danger_signs_at_delivery is random. It is plausible to imagine that missing danger signs at delivery is caused by understaffing, if this is the case, then it is not independent. Removing observations with missing danger_signs_at_delivery would bias our data and reduce the accuracy of future predictions.

In [2108]:
#Investigate possible missingness function - 
#Is danger signs at delivery missingness predicted by facility_delivery?
missing_delivery_danger = delivery_df[delivery_df['danger_signs_at_delivery'].isnull()]

#Initial investigation of missingness mechanism
missing_delivery_danger[missing_delivery_danger['facility_delivery'] == False].count()
#missing delivery danger signs does not seem to be a function of whether or not the delivery happened at a healthcare facilityf


delivery_id                     1104
delivery_chw_id                 1104
delivery_date_time_submitted    1104
delivery_patient_id             1104
delivery_hh_id                  1104
facility_delivery               1104
danger_signs_at_delivery           0
delivery_first_visit_on_time    1104
dtype: int64

Missing delivery danger signs does not seem to be a function of whether or not the delivery happened at a healthcare facility.


# Handle Missing Data

I am of the camp that missing data is rarely missing at random. For this reason, to either impute the mean/median or drop missing observations, introduces bias. Missingness is informative and thus my strategy for handling missing data is to create missing flags. For categorical data, this means adding a category "missing" and for numerical data this means, imputing a '0' or other value and creating a missing flag.

In [2109]:
family_survey_df['how_water_treated'].fillna('missing', inplace=True)

In [2110]:
family_survey_df['highest_education_achieved'].fillna('missing', inplace=True)

In [2111]:
pregnancy_df['pregnancy_patient_age_in_years'].unique()

array([16., 27., 32., 28., 25., 23., 22., 31., 26., 33., 17., 20., 30.,
       38., 19., 18., 36., 21., 24., 29., 34., 37., 35., 42., 44., 13.,
       39., 40., 14., 41., 11., 48.,  0., 15., 49., 47., 10., 45., nan,
       43., 46., 12., 50., 63., -1., 51.,  3., 52.])

In [2112]:
pregnancy_df[pregnancy_df['pregnancy_patient_age_in_years']== -1]

Unnamed: 0,pregnancy_form_id,pregnancy_chw_id,pregnancy_date_time_submitted,pregnancy_patient_id,pregnancy_hh_id,pregnancy_patient_age_in_years,pregnancy_last_menstrual_period_date,pregnancy_expected_due_date,pregnancy_danger_signs,pregnancy_risk_factors,...,pregnancy_risk_factor_6,pregnancy_danger_signs_1,pregnancy_danger_signs_2,pregnancy_danger_signs_3,pregnancy_danger_signs_4,pregnancy_danger_signs_5,pregnancy_danger_signs_6,pregnancy_danger_signs_7,pregnancy_danger_signs_8,pregnancy_danger_signs_9
23375,a513486f0aae056a371ad76a8980054f,e5189d6f0d372c3a977c1657352b1e75,2017-12-12,85d8066eb38549e78610e037b8477ee5,340fcc00a79c5effcafaa0e0f7c2e833,-1.0,2017-12-01,2018-09-06,,,...,False,False,False,False,False,False,False,False,False,False


In my opinion, a patient age of '-1' is a missing value.

In [2113]:
pregnancy_df.loc[pregnancy_df.pregnancy_patient_id == '85d8066eb38549e78610e037b8477ee5', 'pregnancy_patient_age_in_years'] = np.nan

In [2114]:
pregnancy_df['pregnancy_patient_age_in_years'].unique()

array([16., 27., 32., 28., 25., 23., 22., 31., 26., 33., 17., 20., 30.,
       38., 19., 18., 36., 21., 24., 29., 34., 37., 35., 42., 44., 13.,
       39., 40., 14., 41., 11., 48.,  0., 15., 49., 47., 10., 45., nan,
       43., 46., 12., 50., 63., 51.,  3., 52.])

In [2115]:
#Impute average patient_age_in_years for missing data
#pregnancy_df['pregnancy_patient_age_in_years'].fillna((pregnancy_df['pregnancy_patient_age_in_years'].mean()), inplace=True)

#Impute zeros for missing data
pregnancy_df['pregnancy_patient_age_in_years'].fillna(0, inplace=True)



In [2116]:
#Create dummy missingness variable
pregnancy_df['age_missing'] = pregnancy_df['pregnancy_patient_age_in_years'].isnull()

pregnancy_df.head()


Unnamed: 0,pregnancy_form_id,pregnancy_chw_id,pregnancy_date_time_submitted,pregnancy_patient_id,pregnancy_hh_id,pregnancy_patient_age_in_years,pregnancy_last_menstrual_period_date,pregnancy_expected_due_date,pregnancy_danger_signs,pregnancy_risk_factors,...,pregnancy_danger_signs_1,pregnancy_danger_signs_2,pregnancy_danger_signs_3,pregnancy_danger_signs_4,pregnancy_danger_signs_5,pregnancy_danger_signs_6,pregnancy_danger_signs_7,pregnancy_danger_signs_8,pregnancy_danger_signs_9,age_missing
0,fb4419f66aec81237d8f87c4ff0054a3,7f2d25199eef813addb4d9f783b3e075,2018-02-06,97a5f299f3254a998a3bca0ad11a154d,11d7a4f38fafa7954ce0752a1343e168,16.0,2017-08-07,2018-05-13,,,...,False,False,False,False,False,False,False,False,False,False
1,60cee26693654bcdee37558c3efe79f8,82b888a09920a91a193b60fa68605243,2018-02-07,9769f2af62e82eb5bc661520a03739e5,2d3975a66c2fd5ec4653d151c025ba17,27.0,2017-07-29,2018-05-04,,,...,False,False,False,False,False,False,False,False,False,False
2,49776d9ba5b39fd209880c29cf30f10b,44d0a969dea0b117cf8eee9c671da1e1,2018-02-07,669a826b66e0615d3d5be61ea3c789f1,8085e6eee84c56353fcb12a324e4f0ec,32.0,2017-12-21,2018-09-26,,,...,False,False,False,False,False,False,False,False,False,False
3,73fbadf1c54c15e4d733e80899bd5a60,0296ee306ec54e31cdd661859e4a32ef,2018-02-07,34b484a85a48ed5441c5b09de6867306,0f01fd8cf6d363411725243842b8315d,28.0,2017-08-08,2018-05-14,,,...,False,False,False,False,False,False,False,False,False,False
4,46f237f3890d2e9e37cb6b0e7fb0edc3,a3abfd66e17a0e3a1303dcb3bfb937d1,2018-02-07,c697f1476734d3b8782c6288a2aaf478,d4d73db5620f13202fdb7edd0c5c21c9,25.0,2017-12-08,2018-09-13,,,...,False,False,False,False,False,False,False,False,False,False


In [2117]:
#Check
pregnancy_df.describe(include = 'all')


Unnamed: 0,pregnancy_form_id,pregnancy_chw_id,pregnancy_date_time_submitted,pregnancy_patient_id,pregnancy_hh_id,pregnancy_patient_age_in_years,pregnancy_last_menstrual_period_date,pregnancy_expected_due_date,pregnancy_danger_signs,pregnancy_risk_factors,...,pregnancy_danger_signs_1,pregnancy_danger_signs_2,pregnancy_danger_signs_3,pregnancy_danger_signs_4,pregnancy_danger_signs_5,pregnancy_danger_signs_6,pregnancy_danger_signs_7,pregnancy_danger_signs_8,pregnancy_danger_signs_9,age_missing
count,49245,49245,49245,49245,49245,49245.0,49245,49245,6129,9709,...,49245,49245,49245,49245,49245,49245,49245,49245,49245,49245
unique,49245,2121,1150,46840,45145,,1251,1249,223,63,...,2,2,2,2,2,2,2,2,2,1
top,b2462d463e7e243642c1ba22e712a0f9,ab07d474a2de074098e74e6f0aab529f,2017-03-08 00:00:00,edf62a43a415eb5ef1ffb75f43146b16,3aa3a19d166ab44e905ab1e86e068b34,,2018-02-28 00:00:00,2018-12-04 00:00:00,d1,r1,...,False,False,False,False,False,False,False,False,False,False
freq,1,244,161,10,13,,172,171,1817,4465,...,46488,48778,48227,48907,47709,48757,49129,48907,49119,49245
first,,,2009-01-01 00:00:00,,,,2008-05-02 00:00:00,2009-02-05 00:00:00,,,...,,,,,,,,,,
last,,,2019-04-01 00:00:00,,,,2019-03-19 00:00:00,2019-12-23 00:00:00,,,...,,,,,,,,,,
mean,,,,,,23.725211,,,,,...,,,,,,,,,,
std,,,,,,8.444765,,,,,...,,,,,,,,,,
min,,,,,,0.0,,,,,...,,,,,,,,,,
25%,,,,,,20.0,,,,,...,,,,,,,,,,


# Explore Data

In [None]:
#demographics

household_size = person_df.groupby('hh_id')['person_id'].nunique().reset_index(name="household_size")
household_size.max()
household_size.mean()
household_size.median()
hist = household_size.hist(bins=70)
hist


In [None]:
print('We have ' + str(person_df['person_id'].nunique()) + 'people in our dataset')

In [None]:
print('We have ' + str(person_df['hh_id'].nunique()) + 'households in our dataset')

In [None]:
#family survey

print('We have ' + str(family_survey_df['hh_id'].nunique()) + 'household survey data')


In [None]:
family_survey_df[family_survey_df['has_mosquito_net'] == 'yes'].count()


In [None]:
family_survey_df[family_survey_df['hand_washing_facilities'] == 'yes'].count()


In [None]:
family_survey_df[family_survey_df['latrine'] == 'yes'].count()


In [None]:
family_survey_df[family_survey_df['electricity'] == 'yes'].count()

In [None]:
family_survey_df[family_survey_df['television'] == 'yes'].count()


In [None]:
#Count registered pregnancies

print('There are ' + str(pregnancy_df['pregnancy_form_id'].nunique()) + ' registered pregnancies in our dataset')


In [None]:
# Count pregnant women

print('There are ' + str(pregnancy_df['pregnancy_patient_id'].nunique()) + ' women with registered pregnancies in our dataset')



Since there is no pregnancy_id, there is a bit of difficulty involved in joining the pregnancy registration data 
to the delivery data. We can use the patient_id, but since one patient can have multiple pregnancies and multiple
deliveries, we do not know for sure that a given pregnancy registration corresponds to a given delivery.

There are ways to help mediate this, by trying to link a pregnancy registration with a delivery by using a timeframe
around the approximated due date, but this is not foolproof


In [None]:
#To see how much this could affect our analysis, let's see for how many women, have multiple deliveries
print(str(delivery_df.groupby('delivery_patient_id')['delivery_id'].nunique().max()) + ' max deliveries per patient')

#For pregnancy registrations?
print(str(pregnancy_df.groupby('pregnancy_patient_id')['pregnancy_form_id'].nunique().max()) + ' max pregnancy registrations per pateient')



Interesting... there is only one delivery per patient, but a patient can have 10 pregnancy registrations


In [None]:
pregnancy_counts = pregnancy_df.groupby('pregnancy_patient_id')['pregnancy_form_id'].nunique().reset_index(name="count")

hist = pregnancy_counts.hist(bins=9)
hist



44675 of the 46840 patients only have one pregnancy (95%)
 

One time-saving option is to limit our analysis to only those patients who have a single pregnancy registration, 
but this may introduce bias. This also doesn't guarantee the that the delivery outcome is the one associated with 
the pregnancy registration.


In [None]:
#Count pregnancy visit patients

pregnancy_visit_df['pregnancy_visit_patient_id'].nunique()


In [None]:
#Count pregnancy visits

pregnancy_visit_df['pregnancy_visit_form_id'].nunique()


In [None]:
#Create pregnancy reg pregnancy visit df
pregnancy_reg_pregnancy_visit_df = pd.merge(pregnancy_df, pregnancy_visit_df,  how='inner', left_on=['pregnancy_patient_id','pregnancy_hh_id'], right_on = ['pregnancy_visit_patient_id','pregnancy_visit_hh_id'])

pregnancy_reg_pregnancy_visit_df.head()

mutual_patients = pregnancy_reg_pregnancy_visit_df[pregnancy_reg_pregnancy_visit_df['pregnancy_patient_id'] == pregnancy_reg_pregnancy_visit_df['pregnancy_patient_id']]
mutual_patients['pregnancy_patient_id'].nunique()
#mutual_patients['pregnancy_visit_patient_id'].nunique()

#

In [None]:
#Again, without a pregnancy_id, we are unable to reliably match pregnancy visits to a pregnancy or a delivery, 
#if the patient has more than one pregnancy
pregnancy_visit_counts = pregnancy_visit_df.groupby('pregnancy_visit_patient_id')['pregnancy_visit_form_id'].nunique().reset_index(name="visit_count")

print('Maximum number of pregnancy visits for a single patient is ' + str(pregnancy_visit_counts['visit_count'].max()))
print('The average number of pregnancy visits for a single patient is ' + str(pregnancy_visit_counts['visit_count'].mean()))
print('The median number of pregnancy visits for a single patient is ' + str(pregnancy_visit_counts['visit_count'].median()))
hist = pregnancy_visit_counts.hist(bins=28)
hist



In [None]:
#How many patients had 3 or more pregnancy visits
more_visits = pd.DataFrame(pregnancy_visit_counts[pregnancy_visit_counts['visit_count'] >= 3])

more_visits['pregnancy_visit_patient_id'].nunique()


In [None]:
#Deliveries

delivery_df['delivery_patient_id'].nunique()
delivery_df['delivery_id'].nunique()


In [None]:
#How many patients have delivery data who do not have registered pregnancy data

missing_reg = set(delivery_df['delivery_patient_id']) - set(pregnancy_df['pregnancy_patient_id'])
len(missing_reg)


In [None]:
#How many patients had their delivery at a health facility

facility_deliveries = delivery_df[delivery_df['facility_delivery']==True]
facility_deliveries['delivery_patient_id'].nunique()


In [None]:
#Ho many patients had danger signs at delivery?

delivery_danger = delivery_df[delivery_df['danger_signs_at_delivery']==True]
delivery_danger['delivery_patient_id'].nunique()


In [None]:
#How many patients had an on-time postnatal visit?

postnatal = delivery_df[delivery_df['delivery_first_visit_on_time']==True]
postnatal['delivery_patient_id'].nunique()


In [None]:
#Of the patients who had danger signs at delivery, how many did not have an on-time postnatal visit?
risky_late = set(delivery_danger['delivery_patient_id']) - set(postnatal['delivery_patient_id'])
len(risky_late)


In [None]:
#workforce

delivery_df['delivery_chw_id'].nunique() #2039
pregnancy_visit_df['pregnancy_visit_chw_id'].nunique() #2082

#How many chws only do pregnancy visits and not deliveries?
chw_preg_visit_only = set(pregnancy_visit_df['pregnancy_visit_chw_id'])- set(delivery_df['delivery_chw_id'])
len(chw_preg_only)


In [None]:
#How many patients with a pregnancy registration do not have a delivery outcome?
print('There are ' + str(len(set(set(pregnancy_df['pregnancy_patient_id']) - set(delivery_df['delivery_patient_id'])))) + ' patients with a pregnancy registration who do not have a delivery outcome.')

#Do not have a pregnancy registration?
print('There are ' + str(len(set(set(delivery_df['delivery_patient_id']) - set(pregnancy_df['pregnancy_patient_id'])))) + ' patients with a delivery outcome who do not have a pregnancy registration.')


# Check for multicollinearity

In [None]:
#Is it true that all pregnancy visits with danger signs are referred to a health facility?

danger_visit =  pregnancy_visit_df[pregnancy_visit_df['pregnancy_visit_danger_signs'].notnull()]
danger_visit['pregnancy_visit_form_id'].nunique() #4536

not_referred = danger_visit[danger_visit['pregnancy_visit_referred_to_health_facility']==False]


Every pregnancy visit that has any danger signs is automatically referred to a health facility. This means we can't include both pregnancy_visit_danger_signs and pregnancy_visit_referred_to_health_facility in our model.

# Delivery Response Data

For reference: Sample size rule of thumb (Peduzzi)
Let p be the smallest of the proportions of negative or positive cases in the population and 
k the number of covariates (the number of independent variables), then the minimum number of cases to include is:
N = 10 k / p


In [None]:
#Delivery Outcomes
#danger_signs_at_delivery
print(delivery_df.groupby('danger_signs_at_delivery').size())
#24110

In [None]:
#facility_delivery
print(delivery_df.groupby('facility_delivery').size())

In [None]:
#first_visit_on_time
print(delivery_df.groupby('delivery_first_visit_on_time').size())

# Aggregate Data

In [1840]:
#Our population will be those patients who had a pregnancy registration, delivery danger signs data, family survey, and household

In [1841]:
#subset delivery data

has_delivery_data = delivery_df[delivery_df['danger_signs_at_delivery'].notnull()]


In [1842]:
#Join pregnancy and delivery data

pregnancy_delivery_df = pd.merge(pregnancy_df, has_delivery_data, how='inner', left_on=['pregnancy_patient_id','pregnancy_hh_id'], right_on = ['delivery_patient_id','delivery_hh_id'])



In [1843]:
pregnancy_delivery_df['delivery_patient_id'].nunique()


22155

In [1844]:
pregnancy_delivery_df.head()


Unnamed: 0,pregnancy_form_id,pregnancy_chw_id,pregnancy_date_time_submitted,pregnancy_patient_id,pregnancy_hh_id,pregnancy_patient_age_in_years,pregnancy_last_menstrual_period_date,pregnancy_expected_due_date,pregnancy_danger_signs,pregnancy_risk_factors,...,pregnancy_danger_signs_8,pregnancy_danger_signs_9,delivery_id,delivery_chw_id,delivery_date_time_submitted,delivery_patient_id,delivery_hh_id,facility_delivery,danger_signs_at_delivery,delivery_first_visit_on_time
0,fb4419f66aec81237d8f87c4ff0054a3,7f2d25199eef813addb4d9f783b3e075,2018-02-06,97a5f299f3254a998a3bca0ad11a154d,11d7a4f38fafa7954ce0752a1343e168,16.0,2017-08-07,2018-05-13,,,...,False,False,f5562a92c91af04ba249ea11cca6f735,7f2d25199eef813addb4d9f783b3e075,2018-03-03 08:06:41+00:00,97a5f299f3254a998a3bca0ad11a154d,11d7a4f38fafa7954ce0752a1343e168,True,False,True
1,60cee26693654bcdee37558c3efe79f8,82b888a09920a91a193b60fa68605243,2018-02-07,9769f2af62e82eb5bc661520a03739e5,2d3975a66c2fd5ec4653d151c025ba17,27.0,2017-07-29,2018-05-04,,,...,False,False,8d3616481a1ef21f4f6ab6a20c602a5d,82b888a09920a91a193b60fa68605243,2018-04-27 03:43:39+00:00,9769f2af62e82eb5bc661520a03739e5,2d3975a66c2fd5ec4653d151c025ba17,True,False,True
2,73fbadf1c54c15e4d733e80899bd5a60,0296ee306ec54e31cdd661859e4a32ef,2018-02-07,34b484a85a48ed5441c5b09de6867306,0f01fd8cf6d363411725243842b8315d,28.0,2017-08-08,2018-05-14,,,...,False,False,6c2d0a4fc1224364215906d3a815cb43,0296ee306ec54e31cdd661859e4a32ef,2018-07-18 14:57:11+00:00,34b484a85a48ed5441c5b09de6867306,0f01fd8cf6d363411725243842b8315d,True,False,True
3,46f237f3890d2e9e37cb6b0e7fb0edc3,a3abfd66e17a0e3a1303dcb3bfb937d1,2018-02-07,c697f1476734d3b8782c6288a2aaf478,d4d73db5620f13202fdb7edd0c5c21c9,25.0,2017-12-08,2018-09-13,,,...,False,False,09006aac17fd54a94965895938dc5d19,a3abfd66e17a0e3a1303dcb3bfb937d1,2018-08-17 13:49:58+00:00,c697f1476734d3b8782c6288a2aaf478,d4d73db5620f13202fdb7edd0c5c21c9,True,False,True
4,339e3fe85597e9f3af8910b410d096a4,8073c889d7526c6482820636bb0efb9e,2018-02-01,53414d387e87b6d4fe6c96e56f920488,978bba69e9454c353431f73ae69b787b,23.0,2017-08-02,2018-05-08,,r1,...,False,False,35d9d3459a2bc34de632d979d7b4cc9f,8073c889d7526c6482820636bb0efb9e,2018-05-06 07:05:54+00:00,53414d387e87b6d4fe6c96e56f920488,978bba69e9454c353431f73ae69b787b,True,False,True


In [1845]:
%%capture

#drop redundant joining columns

pregnancy_delivery_df.drop(columns=['delivery_patient_id', 'delivery_hh_id'])


In [1846]:
#preview joined data
pregnancy_delivery_df.head()


Unnamed: 0,pregnancy_form_id,pregnancy_chw_id,pregnancy_date_time_submitted,pregnancy_patient_id,pregnancy_hh_id,pregnancy_patient_age_in_years,pregnancy_last_menstrual_period_date,pregnancy_expected_due_date,pregnancy_danger_signs,pregnancy_risk_factors,...,pregnancy_danger_signs_8,pregnancy_danger_signs_9,delivery_id,delivery_chw_id,delivery_date_time_submitted,delivery_patient_id,delivery_hh_id,facility_delivery,danger_signs_at_delivery,delivery_first_visit_on_time
0,fb4419f66aec81237d8f87c4ff0054a3,7f2d25199eef813addb4d9f783b3e075,2018-02-06,97a5f299f3254a998a3bca0ad11a154d,11d7a4f38fafa7954ce0752a1343e168,16.0,2017-08-07,2018-05-13,,,...,False,False,f5562a92c91af04ba249ea11cca6f735,7f2d25199eef813addb4d9f783b3e075,2018-03-03 08:06:41+00:00,97a5f299f3254a998a3bca0ad11a154d,11d7a4f38fafa7954ce0752a1343e168,True,False,True
1,60cee26693654bcdee37558c3efe79f8,82b888a09920a91a193b60fa68605243,2018-02-07,9769f2af62e82eb5bc661520a03739e5,2d3975a66c2fd5ec4653d151c025ba17,27.0,2017-07-29,2018-05-04,,,...,False,False,8d3616481a1ef21f4f6ab6a20c602a5d,82b888a09920a91a193b60fa68605243,2018-04-27 03:43:39+00:00,9769f2af62e82eb5bc661520a03739e5,2d3975a66c2fd5ec4653d151c025ba17,True,False,True
2,73fbadf1c54c15e4d733e80899bd5a60,0296ee306ec54e31cdd661859e4a32ef,2018-02-07,34b484a85a48ed5441c5b09de6867306,0f01fd8cf6d363411725243842b8315d,28.0,2017-08-08,2018-05-14,,,...,False,False,6c2d0a4fc1224364215906d3a815cb43,0296ee306ec54e31cdd661859e4a32ef,2018-07-18 14:57:11+00:00,34b484a85a48ed5441c5b09de6867306,0f01fd8cf6d363411725243842b8315d,True,False,True
3,46f237f3890d2e9e37cb6b0e7fb0edc3,a3abfd66e17a0e3a1303dcb3bfb937d1,2018-02-07,c697f1476734d3b8782c6288a2aaf478,d4d73db5620f13202fdb7edd0c5c21c9,25.0,2017-12-08,2018-09-13,,,...,False,False,09006aac17fd54a94965895938dc5d19,a3abfd66e17a0e3a1303dcb3bfb937d1,2018-08-17 13:49:58+00:00,c697f1476734d3b8782c6288a2aaf478,d4d73db5620f13202fdb7edd0c5c21c9,True,False,True
4,339e3fe85597e9f3af8910b410d096a4,8073c889d7526c6482820636bb0efb9e,2018-02-01,53414d387e87b6d4fe6c96e56f920488,978bba69e9454c353431f73ae69b787b,23.0,2017-08-02,2018-05-08,,r1,...,False,False,35d9d3459a2bc34de632d979d7b4cc9f,8073c889d7526c6482820636bb0efb9e,2018-05-06 07:05:54+00:00,53414d387e87b6d4fe6c96e56f920488,978bba69e9454c353431f73ae69b787b,True,False,True


In [1847]:
family_survey_df.head()

Unnamed: 0,hh_id,how_water_treated,has_mosquito_net,latrine,hand_washing_facilities,electricity,television,cupboard,dvd,radio,clock,floor,walls,roof,fuel,toilet,highest_education_achieved,wealth_quintile
0,2c6f0fcbee6b97a4f1c2d5c22b977cc5,aqua_tabs,yes,yes,no,yes,no,no,no,yes,no,earth_sand,dung_mud_sod,roof_other,wood,toilet_other,some_primary,5.0
1,7bdf3f8e664c900770105bcb7c7c3186,aqua_tabs,yes,yes,yes,no,yes,no,no,yes,no,cement,dung_mud_sod,roof_other,wood,toilet_other,primary,5.0
2,5d69c689be3f3f70023ebf87f64185b4,missing,yes,yes,yes,no,no,no,yes,yes,no,earth_sand,dung_mud_sod,thatch_grass_ makuti,wood,toilet_other,none,1.0
3,f6ded15b6595c631dbf08557f526f4fd,missing,yes,yes,no,no,no,yes,no,yes,no,earth_sand,dung_mud_sod,roof_other,wood,toilet_other,some_secondary,2.0
4,563abcf37859abdfc1c387ae57433cc7,aqua_tabs,yes,yes,yes,yes,no,yes,no,no,yes,earth_sand,dung_mud_sod,roof_other,wood,toilet_other,secondary,1.0


In [1848]:
#join family survey data

pregnancy_delivery_survey_df = pd.merge(pregnancy_delivery_df, family_survey_df,  how='inner', left_on=['pregnancy_hh_id'], right_on = ['hh_id'])
#pregnancy_delivery_survey_df.drop(columns=['pregnancy_hh_id'])






# Flatten Pregnancy Visit Data For Joining

In [None]:
pregnancy_visit_df[pregnancy_visit_df['danger_signs_1']==True].count()
#1.3% of visits have danger signs_1

pregnancy_visit_df[pregnancy_visit_df['danger_signs_2']==True].count()
#0.3% of visits have danger_signs_2

pregnancy_visit_df[(pregnancy_visit_df['danger_signs_1']==True) | (pregnancy_visit_df['danger_signs_2']==True) | (pregnancy_visit_df['danger_signs_3']==True) | (pregnancy_visit_df['danger_signs_4']==True) | (pregnancy_visit_df['danger_signs_5']==True) | (pregnancy_visit_df['danger_signs_6']==True) | (pregnancy_visit_df['danger_signs_7']==True) | (pregnancy_visit_df['danger_signs_8']==True) | (pregnancy_visit_df['danger_signs_9']==True)].count()

##Only 3.3% of pregnancy visits have danger signs

In [1849]:
#We want to flatten pregnancy_visit_df to include patient_id, visit_count, ever_referral_follow_up_needed

In [1850]:
#recall that if danger signs are observed, a referral_follow_up_needed is automatic

In [1851]:
pregnancy_visit_counts.head()

Unnamed: 0,pregnancy_visit_patient_id,visit_count
0,0000c86de88724accadc38f7dcfb807b,3
1,00015953a5b912803c46c51b13e69665,2
2,0002ad25a615c21162597bc41e8452e7,5
3,00038238757b71fb4ace66053bea76fe,4
4,00044eaa9e3357b4f19662b6a7210a25,1


In [1852]:
pregnancy_visit_df.columns

Index(['pregnancy_visit_form_id', 'pregnancy_visit_chw_id',
       'pregnancy_visit_date_time_submitted', 'pregnancy_visit_patient_id',
       'pregnancy_visit_hh_id', 'pregnancy_visit_how_visit_conducted',
       'pregnancy_visit_referred_to_health_facility',
       'pregnancy_visit_danger_signs', 'pregnancy_visit_danger_signs_1',
       'pregnancy_visit_danger_signs_2', 'pregnancy_visit_danger_signs_3',
       'pregnancy_visit_danger_signs_4', 'pregnancy_visit_danger_signs_5',
       'pregnancy_visit_danger_signs_6', 'pregnancy_visit_danger_signs_7',
       'pregnancy_visit_danger_signs_8', 'pregnancy_visit_danger_signs_9'],
      dtype='object')

In [1853]:
#create ever_referral variable
pregnancy_referral_needed = pregnancy_visit_df.groupby(['pregnancy_visit_patient_id'], sort=False)['pregnancy_visit_referred_to_health_facility'].max().reset_index(name='ever_referral')



In [1854]:
#join flattened visit data
pregnancy_visit_flat = pd.merge(pregnancy_referral_needed, pregnancy_visit_counts, how='inner', left_on=['pregnancy_visit_patient_id'], right_on=['pregnancy_visit_patient_id'])

pregnancy_visit_flat.head()


Unnamed: 0,pregnancy_visit_patient_id,ever_referral,visit_count
0,43247b91bca109e83ff8e119daaab050,True,5
1,9a228db10e0baae37c379d98d5d47bf8,False,1
2,cc23df15b0ab238c3bccaab1513e8975,False,3
3,8593b086f11aa58304b2d4b73ee5b66a,False,1
4,e706638fd80d002bba309f979d4e0388,False,1


In [1855]:
pregnancy_delivery_survey_df.columns

Index(['pregnancy_form_id', 'pregnancy_chw_id',
       'pregnancy_date_time_submitted', 'pregnancy_patient_id',
       'pregnancy_hh_id', 'pregnancy_patient_age_in_years',
       'pregnancy_last_menstrual_period_date', 'pregnancy_expected_due_date',
       'pregnancy_danger_signs', 'pregnancy_risk_factors',
       'pregnancy_risk_factor_1', 'pregnancy_risk_factor_2',
       'pregnancy_risk_factor_3', 'pregnancy_risk_factor_4',
       'pregnancy_risk_factor_5', 'pregnancy_risk_factor_6',
       'pregnancy_danger_signs_1', 'pregnancy_danger_signs_2',
       'pregnancy_danger_signs_3', 'pregnancy_danger_signs_4',
       'pregnancy_danger_signs_5', 'pregnancy_danger_signs_6',
       'pregnancy_danger_signs_7', 'pregnancy_danger_signs_8',
       'pregnancy_danger_signs_9', 'delivery_id', 'delivery_chw_id',
       'delivery_date_time_submitted', 'delivery_patient_id', 'delivery_hh_id',
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'hh_id', 'ho

In [1856]:
# join in flattened visit data

pregnancy_delivery_survey_visit_df = pd.merge(pregnancy_delivery_survey_df, pregnancy_visit_flat, how='left', left_on=['pregnancy_patient_id'], right_on=['pregnancy_visit_patient_id'])

pregnancy_delivery_survey_visit_df.head()


Unnamed: 0,pregnancy_form_id,pregnancy_chw_id,pregnancy_date_time_submitted,pregnancy_patient_id,pregnancy_hh_id,pregnancy_patient_age_in_years,pregnancy_last_menstrual_period_date,pregnancy_expected_due_date,pregnancy_danger_signs,pregnancy_risk_factors,...,floor,walls,roof,fuel,toilet,highest_education_achieved,wealth_quintile,pregnancy_visit_patient_id,ever_referral,visit_count
0,fb4419f66aec81237d8f87c4ff0054a3,7f2d25199eef813addb4d9f783b3e075,2018-02-06,97a5f299f3254a998a3bca0ad11a154d,11d7a4f38fafa7954ce0752a1343e168,16.0,2017-08-07,2018-05-13,,,...,earth_sand,dung_mud_sod,roof_other,fuel_other,toilet_other,primary,1.0,97a5f299f3254a998a3bca0ad11a154d,False,1.0
1,60cee26693654bcdee37558c3efe79f8,82b888a09920a91a193b60fa68605243,2018-02-07,9769f2af62e82eb5bc661520a03739e5,2d3975a66c2fd5ec4653d151c025ba17,27.0,2017-07-29,2018-05-04,,,...,earth_sand,dung_mud_sod,roof_other,wood,toilet_other,after_secondary,1.0,,,
2,73fbadf1c54c15e4d733e80899bd5a60,0296ee306ec54e31cdd661859e4a32ef,2018-02-07,34b484a85a48ed5441c5b09de6867306,0f01fd8cf6d363411725243842b8315d,28.0,2017-08-08,2018-05-14,,,...,earth_sand,walls_other,roof_other,wood,toilet_other,primary,3.0,34b484a85a48ed5441c5b09de6867306,False,3.0
3,46f237f3890d2e9e37cb6b0e7fb0edc3,a3abfd66e17a0e3a1303dcb3bfb937d1,2018-02-07,c697f1476734d3b8782c6288a2aaf478,d4d73db5620f13202fdb7edd0c5c21c9,25.0,2017-12-08,2018-09-13,,,...,cement,walls_other,thatch_grass_ makuti,wood,toilet_other,primary,2.0,c697f1476734d3b8782c6288a2aaf478,False,4.0
4,339e3fe85597e9f3af8910b410d096a4,8073c889d7526c6482820636bb0efb9e,2018-02-01,53414d387e87b6d4fe6c96e56f920488,978bba69e9454c353431f73ae69b787b,23.0,2017-08-02,2018-05-08,,r1,...,cement,walls_other,roof_other,wood,toilet_other,secondary,3.0,53414d387e87b6d4fe6c96e56f920488,False,6.0


In [1857]:
#drop pregnancy visit patient id
pregnancy_delivery_survey_visit_df.drop(columns=['pregnancy_visit_patient_id'])

#coerce implied missing values from left join i.e. if there was no visit, there was no referral, if there was no visit,
#visit number is zero

pregnancy_delivery_survey_visit_df['ever_referral'].fillna(False, inplace=True)

pregnancy_delivery_survey_visit_df['visit_count'].fillna(0, inplace=True)


In [1867]:
pregnancy_delivery_survey_visit_df.columns

Index(['pregnancy_form_id', 'pregnancy_chw_id',
       'pregnancy_date_time_submitted', 'pregnancy_patient_id',
       'pregnancy_hh_id', 'pregnancy_patient_age_in_years',
       'pregnancy_last_menstrual_period_date', 'pregnancy_expected_due_date',
       'pregnancy_danger_signs', 'pregnancy_risk_factors',
       'pregnancy_risk_factor_1', 'pregnancy_risk_factor_2',
       'pregnancy_risk_factor_3', 'pregnancy_risk_factor_4',
       'pregnancy_risk_factor_5', 'pregnancy_risk_factor_6',
       'pregnancy_danger_signs_1', 'pregnancy_danger_signs_2',
       'pregnancy_danger_signs_3', 'pregnancy_danger_signs_4',
       'pregnancy_danger_signs_5', 'pregnancy_danger_signs_6',
       'pregnancy_danger_signs_7', 'pregnancy_danger_signs_8',
       'pregnancy_danger_signs_9', 'delivery_id', 'delivery_chw_id',
       'delivery_date_time_submitted', 'delivery_patient_id', 'delivery_hh_id',
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'hh_id', 'ho

# Add Household Size

In [1886]:
household_size.head()

Unnamed: 0,hh_id,count
0,000011214306406a544f879972ccbf07,2
1,000021464dcef94410982c36ad471369,2
2,000132dc6adcd10a7d24e11a1018cead,6
3,0001727631d6a3d1dd8e41989fcb03e3,8
4,000191355990014ac7ca8cd6aa92185f,1


In [1891]:
pregnancy_delivery_survey_visit_household_df = pd.merge(pregnancy_delivery_survey_visit_df, household_count, how='left', left_on=['pregnancy_hh_id'], right_on = ['hh_id'])
pregnancy_delivery_survey_visit_household_df['household_count'] = pregnancy_delivery_survey_visit_household_df['person_id']
pregnancy_delivery_survey_visit_household_df.drop(columns =["person_id"], inplace = True)

pregnancy_delivery_survey_visit_household_df.columns


Index(['pregnancy_form_id', 'pregnancy_chw_id',
       'pregnancy_date_time_submitted', 'pregnancy_patient_id',
       'pregnancy_hh_id', 'pregnancy_patient_age_in_years',
       'pregnancy_last_menstrual_period_date', 'pregnancy_expected_due_date',
       'pregnancy_danger_signs', 'pregnancy_risk_factors',
       'pregnancy_risk_factor_1', 'pregnancy_risk_factor_2',
       'pregnancy_risk_factor_3', 'pregnancy_risk_factor_4',
       'pregnancy_risk_factor_5', 'pregnancy_risk_factor_6',
       'pregnancy_danger_signs_1', 'pregnancy_danger_signs_2',
       'pregnancy_danger_signs_3', 'pregnancy_danger_signs_4',
       'pregnancy_danger_signs_5', 'pregnancy_danger_signs_6',
       'pregnancy_danger_signs_7', 'pregnancy_danger_signs_8',
       'pregnancy_danger_signs_9', 'delivery_id', 'delivery_chw_id',
       'delivery_date_time_submitted', 'delivery_patient_id', 'delivery_hh_id',
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'hh_id', 'ho

In [1893]:
pregnancy_delivery_survey_visit_df = pregnancy_delivery_survey_visit_household_df

# Convert Data Types

In [1866]:
# Create days_pregnant at time of registration column

pregnancy_delivery_survey_visit_df['days_pregnant'] = pregnancy_delivery_survey_visit_df['pregnancy_date_time_submitted'] - pregnancy_delivery_survey_visit_household_df['pregnancy_last_menstrual_period_date']

#pregnancy_delivery_survey_visit_df['days_pregnant']


In [1894]:
pregnancy_delivery_survey_visit_df.columns

Index(['pregnancy_form_id', 'pregnancy_chw_id',
       'pregnancy_date_time_submitted', 'pregnancy_patient_id',
       'pregnancy_hh_id', 'pregnancy_patient_age_in_years',
       'pregnancy_last_menstrual_period_date', 'pregnancy_expected_due_date',
       'pregnancy_danger_signs', 'pregnancy_risk_factors',
       'pregnancy_risk_factor_1', 'pregnancy_risk_factor_2',
       'pregnancy_risk_factor_3', 'pregnancy_risk_factor_4',
       'pregnancy_risk_factor_5', 'pregnancy_risk_factor_6',
       'pregnancy_danger_signs_1', 'pregnancy_danger_signs_2',
       'pregnancy_danger_signs_3', 'pregnancy_danger_signs_4',
       'pregnancy_danger_signs_5', 'pregnancy_danger_signs_6',
       'pregnancy_danger_signs_7', 'pregnancy_danger_signs_8',
       'pregnancy_danger_signs_9', 'delivery_id', 'delivery_chw_id',
       'delivery_date_time_submitted', 'delivery_patient_id', 'delivery_hh_id',
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'hh_id', 'ho

In [1895]:
pregnancy_delivery_survey_visit_df_subset = pregnancy_delivery_survey_visit_df[['pregnancy_patient_age_in_years',
       'pregnancy_risk_factor_1', 'pregnancy_risk_factor_2',
       'pregnancy_risk_factor_3', 'pregnancy_risk_factor_4',
       'pregnancy_risk_factor_5', 'pregnancy_risk_factor_6',
       'pregnancy_danger_signs_1', 'pregnancy_danger_signs_2',
       'pregnancy_danger_signs_3', 'pregnancy_danger_signs_4',
       'pregnancy_danger_signs_5', 'pregnancy_danger_signs_6',
       'pregnancy_danger_signs_7', 'pregnancy_danger_signs_8',
       'pregnancy_danger_signs_9', 
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'how_water_treated',
       'has_mosquito_net', 'latrine', 'hand_washing_facilities', 'electricity',
       'television', 'cupboard', 'dvd', 'radio', 'clock', 'floor', 'walls',
       'roof', 'fuel', 'toilet', 'highest_education_achieved',
       'wealth_quintile', 'ever_referral',
       'visit_count', 'days_pregnant', 'household_count']]

In [1896]:
#To prevent the encoding of atrificial information that exists when we transform categorical variables to 
#integer data (i.e. an order that does not exists naturally in the data), we will implement one-hot encoding

In [1898]:
pregnancy_delivery_survey_visit_df['how_water_treated']

0          missing
1        aqua_tabs
2        aqua_tabs
3          missing
4             boil
5          missing
6        aqua_tabs
7             boil
8          missing
9           filter
10       aqua_tabs
11       aqua_tabs
12         missing
13         missing
14         missing
15       aqua_tabs
16            boil
17          filter
18       aqua_tabs
19            boil
20          filter
21          filter
22            boil
23         missing
24         missing
25         missing
26         missing
27         missing
28       aqua_tabs
29         missing
           ...    
22438    aqua_tabs
22439    aqua_tabs
22440       filter
22441      missing
22442      missing
22443      missing
22444    aqua_tabs
22445    aqua_tabs
22446         boil
22447      missing
22448    aqua_tabs
22449    aqua_tabs
22450      missing
22451    aqua_tabs
22452       filter
22453    aqua_tabs
22454    aqua_tabs
22455    aqua_tabs
22456    aqua_tabs
22457      missing
22458      missing
22459       

In [1899]:
categorical_cols = [
 'how_water_treated',
 'has_mosquito_net',
 'latrine',
 'hand_washing_facilities',
 'electricity',
 'television',
 'cupboard',
 'dvd',
 'radio',
 'clock',
 'floor',
 'walls',
 'roof',
 'fuel',
 'toilet',
    'pregnancy_risk_factor_1', 'pregnancy_risk_factor_2',
       'pregnancy_risk_factor_3', 'pregnancy_risk_factor_4',
       'pregnancy_risk_factor_5', 'pregnancy_risk_factor_6',
       'pregnancy_danger_signs_1', 'pregnancy_danger_signs_2',
       'pregnancy_danger_signs_3', 'pregnancy_danger_signs_4',
       'pregnancy_danger_signs_5', 'pregnancy_danger_signs_6',
       'pregnancy_danger_signs_7', 'pregnancy_danger_signs_8',
       'pregnancy_danger_signs_9', 'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time',
 'highest_education_achieved', 'ever_referral'
]


In [1900]:
pregnancy_delivery_survey_visit_df_subset[categorical_cols].head()

Unnamed: 0,how_water_treated,has_mosquito_net,latrine,hand_washing_facilities,electricity,television,cupboard,dvd,radio,clock,...,pregnancy_danger_signs_5,pregnancy_danger_signs_6,pregnancy_danger_signs_7,pregnancy_danger_signs_8,pregnancy_danger_signs_9,facility_delivery,danger_signs_at_delivery,delivery_first_visit_on_time,highest_education_achieved,ever_referral
0,missing,yes,yes,yes,no,yes,no,no,yes,no,...,False,False,False,False,False,True,False,True,primary,False
1,aqua_tabs,no,yes,yes,no,yes,yes,yes,yes,no,...,False,False,False,False,False,True,False,True,after_secondary,False
2,aqua_tabs,yes,yes,yes,no,yes,no,no,yes,no,...,False,False,False,False,False,True,False,True,primary,False
3,missing,yes,yes,yes,no,yes,no,no,yes,no,...,False,False,False,False,False,True,False,True,primary,False
4,boil,no,yes,yes,yes,no,yes,no,yes,no,...,False,False,False,False,False,True,False,True,secondary,False


In [1901]:
#First we will will use the LabelEncoder to convert categorical text data to categorical numerical data

# import labelencoder
from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
labelencoder = LabelEncoder()

# apply le on categorical feature columns
pregnancy_delivery_survey_visit_df_subset[categorical_cols] = pregnancy_delivery_survey_visit_df_subset[categorical_cols].apply(lambda col: labelencoder.fit_transform(col))
pregnancy_delivery_survey_visit_df_subset[categorical_cols].head(10)


Unnamed: 0,how_water_treated,has_mosquito_net,latrine,hand_washing_facilities,electricity,television,cupboard,dvd,radio,clock,...,pregnancy_danger_signs_5,pregnancy_danger_signs_6,pregnancy_danger_signs_7,pregnancy_danger_signs_8,pregnancy_danger_signs_9,facility_delivery,danger_signs_at_delivery,delivery_first_visit_on_time,highest_education_achieved,ever_referral
0,3,1,1,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,3,0
1,0,0,1,1,0,1,1,1,1,0,...,0,0,0,0,0,1,0,1,0,0
2,0,1,1,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,3,0
3,3,1,1,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,3,0
4,1,0,1,1,1,0,1,0,1,0,...,0,0,0,0,0,1,0,1,4,0
5,3,1,1,1,0,0,0,0,1,0,...,0,0,0,1,1,1,0,1,5,0
6,0,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,1,5,0
7,1,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,6,0
8,3,1,1,1,0,0,1,0,1,0,...,0,0,0,0,0,1,0,1,5,0
9,2,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,1,6,0


In [1902]:
#Next we will implement one-hot encoding for those variables with more than two categories

In [1876]:
print(pregnancy_delivery_survey_visit_df_subset['has_mosquito_net'].unique())
print(pregnancy_delivery_survey_visit_df_subset['latrine'].unique())
print(pregnancy_delivery_survey_visit_df_subset['hand_washing_facilities'].unique())
print(pregnancy_delivery_survey_visit_df_subset['electricity'].unique())
print(pregnancy_delivery_survey_visit_df_subset['television'].unique())
print(pregnancy_delivery_survey_visit_df_subset['cupboard'].unique())
print(pregnancy_delivery_survey_visit_df_subset['dvd'].unique())
print(pregnancy_delivery_survey_visit_df_subset['radio'].unique())
print(pregnancy_delivery_survey_visit_df_subset['clock'].unique())
print('floor:' + str(pregnancy_delivery_survey_visit_df_subset['floor'].unique()))
print(pregnancy_delivery_survey_visit_df_subset['walls'].unique())
print(pregnancy_delivery_survey_visit_df_subset['roof'].unique())
print('fuel:' + str(pregnancy_delivery_survey_visit_df_subset['fuel'].unique()))
print(pregnancy_delivery_survey_visit_df_subset['toilet'].unique())


[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[0 1]
[0 1]
[1 0]
[0 1]
floor:[1 0 2]
[0 1]
[0 1]
fuel:[0 2 1]
[1 0]


In [1884]:
family_survey_df.columns

Index(['hh_id', 'how_water_treated', 'has_mosquito_net', 'latrine',
       'hand_washing_facilities', 'electricity', 'television', 'cupboard',
       'dvd', 'radio', 'clock', 'floor', 'walls', 'roof', 'fuel', 'toilet',
       'highest_education_achieved', 'wealth_quintile'],
      dtype='object')

In [1907]:
#Create dummy vars 

#floor
df_floor = pd.get_dummies(pregnancy_delivery_survey_visit_df_subset['floor'], drop_first=True, prefix='floor')
df_floor

df_subset_floor = pd.concat([pregnancy_delivery_survey_visit_df_subset, df_floor], axis=1)
df_subset_floor

#fuel
df_fuel = pd.get_dummies(pregnancy_delivery_survey_visit_df_subset['fuel'], drop_first=True, prefix='fuel')
df_fuel

df_subset_floor_fuel = pd.concat([df_subset_floor, df_fuel], axis=1)
df_subset_floor_fuel

#education
df_edu = pd.get_dummies(pregnancy_delivery_survey_visit_df_subset['highest_education_achieved'], drop_first=True, prefix='education')
df_edu

df_subset_floor_fuel_edu = pd.concat([df_subset_floor_fuel, df_edu], axis=1)
df_subset_floor_fuel_edu

#water treatment
df_wat = pd.get_dummies(pregnancy_delivery_survey_visit_df_subset['how_water_treated'], drop_first=True, prefix='water')
df_wat

df_subset_floor_fuel_edu_wat = pd.concat([df_subset_floor_fuel_edu, df_wat], axis=1)
df_subset_floor_fuel_edu_wat


df_subset_floor_fuel_edu_wat.drop(columns=['floor', 'fuel', 'highest_education_achieved', 'how_water_treated'], inplace=True)
df_subset_floor_fuel_edu_wat.columns

Index(['pregnancy_patient_age_in_years', 'pregnancy_risk_factor_1',
       'pregnancy_risk_factor_2', 'pregnancy_risk_factor_3',
       'pregnancy_risk_factor_4', 'pregnancy_risk_factor_5',
       'pregnancy_risk_factor_6', 'pregnancy_danger_signs_1',
       'pregnancy_danger_signs_2', 'pregnancy_danger_signs_3',
       'pregnancy_danger_signs_4', 'pregnancy_danger_signs_5',
       'pregnancy_danger_signs_6', 'pregnancy_danger_signs_7',
       'pregnancy_danger_signs_8', 'pregnancy_danger_signs_9',
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'has_mosquito_net', 'latrine',
       'hand_washing_facilities', 'electricity', 'television', 'cupboard',
       'dvd', 'radio', 'clock', 'walls', 'roof', 'toilet', 'wealth_quintile',
       'ever_referral', 'visit_count', 'days_pregnant', 'household_count',
       'floor_1', 'floor_2', 'fuel_1', 'fuel_2', 'education_1', 'education_2',
       'education_3', 'education_4', 'education_5', 'education_

In [1929]:

cols = list(df_subset_floor_fuel_edu_wat.columns)

#Convert columns to numeric
df_subset_floor_fuel_edu_wat.loc[:, cols] = df_subset_floor_fuel_edu_wat.loc[:, cols].apply(pd.to_numeric)



0        15811200000000000
1        16675200000000000
2        15811200000000000
3         5270400000000000
4        15811200000000000
5        15033600000000000
6        16070400000000000
7        19353600000000000
8        23760000000000000
9        18576000000000000
10       21081600000000000
11       20304000000000000
12       15206400000000000
13       20649600000000000
14       16761600000000000
15       19267200000000000
16       22723200000000000
17        5529600000000000
18       10972800000000000
19       16070400000000000
20          86400000000000
21       13824000000000000
22       16761600000000000
23        7776000000000000
24       13219200000000000
25        7862400000000000
26        8121600000000000
27       18489600000000000
28        9244800000000000
29       24451200000000000
               ...        
22438    23414400000000000
22439     2160000000000000
22440    21081600000000000
22441    24105600000000000
22442    24451200000000000
22443    21081600000000000
2

# Build Models

In [1930]:
#separate response variables from #explanatory variables

df_subset_floor_fuel_edu_wat.columns

Index(['pregnancy_patient_age_in_years', 'pregnancy_risk_factor_1',
       'pregnancy_risk_factor_2', 'pregnancy_risk_factor_3',
       'pregnancy_risk_factor_4', 'pregnancy_risk_factor_5',
       'pregnancy_risk_factor_6', 'pregnancy_danger_signs_1',
       'pregnancy_danger_signs_2', 'pregnancy_danger_signs_3',
       'pregnancy_danger_signs_4', 'pregnancy_danger_signs_5',
       'pregnancy_danger_signs_6', 'pregnancy_danger_signs_7',
       'pregnancy_danger_signs_8', 'pregnancy_danger_signs_9',
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'has_mosquito_net', 'latrine',
       'hand_washing_facilities', 'electricity', 'television', 'cupboard',
       'dvd', 'radio', 'clock', 'walls', 'roof', 'toilet', 'wealth_quintile',
       'ever_referral', 'visit_count', 'days_pregnant', 'household_count',
       'floor_1', 'floor_2', 'fuel_1', 'fuel_2', 'education_1', 'education_2',
       'education_3', 'education_4', 'education_5', 'education_

# Kitchen Sink KNN Model

In [1931]:
X = df_subset_floor_fuel_edu_wat[['pregnancy_patient_age_in_years', 'pregnancy_risk_factor_1',
       'pregnancy_risk_factor_2', 'pregnancy_risk_factor_3',
       'pregnancy_risk_factor_4', 'pregnancy_risk_factor_5',
       'pregnancy_risk_factor_6', 'pregnancy_danger_signs_1',
       'pregnancy_danger_signs_2', 'pregnancy_danger_signs_3',
       'pregnancy_danger_signs_4', 'pregnancy_danger_signs_5',
       'pregnancy_danger_signs_6', 'pregnancy_danger_signs_7',
       'pregnancy_danger_signs_8', 'pregnancy_danger_signs_9','has_mosquito_net', 'latrine',
       'hand_washing_facilities', 'electricity', 'television', 'cupboard',
       'dvd', 'radio', 'clock', 'walls', 'roof', 'toilet', 'wealth_quintile',
       'ever_referral', 'visit_count', 'days_pregnant', 'household_count',
       'floor_1', 'floor_2', 'fuel_1', 'fuel_2', 'education_1', 'education_2',
       'education_3', 'education_4', 'education_5', 'education_6', 'water_1',
       'water_2', 'water_3']]


Unnamed: 0,pregnancy_patient_age_in_years,pregnancy_risk_factor_1,pregnancy_risk_factor_2,pregnancy_risk_factor_3,pregnancy_risk_factor_4,pregnancy_risk_factor_5,pregnancy_risk_factor_6,pregnancy_danger_signs_1,pregnancy_danger_signs_2,pregnancy_danger_signs_3,...,fuel_2,education_1,education_2,education_3,education_4,education_5,education_6,water_1,water_2,water_3
0,16.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,27.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,28.0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,25.0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
4,23.0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
5,27.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
6,25.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
7,31.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,1,0,0
8,26.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
9,33.0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [1932]:
y = df_subset_floor_fuel_edu_wat[['danger_signs_at_delivery']]

In [1933]:
#Split the data into training and test sets

##Separate the target variable from the rest of the data
#def split_target(data, target_name):
 #   target = data[[target_name]]
 #   data.drop(target_name, axis=1, inplace=True)
 #   return (data, target)

#X, y = split_target(pregnancy_delivery_survey_df_subset, 'facility_delivery') #y holds response variable, X holds our dataset



In [1934]:
#Split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234 #for reproducibility
)

In [1935]:
len(X)

22468

In [1936]:
len(X_train)

16851

In [1937]:
X.columns

Index(['pregnancy_patient_age_in_years', 'pregnancy_risk_factor_1',
       'pregnancy_risk_factor_2', 'pregnancy_risk_factor_3',
       'pregnancy_risk_factor_4', 'pregnancy_risk_factor_5',
       'pregnancy_risk_factor_6', 'pregnancy_danger_signs_1',
       'pregnancy_danger_signs_2', 'pregnancy_danger_signs_3',
       'pregnancy_danger_signs_4', 'pregnancy_danger_signs_5',
       'pregnancy_danger_signs_6', 'pregnancy_danger_signs_7',
       'pregnancy_danger_signs_8', 'pregnancy_danger_signs_9',
       'has_mosquito_net', 'latrine', 'hand_washing_facilities', 'electricity',
       'television', 'cupboard', 'dvd', 'radio', 'clock', 'walls', 'roof',
       'toilet', 'wealth_quintile', 'ever_referral', 'visit_count',
       'days_pregnant', 'household_count', 'floor_1', 'floor_2', 'fuel_1',
       'fuel_2', 'education_1', 'education_2', 'education_3', 'education_4',
       'education_5', 'education_6', 'water_1', 'water_2', 'water_3'],
      dtype='object')

In [1938]:
len(y_train)

16851

In [1939]:
len(X_test)

5617

In [1940]:
len(y_test)

5617

In [1941]:
y_train.head()

Unnamed: 0,danger_signs_at_delivery
13199,0
19944,0
11452,1
1332,0
7826,0


In [1942]:
#Check the number of positive and negative responses in the response variable 
print(y_train.groupby('danger_signs_at_delivery').size())

danger_signs_at_delivery
0    16442
1      409
dtype: int64


In [1943]:
#NumPy array conversion for scikitlearn

#First, we save the final column names, which will assist us when we assess variable importance later on:

X_train_cols = X_train.columns
X_test_cols = X_test.columns

#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
X_train = X_train.values
X_test = X_test.values

In [1944]:
#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
y_train = y_train.values
y_test = y_test.values

In [1958]:
# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train) 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
#predict response
pred = knn.predict(X_test)

# evaluate accuracy: fraction of samples correctly predicted
#not the best for rare events
from sklearn.metrics import accuracy_score
print('accuracy_score is :' + str(accuracy_score(y_test, pred)))

from sklearn.metrics import confusion_matrix
print('confusion matrix: ' + str(confusion_matrix(y_test, pred)))

#the fraction of positives events that you predicted correctly
from sklearn.metrics import recall_score
print('recall_score is ' + str(recall_score(y_test, pred)))

# the fraction of predicted positives events that are actually positive 
from sklearn.metrics import precision_score
print('precision_score is ' + str(precision_score(y_test, pred)))

#The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
from sklearn.metrics import f1_score
print('f1_score is ' + str(f1_score(y_test, pred)))
#accuracy_score(df.actual_label.values, df.predicted_RF.values)


  after removing the cwd from sys.path.


accuracy_score is :0.9740074773010504
confusion matrix: [[5471    0]
 [ 146    0]]
recall_score is 0.0
precision_score is 0.0
f1_score is 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Simple KNN model

In [1980]:
#Let's try a simpler KNN model and just look at the number of visits and referral
X_simple = df_subset_floor_fuel_edu_wat[[
       'ever_referral', 'visit_count']]

#Split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_simple, y, test_size=0.25, random_state=1234 #for reproducibility
)

#Check the number of positive and negative responses in the response variable 
print(y_train.groupby('danger_signs_at_delivery').size())

danger_signs_at_delivery
0    16442
1      409
dtype: int64


In [1960]:
#NumPy array conversion for scikitlearn

#First, we save the final column names, which will assist us when we assess variable importance later on:

#X_train_cols = X_train.columns
#X_test_cols = X_test.columns

#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
#X_train = X_train.values
#X_test = X_test.values

#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
#y_train = y_train.values
#y_test = y_test.values

In [1981]:
# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train) 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
#predict response
pred = knn.predict(X_test)

# evaluate accuracy: fraction of samples correctly predicted
#not the best for rare events
from sklearn.metrics import accuracy_score
print('accuracy_score is :' + str(accuracy_score(y_test, pred)))

from sklearn.metrics import confusion_matrix
print('confusion matrix: ' + str(confusion_matrix(y_test, pred)))

#the fraction of positives events that you predicted correctly
from sklearn.metrics import recall_score
print('recall_score is ' + str(recall_score(y_test, pred)))

# the fraction of predicted positives events that are actually positive 
from sklearn.metrics import precision_score
print('precision_score is ' + str(precision_score(y_test, pred)))

#The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
from sklearn.metrics import f1_score
print('f1_score is ' + str(f1_score(y_test, pred)))
#accuracy_score(df.actual_label.values, df.predicted_RF.values)


  after removing the cwd from sys.path.


accuracy_score is :0.9740074773010504
confusion matrix: [[5471    0]
 [ 146    0]]
recall_score is 0.0
precision_score is 0.0
f1_score is 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
#still failing

# Logistic Regression: Kitchen Sink

In [1976]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234 #for reproducibility
)

#Check the number of positive and negative responses in the response variable 
print(y_train.groupby('danger_signs_at_delivery').size())

danger_signs_at_delivery
0    16442
1      409
dtype: int64


In [1977]:
type(y_train)

pandas.core.frame.DataFrame

In [1994]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train.values.ravel())

y_pred = model.predict(X_test)

In [1995]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix

array([[5471,    0],
       [ 146,    0]])

In [1996]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_simple, y, test_size=0.25, random_state=1234 #for reproducibility
)

#Check the number of positive and negative responses in the response variable 
print(y_train.groupby('danger_signs_at_delivery').size())

danger_signs_at_delivery
0    16442
1      409
dtype: int64


In [1999]:
X_train.head()

Unnamed: 0,ever_referral,visit_count
13199,0,4.0
19944,0,1.0
11452,0,2.0
1332,0,5.0
7826,0,6.0


In [2000]:
y_train.head()

Unnamed: 0,danger_signs_at_delivery
13199,0
19944,0
11452,1
1332,0
7826,0


In [2001]:
X_test.head()

Unnamed: 0,ever_referral,visit_count
5157,0,10.0
9502,0,6.0
12139,0,1.0
13026,0,1.0
12571,0,1.0


In [2002]:
y_test.head()

Unnamed: 0,danger_signs_at_delivery
5157,0
9502,0
12139,0
13026,0
12571,0


In [1997]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train.values.ravel())

y_pred = model.predict(X_test)

In [1998]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix

array([[5471,    0],
       [ 146,    0]])

# Try Other Outcome Variables

In [2004]:
df_subset_floor_fuel_edu_wat.columns

Index(['pregnancy_patient_age_in_years', 'pregnancy_risk_factor_1',
       'pregnancy_risk_factor_2', 'pregnancy_risk_factor_3',
       'pregnancy_risk_factor_4', 'pregnancy_risk_factor_5',
       'pregnancy_risk_factor_6', 'pregnancy_danger_signs_1',
       'pregnancy_danger_signs_2', 'pregnancy_danger_signs_3',
       'pregnancy_danger_signs_4', 'pregnancy_danger_signs_5',
       'pregnancy_danger_signs_6', 'pregnancy_danger_signs_7',
       'pregnancy_danger_signs_8', 'pregnancy_danger_signs_9',
       'facility_delivery', 'danger_signs_at_delivery',
       'delivery_first_visit_on_time', 'has_mosquito_net', 'latrine',
       'hand_washing_facilities', 'electricity', 'television', 'cupboard',
       'dvd', 'radio', 'clock', 'walls', 'roof', 'toilet', 'wealth_quintile',
       'ever_referral', 'visit_count', 'days_pregnant', 'household_count',
       'floor_1', 'floor_2', 'fuel_1', 'fuel_2', 'education_1', 'education_2',
       'education_3', 'education_4', 'education_5', 'education_

In [2005]:
y = df_subset_floor_fuel_edu_wat[['facility_delivery']]

# Simple

In [2025]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_simple, y, test_size=0.25, random_state=1234 #for reproducibility
)

#Check the number of positive and negative responses in the response variable 
print(y_train.groupby('facility_delivery').size())

facility_delivery
0     2227
1    14624
dtype: int64


In [2026]:
import statsmodels.api as sm

logit = sm.Logit(y_train, X_train)

# fit the model
result = logit.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.480003
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:      facility_delivery   No. Observations:                16851
Model:                          Logit   Df Residuals:                    16849
Method:                           MLE   Df Model:                            1
Date:                Tue, 16 Apr 2019   Pseudo R-squ.:                 -0.2293
Time:                        13:12:19   Log-Likelihood:                -8088.5
converged:                       True   LL-Null:                       -6579.8
                                        LLR p-value:                     1.000
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
ever_referral     0.0687      0.080      0.858      0.391      -0.088       0.226
visit_count       0.

In [2027]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train.values.ravel())

y_pred = model.predict(X_test)

In [2028]:
# evaluate accuracy: fraction of samples correctly predicted
#not the best for rare events
from sklearn.metrics import accuracy_score
print('accuracy_score is :' + str(accuracy_score(y_test, y_pred)))

from sklearn.metrics import confusion_matrix
print('confusion matrix: ' + str(confusion_matrix(y_test, y_pred)))

#the fraction of positives events that you predicted correctly
from sklearn.metrics import recall_score
print('recall_score is ' + str(recall_score(y_test, y_pred)))

# the fraction of predicted positives events that are actually positive 
from sklearn.metrics import precision_score
print('precision_score is ' + str(precision_score(y_test, y_pred)))

#The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
from sklearn.metrics import f1_score
print('f1_score is ' + str(f1_score(y_test, y_pred)))
#accuracy_score(df.actual_label.values, df.predicted_RF.values)

accuracy_score is :0.8629161474096493
confusion matrix: [[   0  770]
 [   0 4847]]
recall_score is 1.0
precision_score is 0.8629161474096493
f1_score is 0.926414373088685


# Next

In [2019]:
X_medium = df_subset_floor_fuel_edu_wat[['pregnancy_patient_age_in_years', 'electricity','wealth_quintile',
       'ever_referral', 'visit_count', 'household_count',
       ]]

In [2020]:
X_train, X_test, y_train, y_test = train_test_split(
    X_medium, y, test_size=0.25, random_state=1234 #for reproducibility
)


In [2021]:
logit = sm.Logit(y_train, X_train)

# fit the model
result = logit.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.402626
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:      facility_delivery   No. Observations:                16851
Model:                          Logit   Df Residuals:                    16845
Method:                           MLE   Df Model:                            5
Date:                Tue, 16 Apr 2019   Pseudo R-squ.:                -0.03113
Time:                        13:09:07   Log-Likelihood:                -6784.6
converged:                       True   LL-Null:                       -6579.8
                                        LLR p-value:                     1.000
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
pregnancy_patient_age_in_years     0.0440      0.002     20.596     

In [2023]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train.values.ravel())

y_pred = model.predict(X_test)

In [2024]:
# evaluate accuracy: fraction of samples correctly predicted
#not the best for rare events
from sklearn.metrics import accuracy_score
print('accuracy_score is :' + str(accuracy_score(y_test, y_pred)))

from sklearn.metrics import confusion_matrix
print('confusion matrix: ' + str(confusion_matrix(y_test, y_pred)))

#the fraction of positives events that you predicted correctly
from sklearn.metrics import recall_score
print('recall_score is ' + str(recall_score(y_test, y_pred)))

# the fraction of predicted positives events that are actually positive 
from sklearn.metrics import precision_score
print('precision_score is ' + str(precision_score(y_test, y_pred)))

#The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
from sklearn.metrics import f1_score
print('f1_score is ' + str(f1_score(y_test, y_pred)))
#accuracy_score(df.actual_label.values, df.predicted_RF.values)

accuracy_score is :0.8629161474096493
confusion matrix: [[   0  770]
 [   0 4847]]
recall_score is 1.0
precision_score is 0.8629161474096493
f1_score is 0.926414373088685


In [2007]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train.values.ravel())

y_pred = model.predict(X_test)

In [2008]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix

array([[   0,  770],
       [   0, 4847]])

In [2009]:
# evaluate accuracy: fraction of samples correctly predicted
#not the best for rare events
from sklearn.metrics import accuracy_score
print('accuracy_score is :' + str(accuracy_score(y_test, y_pred)))

from sklearn.metrics import confusion_matrix
print('confusion matrix: ' + str(confusion_matrix(y_test, y_pred)))

#the fraction of positives events that you predicted correctly
from sklearn.metrics import recall_score
print('recall_score is ' + str(recall_score(y_test, y_pred)))

# the fraction of predicted positives events that are actually positive 
from sklearn.metrics import precision_score
print('precision_score is ' + str(precision_score(y_test, y_pred)))

#The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
from sklearn.metrics import f1_score
print('f1_score is ' + str(f1_score(y_test, y_pred)))
#accuracy_score(df.actual_label.values, df.predicted_RF.values)

accuracy_score is :0.8629161474096493
confusion matrix: [[   0  770]
 [   0 4847]]
recall_score is 1.0
precision_score is 0.8629161474096493
f1_score is 0.926414373088685


# Kitchen Sink

In [2010]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234 #for reproducibility
)

#Check the number of positive and negative responses in the response variable 
print(y_train.groupby('facility_delivery').size())

facility_delivery
0     2227
1    14624
dtype: int64


In [2016]:
import statsmodels.api as sm

logit = sm.Logit(y_train, X_train)

# fit the model
result = logit.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.391609
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:      facility_delivery   No. Observations:                16851
Model:                          Logit   Df Residuals:                    16850
Method:                           MLE   Df Model:                            0
Date:                Tue, 16 Apr 2019   Pseudo R-squ.:               -0.002920
Time:                        13:02:38   Log-Likelihood:                -6599.0
converged:                       True   LL-Null:                       -6579.8
                                        LLR p-value:                       nan
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
pregnancy_patient_age_in_years     0.0084      0.003      2.770     

In [2013]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train.values.ravel())

y_pred = model.predict(X_test)

AttributeError: 'DataFrame' object has no attribute 'ravel'

In [2012]:
# evaluate accuracy: fraction of samples correctly predicted
#not the best for rare events
from sklearn.metrics import accuracy_score
print('accuracy_score is :' + str(accuracy_score(y_test, y_pred)))

from sklearn.metrics import confusion_matrix
print('confusion matrix: ' + str(confusion_matrix(y_test, y_pred)))

#the fraction of positives events that you predicted correctly
from sklearn.metrics import recall_score
print('recall_score is ' + str(recall_score(y_test, y_pred)))

# the fraction of predicted positives events that are actually positive 
from sklearn.metrics import precision_score
print('precision_score is ' + str(precision_score(y_test, y_pred)))

#The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
from sklearn.metrics import f1_score
print('f1_score is ' + str(f1_score(y_test, y_pred)))
#accuracy_score(df.actual_label.values, df.predicted_RF.values)

accuracy_score is :0.13708385259035072
confusion matrix: [[ 770    0]
 [4847    0]]
recall_score is 0.0
precision_score is 0.0
f1_score is 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [1973]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

clfs = [LogisticRegression()]

for clf in clfs:
    clf.fit(X_train, y_train.ravel())
    print(type(clf))
    print('Training accuracy: ' + str(clf.score(X_train, y_train)))
#    print('Validation accuracy: ' + str(clf.score(X_test, y_test)))
    
    coefs = {
        'column': [X_train_cols[i] for i in range(len(X_train_cols))],
        'coef': [clf.coef_[0,i] for i in range(len(X_train_cols))]
    }
    df_coefs = pd.DataFrame(coefs)
    print(df_coefs.sort_values('coef', axis=0, ascending=False))
    
    #In general, features with coefficients that are farther from zero are the most positively/negatively correlated
    #with the outcome. However, we did not scale the data prior to training, so it is possible that more 
    #important predictors that are not scaled appropriately will have lower coefficients.

<class 'sklearn.linear_model.logistic.LogisticRegression'>
Training accuracy: 0.975728443415821
Validation accuracy: 0.9740074773010504
          column      coef
1    visit_count -0.005788
0  ever_referral -0.289513




In [1972]:
#predict response
pred = clf.predict(X_test)

# evaluate accuracy: fraction of samples correctly predicted
#not the best for rare events
from sklearn.metrics import accuracy_score
print('accuracy_score is :' + str(accuracy_score(y_test, pred)))

from sklearn.metrics import confusion_matrix
print('confusion matrix: ' + str(confusion_matrix(y_test, pred)))

#the fraction of positives events that you predicted correctly
from sklearn.metrics import recall_score
print('recall_score is ' + str(recall_score(y_test, pred)))

# the fraction of predicted positives events that are actually positive 
from sklearn.metrics import precision_score
print('precision_score is ' + str(precision_score(y_test, pred)))

#The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
from sklearn.metrics import f1_score
print('f1_score is ' + str(f1_score(y_test, pred)))
#accuracy_score(df.actual_label.values, df.predicted_RF.values)


accuracy_score is :0.9740074773010504
confusion matrix: [[5471    0]
 [ 146    0]]
recall_score is 0.0
precision_score is 0.0
f1_score is 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
df_coefs

In [None]:
score = clf.score(X_test, y_test)
print(score)

# Danger Signs At Delivery

In [None]:
pregnancy_delivery_survey_df_subset.head()

In [None]:
#Split the data into training and test sets

#Separate the target variable from the rest of the data
def split_target(data, target_name):
    target = data[[target_name]]
    data.drop(target_name, axis=1, inplace=True)
    return (data, target)


X, y = split_target(pregnancy_delivery_survey_df_subset, 'danger_signs_at_delivery') #y holds response variable, X holds our dataset

#Split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234 #for reproducibility
)

#NumPy array conversion for scikitlearn

#First, we save the final column names, which will assist us when we assess variable importance later on:

X_train_cols = X_train.columns
X_test_cols = X_test.columns

#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
X_train = X_train.values
X_test = X_test.values

In [None]:
#Check the number of positive and negative responses in the response variable 
#- do we need to do any upsampling or downsampling of the data?
print(y_train.groupby('danger_signs_at_delivery').size())

In [None]:
#NumPy array conversion for scikitlearn

#First, we save the final column names, which will assist us when we assess variable importance later on:

X_train_cols = X_train.columns
X_test_cols = X_test.columns

#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
X_train = X_train.values
X_test = X_test.values

In [None]:
#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
y_train = y_train.values
y_test = y_test.values

In [None]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

clfs = [LogisticRegression()]

for clf in clfs:
    clf.fit(X_train, y_train.ravel())
    print(type(clf))
    print('Training accuracy: ' + str(clf.score(X_train, y_train)))
 #   print('Validation accuracy: ' + str(clf.score(X_test, y_test)))
    
    coefs = {
        'column': [X_train_cols[i] for i in range(len(X_train_cols))],
        'coef': [clf.coef_[0,i] for i in range(len(X_train_cols))]
    }
    df_coefs = pd.DataFrame(coefs)
    print(df_coefs.sort_values('coef', axis=0, ascending=False))
    
    #In general, features with coefficients that are farther from zero are the most positively/negatively correlated
    #with the outcome. However, we did not scale the data prior to training, so it is possible that more 
    #important predictors that are not scaled appropriately will have lower coefficients.

In [None]:
score = clf.score(X_test, y_test)
print(score)

In [None]:
############################################

In [None]:
pregnancy_delivery_df.columns

In [None]:
oops!

In [None]:
pregnancy_delivery_df_subset = pregnancy_delivery_df[['risk_factor_1', 'risk_factor_2', 'risk_factor_3', 'risk_factor_4',
       'risk_factor_5', 'risk_factor_6', 'danger_signs_1', 'danger_signs_2',
       'danger_signs_3', 'danger_signs_4', 'danger_signs_5', 'danger_signs_6',
       'danger_signs_7', 'danger_signs_8', 'danger_signs_9', 'household_count','delivery_facility_delivery']]

In [None]:
#Split the data into training and test sets

#Separate the target variable from the rest of the data
def split_target(data, target_name):
    target = data[[target_name]]
    data.drop(target_name, axis=1, inplace=True)
    return (data, target)

X, y = split_target(pregnancy_delivery_df_subset, 'delivery_facility_delivery') #y holds response variable, X holds our dataset

#Split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234 #for reproducibility
)

In [None]:
#Check the number of positive and negative responses in the response variable 
#- do we need to do any upsampling or downsampling of the data?
print(y_train.groupby('delivery_facility_delivery').size())

In [None]:
X_train

In [None]:
#NumPy array conversion for scikitlearn

#First, we save the final column names, which will assist us when we assess variable importance later on:

X_train_cols = X_train.columns
X_test_cols = X_test.columns

#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
X_train = X_train.values
X_test = X_test.values

In [None]:
#Now, we use the values attribute of the pandas DataFrames to access the underlying NumPy array for each DataFrame:
y_train = y_train.values
y_test = y_test.values

In [None]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

clfs = [LogisticRegression()]

for clf in clfs:
    clf.fit(X_train, y_train.ravel())
    print(type(clf))
    print('Training accuracy: ' + str(clf.score(X_train, y_train)))
 #   print('Validation accuracy: ' + str(clf.score(X_test, y_test)))
    
    coefs = {
        'column': [X_train_cols[i] for i in range(len(X_train_cols))],
        'coef': [clf.coef_[0,i] for i in range(len(X_train_cols))]
    }
    df_coefs = pd.DataFrame(coefs)
    print(df_coefs.sort_values('coef', axis=0, ascending=False))
    
    #In general, features with coefficients that are farther from zero are the most positively/negatively correlated
    #with the outcome. However, we did not scale the data prior to training, so it is possible that more 
    #important predictors that are not scaled appropriately will have lower coefficients.

In [None]:
df_coefs