In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
#import missingno as msno 
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

In [137]:
system = 'linux'

# Prenatal Data
parent = os.path.dirname(os.getcwd())
if system == 'windows':
    dataPath = os.path.join(parent, r"Data\MOMI\Final_Prenatal_DeIdentified.xlsx")
    prenatal = pd.read_excel('file:\\' + dataPath)
else:
    dataPath = os.path.join(parent, r"Data/MOMI/Final_Prenatal_DeIdentified.xlsx")
    prenatal = pd.read_excel('file://' + dataPath)

# MOMI Data
parent = os.path.dirname(os.getcwd())
if system == 'windows':
    dataPath = os.path.join(parent, r"Data\MOMI\Final_MOMI_DeIdentified_Update_39Mar2021.xlsx")
    momi = pd.read_excel('file:\\' + dataPath)
else:
    dataPath = os.path.join(parent, r"Data/MOMI/Final_MOMI_DeIdentified_Update_39Mar2021.xlsx")
    momi = pd.read_excel('file://' + dataPath)

# Ultrasound Data
parent = os.path.dirname(os.getcwd())
if system == 'windows':
    dataPath = os.path.join(parent, r"Data\MOMI\Final_Ultrasound_DeIdentified.xlsx")
    ultrasound = pd.read_excel('file:\\' + dataPath)
else: 
    dataPath = os.path.join(parent, r"Data/MOMI/Final_Ultrasound_DeIdentified.xlsx")
    ultrasound = pd.read_excel('file://' + dataPath)

# Fix MOMI missing values to np.NaN
momi['MIDBV'] = np.where(momi['MIDBV'] == 99, np.NaN, momi['MIDBV'])
momi['MIDCHLAM'] = np.where(momi['MIDCHLAM'] == 99, np.NaN, momi['MIDCHLAM'])
momi['MIDCONDY'] = np.where(momi['MIDCONDY'] == 99, np.NaN, momi['MIDCONDY'])
momi['MIDGC'] = np.where(momi['MIDGC'] == 99, np.NaN, momi['MIDGC'])
momi['MIDHEPB'] = np.where(momi['MIDHEPB'] == 99, np.NaN, momi['MIDHEPB'])
momi['MIDTRICH'] = np.where(momi['MIDTRICH'] == 99, np.NaN, momi['MIDTRICH'])
momi['MIDGBS'] = np.where(momi['MIDGBS'] == 99, np.NaN, momi['MIDGBS'])
momi['MomEducation_State'] = np.where(momi['MomEducation_State'] == 'Unknown', np.NaN, momi['MomEducation_State'])
momi['DadEducation_State'] = np.where(momi['DadEducation_State'] == 'Unknown', np.NaN, momi['DadEducation_State'])
momi['Smoke_b'] = np.where(momi['Smoke_b'] == 'Unknown (unable to assess)', np.NaN, momi['Smoke_b'])
momi['Race'] = np.where(momi['Race'].isin(['9', 'A']), np.NaN, momi['Race'])
momi['Ethnicity'] = np.where(momi['Ethnicity'].isin(['UNSPECIFIED']), np.NaN, momi['Ethnicity'])
momi['InfSex'] = np.where(momi['InfSex'] == 'U', np.NaN, momi['InfSex'])
momi['InfSex'] = np.where(momi['InfSex'] == 'f', 'F', momi['InfSex'])


# Dropping erroneous prenatal data. This data does not actually exist, is thousands of missing values
prenatal.drop(prenatal[prenatal['DELWKSGT'].isnull()].index, inplace=True)
prenatal.drop(prenatal[prenatal['PNV_Total_Number'].isnull()].index, inplace=True)


insuranceMap = {1: 'MedicalAssistance',
                2: 'PrivateInsurance',
                3: 'Self-pay'}

momi['DFC'] = momi['DFC'].map(insuranceMap)

outcomeMap = {1: 'SingleStillborn',
              2: 'TwinsLiveborn',
              3: 'TwinsOneLive',
              4: 'TwinsStillborn',
              5: 'MultsLiveborn',
              6: 'OtherMultSomeLive',
              7: 'OtherMultStillborn',
              9: np.NaN,
              10: 'SingleLiveborn'}

momi['MMULGSTD'] = momi['MMULGSTD'].map(outcomeMap)

neurMuscDiseaseMap = {0: 'None',
                      1: 'Multiple Sclerosis',
                      2: 'Cerebal Palsy',
                      3: 'Myotonic Dystrophy',
                      4: 'Myasthenia Gravis',
                      5: 'Other'}

momi['MCNSMUSC'] = momi['MCNSMUSC'].map(neurMuscDiseaseMap)

collagenVascMap = {0: 'None',
                   1: 'Rhematoid Arthritis',
                   2: 'Lupus',
                   8: 'Multiple Diagnostic Codes'}

momi['MCOLVASC'] = momi['MCOLVASC'].map(collagenVascMap)

struHeartMap = {0: 'None',
                1: 'Congenital Heart Disease',
                2: 'Rheumatic Heart Disease',
                3: 'Myocarditis/Cardiomyopathy',
                4: 'ValveDisorder',
                5: 'ArtificialValves',
                9: 'Other'}

momi['MCVDANAT'] = momi['MCVDANAT'].map(struHeartMap)

postpartMap = {0: 'None',
               1: 'Endometritis',
               2: 'UrinaryTractInfection',
               3: 'Hemmorrage',
               4: 'WoundInfection',
               5: 'Disseminated',
               6: 'Obstruction',
               9: 'Other'}

momi['MDELCOMP'] = momi['MDELCOMP'].map(postpartMap)

diabetesMap = {0: 'None',
               1: 'GestationalDiabetes',
               2: 'TypeI',
               3: 'TypeII',
               4: 'UnspecifiedPriorDiabetes'}

momi['MENDDIAB'] = momi['MENDDIAB'].map(diabetesMap)

thyroidMap = {0: 'None',
              1: 'Hyperthyroid',
              2: 'Hypothyroid',
              9: 'Other'}

momi['MENDTHY'] = momi['MENDTHY'].map(thyroidMap)

liverGallMap = {0: 'None',
                1: 'HepA',
                2: 'HepB',
                3: 'HepC',
                4: 'HepD',
                5: 'HepE',
                6: 'LiverTransplant',
                7: 'Cholelithiasis',
                8: 'Pancreatitis',
                9: 'Other'}

momi['MGILGBP'] = momi['MGILGBP'].map(liverGallMap)

kidneyMap = {0: 'None',
             1: 'Glomerulonephritis',
             2: 'Pyelonephritis;',
             3: 'LupusNephritis',
             4: 'NephroticSyndrome',
             5: 'Nephrolithiasis',
             6: 'Transplant;',
             7: 'RenalAbscess',
             8: 'MultipleDiagnosticCodes',
             9: 'Other'}

momi['MGURENAL'] = momi['MGURENAL'].map(kidneyMap)

anemiaMap = {0: 'None',
             1: 'IronDeficiencyAnemia',
             2: 'B12DeficiencyAnemia',
             3: 'FolateDeficiencyAnemia',
             9: 'UnspecifiedAnemia'}

momi['MHEMANEM'] = momi['MHEMANEM'].map(anemiaMap)

hemoGlob = {0: 'None',
            1: 'Hgb-SS',
            2: 'Hgb-SC',
            3: 'Hgb-Sthal',
            4: 'AlphaThalassemia',
            5: 'BetaThalassemia',
            6: 'SickleCellTrait',
            9: 'Other'}

momi['MHEMHGB'] = momi['MHEMHGB'].map(hemoGlob)

thromMap = {0: 'None',
            1: 'Gestational',
            2: 'DisseminatedIntravascularCoagulation',
            3: 'MultipleDiagnosticCodes',
            9: 'Other'}

momi['MHEMPLT'] = momi['MHEMPLT'].map(thromMap)

viralMap = {0: 'None',
            1: 'PrimaryCMV',
            2: 'ParovirusB19',
            3: 'Rubella',
            4: 'Toxoplasma',
            5: np.NaN,
            8: 'MultipleDiagnosticCodes',
            9: 'Other'}

momi['MIDVIRPR'] = momi['MIDVIRPR'].map(viralMap)

substanceMap = {0: 'None',
                1: 'Stimulants',
                2: 'Sedatives/Hypnotics/Anxiolytics',
                3: 'Anti-depressants/OtherPsychoactives',
                4: 'Hallucinogens',
                6: 'Alcohol',
                8: 'MultipleDiagnosticCodes',
                9: 'Other'}

momi['MTOXOTHR'] = momi['MTOXOTHR'].map(substanceMap)

anoAnoMap = {0: 'None',
             1: 'Anencephaly/Similar',
             2: 'Encephalocele',
             3: 'Microcephaly',
             4: 'CongenitalHydrocephalus',
             5: 'SpinaBifida',
             8: 'MultipleDiagnosticCodes',
             0: 'OtherCongenital'}

momi['ICNSANAT'] = momi['ICNSANAT'].map(anoAnoMap)



# Ordinal Encoding Education
education_map = {'8th grade or less': 1,
                 '9th-12th grade, no diploma': 2,
                 'High school graduate or GED completed': 3,
                 'Some college credit, no degree': 4,
                 'Associate degree': 5,
                 "Bachelor's degree": 6,
                 "Master's degree": 7,
                 'Doctorate or professional degree': 8,
                 'Doctorate or Professional degree': 8}

momi['DadEducation_State'] = momi['DadEducation_State'].map(education_map)

momi['MomEducation_State'] = momi['MomEducation_State'].map(education_map)

# Renaming Race variables for easier comparison
raceMap = {'B': 'AfricanAmerican', 'C': "Chinese", 'D': "Declined",
           'E': "OtherAsian", 'F': "Filipino", 'G': "Guam/Chamorro",
           'I': "Indian(Asian)", 'J': "Japanese", 'K': "Korean",
           'L': "AlaskanNative", 'N': "NativeAmerican", 'P': "OtherPacificIslander",
           'Q': "Hawaiian", 'S': "Samoan", 'V': "Vietnamese", 'W': "White", 'D': "Declined", 9: np.NaN}

momi['Race'] = momi['Race'].map(raceMap)

# Collapsing Race categories
momi['RaceCollapsed'] = np.NaN

AsianGroups = ['OtherAsian', 'Indian(Asian)', 'Chinese', 'Korean', 'Filipino', 'Japanese', 'Vietnamese']
Polynesian = ['Hawaiian', 'Samoan', 'OtherPacificIslander', 'Guam/Chamorro']  # Unsure about Guam
NativeGroups = ['NativeAmerican', 'AlaskanNative']

# Asian
momi['RaceCollapsed'] = np.where((momi['Race'].isin(AsianGroups)), 'Asian', momi['RaceCollapsed'])
# Polynesian
momi['RaceCollapsed'] = np.where((momi['Race'].isin(Polynesian)), 'Polynesian', momi['RaceCollapsed'])
# Native
momi['RaceCollapsed'] = np.where((momi['Race'].isin(NativeGroups)), 'Native', momi['RaceCollapsed'])
# African
momi['RaceCollapsed'] = np.where((momi['Race'] == 'AfricanAmerican'), 'African', momi['RaceCollapsed'])
# White
momi['RaceCollapsed'] = np.where((momi['Race'] == 'White'), 'White', momi['RaceCollapsed'])

# Renaming Hypertensive variables for easier comparison
hypMap = {0: 'None', 1: 'TransientHypertension',
          2: 'Preeclampsia mild', 3: 'PreeclampsiaSevere',
          5: 'Eclampsia', 6: 'ChronicHypwPre',
          8: 'MultipleDiagnosticCodes', 9: 'UnspecifiedHyp'}

momi['MOBHTN'] = momi['MOBHTN'].map(hypMap)

# Set mildpe to 0 if marked severe
momi['Mild_PE'] = np.where(momi['MOBHTN'] == 'PreeclampsiaSevere', 0, momi['Mild_PE'])

# Looking at any occurance of Preeclampsia/Eclampsia
momi['Preeclampsia/Eclampsia'] = np.NaN
momi['Preeclampsia/Eclampsia'] = np.where(
    (momi['Mild_PE'] == 1) | (momi['Severe_PE'] == 1) | (momi['SIPE'] == 1) | (momi['MOBHTN'] == 'Eclampsia'), 1, 0)

# Renaming columns for easier analysis
momi.rename(columns={"DMOMAGE": "MotherAge", "FatherAge_State": "FatherAge", "DFC": "Insurance",
                     "DELWKSGT": "GestAgeDelivery", "MHXGRAV": "TotalNumPregnancies",
                     "MHXPARA": "DeliveriesPriorAdmission",
                     "MHXABORT": "TotalAbortions", "PRIMIP": "Primagrivada", "DMOMHGT": "MaternalHeightMeters",
                     "MOBRPWT": "PrePregWeight", "MOBADMWT": "WeightAtAdmission",
                     "FOBLABHR": "HoursLaborToDelivery",
                     "FOBROMHR": "HoursMembraneReptureDelivery", "CSREPEAT": "RepeatCesarean",
                     "FDELTYPE": "DeliveryMethod",
                     "MMULGSTD": "OutcomeOfDelivery", "FOBDEATH": "FetalDeath",
                     "MCNSMUSC": "MaternalNeuromuscularDisease",
                     "MCOLVASC": "MCollagenVascularDisease", "MCVDANAT": "MStructuralHeartDiseas",
                     "MCVDHTN": "ChronicHypertension",
                     "MOBHTN": "PregRelatedHypertension", "MDELCOMP": "MPostPartumComplications",
                     "MDEPRESS": "Depression",
                     "MENDDIAB": "DiabetesMellitus", "MENDTHY": "ThyroidDisease",
                     "MGIHYPER": "HyperemesisGravidarum",
                     "MGILGBP": "MLiverGallPanc", "MGUINFER": "HistoryInfertility", "MGURENAL": "KidneyDisease",
                     "MHEARTOPER": "OperationsOnHeartandPericardium", "MHEMANEM": "MAnemiaWOHemoglobinopathy",
                     "MHEMHGB": "MHemoglobinopathy", "MHEMPLT": "Thrombocytopenia", "MHEMTRAN": "TransfusionOfPRBC",
                     "MIDBV": "BacterialVaginosis", "MIDCHLAM": "Chlamydia", "MIDCONDY": "Condylomata",
                     "MIDGBS": "GroupBStrep", "MIDGC": "GonococcalInfection", "MIDHEPB": "HepBInfection",
                     "MIDHSV": "Herpes", "MIDTB": "Tuberculosis", "MIDTRICH": "Trichomonas",
                     "MIDVIRPR": "ViralOrProtoInf",
                     "MINTERINJ": "ThoraxAbPelvInjuries", "MMORTECLAMP": "Eclampsia",
                     "MMORTHEARTFAIL": "HeartFailure",
                     "MMORTRENAL": "AcuteRenalFailure", "MMORTSICKLECELL": "SickleCell",
                     "MOBPRECS": "PreviousCesarean",
                     "MPULASTH": "Asthma", "MTOXCOC": "Cocaine", "MTOXNARC": "Opioid",
                     "MTOXOTHR": "OtherSubstanceAbuse",
                     "MTOXTHC": "Marijuana", "IDEMBWT": "InfantWeightGrams", "IGROWTH": "GestWeightCompare",
                     "ICNSANAT": "CNSAbnormality", "IIDSYPH": "CongenitalSyphilis", "IIDUTI": "UTI",
                     "Alcohol_a": 'Drinks/Week'}, inplace=True)

# Dropping variables with more than 20% missing values
momi = momi.loc[:, momi.isnull().mean() < .20]

# Joining the momi data with the prenatal data - we want women who never had preeclampsia and first incidence of
# preeclampsia, nothing else
# Step 1, split systolic and diastolic
new = prenatal["PNV_BP"].str.split("/", n=1, expand=True)
prenatal["Systolic"] = new[0]
prenatal["Diastolic"] = new[1]
prenatal[["Systolic", "Diastolic"]] = prenatal[["Systolic", "Diastolic"]].apply(pd.to_numeric)

In [138]:
# MAP = (Sys + (2*Dias))/3
prenatal['MAP'] = np.NaN
prenatal['MAP'] = (prenatal['Systolic'] + (2* prenatal['Diastolic']))/3

In [139]:
prenatal[['Systolic','Diastolic','MAP']].head()

Unnamed: 0,Systolic,Diastolic,MAP
0,102.0,70.0,80.666667
1,120.0,60.0,80.0
2,110.0,66.0,80.666667
3,110.0,70.0,83.333333
4,110.0,70.0,83.333333


In [76]:
# Step 2, make indicator variable
prenatal['High'] = np.where((prenatal['Systolic'] >= 130) | (prenatal['Diastolic'] >= 80), 1, 0)

# Step 3, make a cumulative sum to count how many times this person has had spikes
prenatal['Prev_highBP'] = prenatal.groupby(['MOMI_ID', 'Delivery_Number_Per_Mother'])['High'].cumsum().astype(int)

# Drop all women under 14 weeks from prenatal data
prenatal.drop(prenatal.loc[prenatal['PNV_GestAge'] > 14].index, inplace=True)
momi.sort_values('MOMI_ID', inplace=True)
uniquePregMomi = momi.drop_duplicates(subset=['MOMI_ID', 'Delivery_Number_Per_Mother'], keep='last')

prenatal.sort_values('PNV_GestAge', ascending=False, inplace=True)  # For preferenceing high bp
uniquePregPrenatal = prenatal.drop_duplicates(subset=['MOMI_ID', 'Delivery_Number_Per_Mother'], keep='first')
join = pd.merge(uniquePregMomi, uniquePregPrenatal, how='right')

# Removes duplicates, keeping only instances with Preeclampsia
join.sort_values('Preeclampsia/Eclampsia', ascending=False, inplace=True)
join = join.drop_duplicates(subset=['MOMI_ID'], keep='first')

# Ultrasound join

In [133]:
ultrasound.sort_values(by=['DeliveryYear', 'USGestAge'])
#ultrasound = ultrasound.loc[ultrasound['USGestAge']<20]
ultrasound = ultrasound.copy()
ultrasound.drop_duplicates(subset=['MOMI_ID', 'Delivery_Number_Per_Mother'], inplace=True)
locations = ['USPlacLoc_1', 'USPlacLoc_2','USPlacLoc_3', 'USPlacLoc_4', 'USPlacLoc_5', 'USPlacLoc_6']

In [134]:
# Step one: create indicator columns
ultrasound['Loc_Anterior'] = 0
ultrasound['Loc_Posterior'] = 0
ultrasound['Loc_LeftLateral'] = 0
ultrasound['Loc_RightLateral'] = 0
ultrasound['Loc_Fundal'] = 0
ultrasound['Loc_Previa'] = 0
ultrasound['Loc_LowLying'] = 0
ultrasound['Loc_NonDiagnostic'] = 0

# For each location column: 
for col in locations:
    #Check what the position is and mark the appropriate binary column
    ultrasound['Loc_Anterior'] = np.where(((ultrasound[col] ==1)), 1,ultrasound['Loc_Anterior'])
    ultrasound['Loc_Posterior'] = np.where(((ultrasound[col] ==2)), 1,ultrasound['Loc_Posterior'])
    ultrasound['Loc_LeftLateral'] = np.where(((ultrasound[col] ==3)), 1,ultrasound['Loc_LeftLateral'])
    ultrasound['Loc_RightLateral'] = np.where(((ultrasound[col] ==4)), 1,ultrasound['Loc_RightLateral'])
    ultrasound['Loc_Fundal'] = np.where(((ultrasound[col] ==5)), 1,ultrasound['Loc_Fundal'])
    ultrasound['Loc_Previa'] = np.where(((ultrasound[col] ==6)), 1,ultrasound['Loc_Previa'])
    ultrasound['Loc_LowLying'] = np.where(((ultrasound[col] ==7)), 1,ultrasound['Loc_LowLying'])
    ultrasound['Loc_NonDiagnostic'] = np.where(((ultrasound[col] ==8)), 1,ultrasound['Loc_NonDiagnostic'])

ultrasound.drop(columns=locations, inplace=True)

In [119]:
ultrasound.columns

Index(['MOMI_ID', 'DeliveryYear', 'Delivery_Number_Per_Mother', 'DELWKSGT',
       'MMULGSTD', 'SINGLETON', 'USExamCategory', 'USExamType', 'USGestAge',
       'USGestAgeType', 'FetusNumber', 'Loc_Anterior', 'Loc_Posterior',
       'Loc_LeftLateral', 'Loc_RightLateral', 'Loc_Fundal', 'Loc_Previa',
       'Loc_LowLying', 'Loc_NonDiagnostic'],
      dtype='object')

## Testing merge

In [135]:
testMerge = pd.merge(join,ultrasound[['MOMI_ID', 'Delivery_Number_Per_Mother', 'Loc_Anterior', 'Loc_Posterior',
       'Loc_LeftLateral', 'Loc_RightLateral', 'Loc_Fundal', 'Loc_Previa',
       'Loc_LowLying', 'Loc_NonDiagnostic']],on=['MOMI_ID', 'Delivery_Number_Per_Mother'], how='left')

In [130]:
testMerge

Unnamed: 0,MOMI_ID,Delivery_Number_Per_Mother,DeliveryYear,Has_Prenatal_Data,Has_Ultrasound_PlacLoc,MotherAge,Race,Insurance,GestAgeDelivery,TotalNumPregnancies,...,High,Prev_highBP,Loc_Anterior,Loc_Posterior,Loc_LeftLateral,Loc_RightLateral,Loc_Fundal,Loc_Previa,Loc_LowLying,Loc_NonDiagnostic
0,62316153991994,1,2020,1,1,25.0,White,PrivateInsurance,37.857143,1.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,11834913991992,1,2020,1,1,28.0,White,PrivateInsurance,37.000000,2.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,61872367991976,1,2015,1,1,38.0,,PrivateInsurance,40.714286,1.0,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,54821479991981,1,2017,1,0,35.0,White,MedicalAssistance,33.714286,7.0,...,0,0,,,,,,,,
4,99311275001986,1,2019,1,1,32.0,AfricanAmerican,MedicalAssistance,36.142857,4.0,...,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31426,35271662991981,2,2016,1,0,35.0,White,PrivateInsurance,37.571429,2.0,...,0,1,,,,,,,,
31427,16456891991988,1,2020,1,1,31.0,Korean,PrivateInsurance,39.571429,2.0,...,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
31428,25224329991981,2,2015,1,0,33.0,White,PrivateInsurance,33.571429,3.0,...,0,0,,,,,,,,
31429,54694415991986,1,2020,1,1,34.0,White,MedicalAssistance,35.571429,2.0,...,1,1,,,,,,,,


In [136]:
testMerge.iloc[:,94:102].isnull().sum()

Loc_Anterior         9414
Loc_Posterior        9414
Loc_LeftLateral      9414
Loc_RightLateral     9414
Loc_Fundal           9414
Loc_Previa           9414
Loc_LowLying         9414
Loc_NonDiagnostic    9414
dtype: int64

In [105]:
18226/31431

0.5798733734211448

In [110]:
testMerge.columns

Index(['MOMI_ID', 'Delivery_Number_Per_Mother', 'DeliveryYear',
       'Has_Prenatal_Data', 'Has_Ultrasound_PlacLoc', 'MotherAge', 'Race',
       'Insurance', 'GestAgeDelivery', 'TotalNumPregnancies',
       ...
       'High', 'Prev_highBP', 'Loc_Anterior', 'Loc_Posterior',
       'Loc_LeftLateral', 'Loc_RightLateral', 'Loc_Fundal', 'Loc_Previa',
       'Loc_LowLying', 'Loc_NonDiagnostic'],
      dtype='object', length=102)

In [111]:
columns = ['Loc_Anterior', 'Loc_Posterior',
       'Loc_LeftLateral', 'Loc_RightLateral', 'Loc_Fundal', 'Loc_Previa',
       'Loc_LowLying', 'Loc_NonDiagnostic']

In [82]:
join.shape

(31431, 94)

Now I just need to deal with the multiples

In [40]:
# Drop all women under 14 weeks from prenatal data
USTest.drop(USTest.loc[USTest['USGestAge'] > 14].index, inplace=True)
USTest.sort_values(by='MOMI_ID')
#USTest.loc[USTest.duplicated(subset=['MOMI_ID','Delivery_Number_Per_Mother'])]
USTest

Unnamed: 0,MOMI_ID,DeliveryYear,Delivery_Number_Per_Mother,DELWKSGT,MMULGSTD,SINGLETON,USExamCategory,USExamType,USGestAge,USGestAgeType,FetusNumber,Loc_Anterior,Loc_Posterior,Loc_LeftLateral,Loc_RightLateral,Loc_Fundal,Loc_Previa,Loc_LowLying,Loc_NonDiagnostic
2,81155213991973,2011,1,37.000000,10,1,Obstetric,FirstTrimester,9.7,LMP,1,0,0,0,0,0,0,0,1
3,81155213991973,2011,1,37.000000,10,1,Obstetric,FirstTrimester,13.1,LMP,1,0,0,0,1,1,0,0,0
6,98174825991981,2014,2,38.000000,10,1,Obstetric,FirstTrimester,13.6,LMP,1,0,1,0,0,0,0,0,0
9,76494213991976,2012,1,30.000000,10,1,Obstetric,FirstTrimester,11.4,LMP,1,1,0,0,0,0,0,0,0
20,64227234991978,2012,1,39.000000,10,1,Obstetric,FirstTrimester,13.4,LMP,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221183,76739537991993,2020,1,21.285714,1,1,Obstetric,FirstTrimester,12.4,LMP,1,0,1,0,0,0,0,0,0
221248,58515446991985,2020,1,25.285714,10,1,Obstetric,FirstTrimester,12.3,Crown Rump,1,1,0,0,0,0,0,0,0
221249,58515446991985,2020,1,25.285714,10,1,Obstetric,FirstTrimester,13.1,PriorExam,1,1,0,0,0,0,0,0,0
221260,57247562991987,2016,1,39.714286,10,1,Obstetric,FirstTrimester,6.1,Crown Rump,1,0,0,0,0,0,0,0,1


In [41]:
USTest.loc[USTest['FetusNumber']!= 1]

Unnamed: 0,MOMI_ID,DeliveryYear,Delivery_Number_Per_Mother,DELWKSGT,MMULGSTD,SINGLETON,USExamCategory,USExamType,USGestAge,USGestAgeType,FetusNumber,Loc_Anterior,Loc_Posterior,Loc_LeftLateral,Loc_RightLateral,Loc_Fundal,Loc_Previa,Loc_LowLying,Loc_NonDiagnostic
23,91911455991972,2011,1,39.000000,2,0,Obstetric,FirstTrimester,11.9,Conception,2,1,0,0,0,0,0,0,0
158,43522444991979,2014,3,38.000000,2,0,Obstetric,FirstTrimester,13.4,LMP,2,0,1,0,0,0,0,0,0
189,14894948991980,2015,1,24.285714,2,0,Obstetric,FirstTrimester,7.1,LMP,2,0,0,0,0,0,0,0,1
191,14894948991980,2015,1,24.285714,2,0,Obstetric,FirstTrimester,11.6,LMP,2,0,1,0,0,0,0,0,0
392,68346721991982,2015,3,29.285714,2,0,Obstetric,FirstTrimester,7.7,LMP,2,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220222,42931443991996,2020,1,37.142857,2,0,Obstetric,FirstTrimester,9.0,LMP,2,0,1,0,0,0,0,0,0
220820,83844221991992,2020,1,24.714286,2,0,Obstetric,FirstTrimester,10.4,LMP,2,0,0,0,0,0,0,0,1
221007,74661382991987,2020,1,21.428571,10,1,Obstetric,FirstTrimester,5.9,Composite,2,0,0,0,0,0,0,0,1
221032,86937822991988,2020,1,29.857143,2,0,Obstetric,FirstTrimester,10.9,LMP,2,0,1,0,0,0,0,0,0


In [42]:
USTest.loc[USTest['MOMI_ID']== 43522444991979]

Unnamed: 0,MOMI_ID,DeliveryYear,Delivery_Number_Per_Mother,DELWKSGT,MMULGSTD,SINGLETON,USExamCategory,USExamType,USGestAge,USGestAgeType,FetusNumber,Loc_Anterior,Loc_Posterior,Loc_LeftLateral,Loc_RightLateral,Loc_Fundal,Loc_Previa,Loc_LowLying,Loc_NonDiagnostic
156,43522444991979,2011,2,38.0,10,1,Obstetric,FirstTrimester,13.6,Crown Rump,1,0,1,0,0,0,0,0,0
157,43522444991979,2014,3,38.0,2,0,Obstetric,FirstTrimester,13.4,LMP,1,0,1,0,0,0,0,0,0
158,43522444991979,2014,3,38.0,2,0,Obstetric,FirstTrimester,13.4,LMP,2,0,1,0,0,0,0,0,0


So we'll need to single out the non-singletons

In [43]:
USTest.loc[USTest['SINGLETON']==0]

Unnamed: 0,MOMI_ID,DeliveryYear,Delivery_Number_Per_Mother,DELWKSGT,MMULGSTD,SINGLETON,USExamCategory,USExamType,USGestAge,USGestAgeType,FetusNumber,Loc_Anterior,Loc_Posterior,Loc_LeftLateral,Loc_RightLateral,Loc_Fundal,Loc_Previa,Loc_LowLying,Loc_NonDiagnostic
22,91911455991972,2011,1,39.000000,2,0,Obstetric,FirstTrimester,11.9,Conception,1,1,0,0,0,0,0,0,0
23,91911455991972,2011,1,39.000000,2,0,Obstetric,FirstTrimester,11.9,Conception,2,1,0,0,0,0,0,0,0
157,43522444991979,2014,3,38.000000,2,0,Obstetric,FirstTrimester,13.4,LMP,1,0,1,0,0,0,0,0,0
158,43522444991979,2014,3,38.000000,2,0,Obstetric,FirstTrimester,13.4,LMP,2,0,1,0,0,0,0,0,0
188,14894948991980,2015,1,24.285714,2,0,Obstetric,FirstTrimester,7.1,LMP,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220820,83844221991992,2020,1,24.714286,2,0,Obstetric,FirstTrimester,10.4,LMP,2,0,0,0,0,0,0,0,1
221031,86937822991988,2020,1,29.857143,2,0,Obstetric,FirstTrimester,10.9,LMP,1,0,1,0,0,0,0,0,0
221032,86937822991988,2020,1,29.857143,2,0,Obstetric,FirstTrimester,10.9,LMP,2,0,1,0,0,0,0,0,0
221109,24154633991992,2020,1,34.714286,2,0,Obstetric,FirstTrimester,10.1,PatientEDC,1,1,0,0,0,0,0,0,0


They look like they might all be in the same locations. How do I check? 

In [53]:
USTest.sort_values(by=['MOMI_ID','Delivery_Number_Per_Mother'])
USTest.loc[USTest['SINGLETON']==0].groupby(['MOMI_ID','Delivery_Number_Per_Mother', 'USGestAge', 'FetusNumber'])['Loc_Anterior', 'Loc_Posterior',
       'Loc_LeftLateral', 'Loc_RightLateral', 'Loc_Fundal', 'Loc_Previa',
       'Loc_LowLying', 'Loc_NonDiagnostic'].sum()

  USTest.loc[USTest['SINGLETON']==0].groupby(['MOMI_ID','Delivery_Number_Per_Mother', 'USGestAge', 'FetusNumber'])['Loc_Anterior', 'Loc_Posterior',


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Loc_Anterior,Loc_Posterior,Loc_LeftLateral,Loc_RightLateral,Loc_Fundal,Loc_Previa,Loc_LowLying,Loc_NonDiagnostic
MOMI_ID,Delivery_Number_Per_Mother,USGestAge,FetusNumber,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11144674991986,1,11.9,1,0,1,0,0,0,0,0,0
11144674991986,1,11.9,2,0,1,0,0,0,0,0,0
11154913991985,3,11.9,1,1,0,1,0,0,0,0,0
11154913991985,3,11.9,2,1,0,0,0,0,0,0,0
11173443991989,1,9.2,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
99553926991987,2,11.7,2,0,1,0,0,0,0,0,0
99828932991981,1,9.3,1,1,0,0,0,0,0,0,0
99828932991981,1,9.3,2,0,1,0,0,1,0,0,0
99828932991981,1,12.3,1,1,0,0,0,0,0,0,0


In [54]:
multTest = ultrasound.copy()

In [65]:
multTest.sort_values(by=['USGestAge'])
multTest.drop(multTest.loc[multTest['USGestAge'] > 14].index, inplace=True)

In [57]:
multTest=multTest.loc[multTest['SINGLETON']==0]

In [66]:
multTest

Unnamed: 0,MOMI_ID,DeliveryYear,Delivery_Number_Per_Mother,DELWKSGT,MMULGSTD,SINGLETON,USExamCategory,USExamType,USGestAge,USGestAgeType,FetusNumber,USPlacLoc_1,USPlacLoc_2,USPlacLoc_3,USPlacLoc_4,USPlacLoc_5,USPlacLoc_6
22,91911455991972,2011,1,39.000000,2,0,Obstetric,FirstTrimester,11.9,Conception,1,1,,,,,
23,91911455991972,2011,1,39.000000,2,0,Obstetric,FirstTrimester,11.9,Conception,2,1,,,,,
157,43522444991979,2014,3,38.000000,2,0,Obstetric,FirstTrimester,13.4,LMP,1,2,,,,,
158,43522444991979,2014,3,38.000000,2,0,Obstetric,FirstTrimester,13.4,LMP,2,2,,,,,
188,14894948991980,2015,1,24.285714,2,0,Obstetric,FirstTrimester,7.1,LMP,1,8,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220820,83844221991992,2020,1,24.714286,2,0,Obstetric,FirstTrimester,10.4,LMP,2,8,,,,,
221031,86937822991988,2020,1,29.857143,2,0,Obstetric,FirstTrimester,10.9,LMP,1,2,,,,,
221032,86937822991988,2020,1,29.857143,2,0,Obstetric,FirstTrimester,10.9,LMP,2,2,,,,,
221109,24154633991992,2020,1,34.714286,2,0,Obstetric,FirstTrimester,10.1,PatientEDC,1,1,,,,,
