In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display

data_dir = "../data/erin_data"

pd.set_option("display.max_columns", None)

# read in GA data
ga_data_path = os.path.join(data_dir, "ga_data.csv")
ga_data = pd.read_csv(ga_data_path)
ga_data.replace([" "], np.nan, inplace=True)

# weight variable is numeric but stored as object - convert to float
ga_data["dgaWeight"] = ga_data.dgaWeight.astype("float")

## Error with dgaGA cols:

for some reason "dgaGAType" , "dgaGATotal", and "dgaGASequence" have been converted from their respective integer values to take the values "Extracted": 1, "Filled": 2, "Sealed": 3

convert them back to correct values - dgaGAType maps to 1 = exodontia, 2 = comprehensive care - other two are integer fields

In [2]:
correct_tot_seq_vals = {
    "Extracted": 1,
    "Filled": 2,
    "Sealed": 3,
}

ga_data["dgaGATotal"] = ga_data["dgaGATotal"].map(correct_tot_seq_vals)
ga_data["dgaGATotal"] = ga_data["dgaGATotal"].astype("int")
ga_data["dgaGASequence"] = ga_data["dgaGASequence"].map(correct_tot_seq_vals)
ga_data["dgaGASequence"] = ga_data["dgaGASequence"].astype("int")

correct_type_vals = {
    "Extracted" : "exodontia",
    "Filled" : "comprehensive_care"
}

ga_data["dgaGAType"] = ga_data.dgaGAType.map(correct_type_vals)
ga_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1088 entries, 0 to 1087
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ChildID            1088 non-null   int64  
 1   AgeMths            1088 non-null   int64  
 2   PregnancyID        1088 non-null   object 
 3   MotherID           1088 non-null   int64  
 4   FatherID           219 non-null    float64
 5   dgaGAType          1088 non-null   object 
 6   dgaGATotal         1088 non-null   int64  
 7   dgaGASequence      1088 non-null   int64  
 8   dgaWeight          915 non-null    float64
 9   dgaUR6             1087 non-null   object 
 10  dgaURE             1087 non-null   object 
 11  dgaURD             1085 non-null   object 
 12  dgaURC             1088 non-null   object 
 13  dgaURB             1088 non-null   object 
 14  dgaURA             1085 non-null   object 
 15  dgaULA             1086 non-null   object 
 16  dgaULB             1087 

In [3]:
# read in epi data
epi_data_path = os.path.join(data_dir, "epi_data.csv")
epi_data = pd.read_csv(epi_data_path)
epi_data.replace([" "], np.nan, inplace=True)
epi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ChildID            354 non-null    int64  
 1   PregnancyID        354 non-null    object 
 2   MotherID           354 non-null    int64  
 3   FatherID           111 non-null    float64
 4   AgeMths            354 non-null    int64  
 5   ddsdt              354 non-null    int64  
 6   ddsmt              354 non-null    int64  
 7   ddsft              354 non-null    int64  
 8   ddsdmft            354 non-null    int64  
 9   ddscareindex       354 non-null    int64  
 10  res6mcage          354 non-null    int64  
 11  res6mcbetterstart  354 non-null    object 
dtypes: float64(1), int64(9), object(2)
memory usage: 33.3+ KB


In [4]:
# read in merged data
merge_data_path = os.path.join(data_dir, "epi_ga_merge_data.csv")
merge_data = pd.read_csv(merge_data_path)
merge_data.replace([" "], np.nan, inplace=True)

# for some reason first two entries are blank
merge_data = merge_data.iloc[2:]

merge_data["ChildID"] = merge_data.ChildID.astype("int")

type_keys_to_vals = {
    1 : "exodontia",
    2 : "comprehensive_care"
}
merge_data["dgaGAType"] = merge_data.dgaGAType.astype("float")
merge_data["dgaGAType"] = merge_data.dgaGAType.map(type_keys_to_vals)

merge_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1416 entries, 2 to 1417
Data columns (total 77 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ChildID                            1416 non-null   int64  
 1   PregnancyID                        1399 non-null   object 
 2   MotherID                           1399 non-null   float64
 3   FatherID                           318 non-null    float64
 4   admincgender                       1399 non-null   object 
 5   agefm_fbqall                       311 non-null    float64
 6   agemm_mbqall                       1126 non-null   float64
 7   ben0mentst                         1122 non-null   object 
 8   bib6a02                            139 non-null    object 
 9   bib6b04                            139 non-null    object 
 10  bib6b05                            141 non-null    object 
 11  deminfeth3gpcomb                   1357 non-null   objec

## Age Features

Lots of strangeness with age features: `AgeMths` and `res6mcage`

`AgeMths` - no explicit description given. wrt GA data seems to be the age at the time of GA (from looking at the data). WRT Epi data, values are either 65 or 66 only - very odd given epi kids are a sample of 5 year olds so we would expect the ages to range from 60 to 71 months. In merged dataset all GA only entries are Nan! - looks like an error in merge.

`res6mcage` - supposed to be "age in months of child at Epi". WRT Epi data, values are either 60 or 66 - this makes a bit more sense if granularity of measurement is 6 month or 1/2 year increments - so kids are either 5 or 5.5 y/o. WRT GA data, description makes no sense - all kids have an entry in this field despite only 20 or so appearing in the Epi samples. Entries are withing throwing distance of AgeMths but don't agree, even adjusting for 6month increments.

For now - merge AgeMths from GA data and delete entries from Epi data - not much info in there anyways:

In [5]:
# merge AgeMths col from GA data into merged data
merge_data.drop("AgeMths", axis=1, inplace=True)
key_cols = ["ChildID", "dgaGASequence"]
merge_data = merge_data.merge(ga_data[key_cols + ["AgeMths"]],
                              how="left",
                              on=key_cols)

## duplicate entries

there are equvalent duplicated entries for individual children in GA data and merged data:


In [6]:
from IPython.display import display

three_ops = ga_data.dgaGATotal == 3
three_op_ids = ga_data[three_ops].ChildID.unique()

for id in three_op_ids[:2]:
    print("GA Entries:")
    display(ga_data[ga_data.ChildID == id]
            .sort_values(["dgaGASequence"])) #%%
    print("Merged Entries:")
    display(merge_data.loc[merge_data.ChildID == id, ga_data.columns]
            .sort_values(["dgaGASequence"])) #%%

GA Entries:


Unnamed: 0,ChildID,AgeMths,PregnancyID,MotherID,FatherID,dgaGAType,dgaGATotal,dgaGASequence,dgaWeight,dgaUR6,dgaURE,dgaURD,dgaURC,dgaURB,dgaURA,dgaULA,dgaULB,dgaULC,dgaULD,dgaULE,dgaUL6,dgaLL6,dgaLLE,dgaLLD,dgaLLC,dgaLLB,dgaLLA,dgaLRA,dgaLRB,dgaLRC,dgaLRD,dgaLRE,dgaLR6,dgaUR4,dgaUR1,dgaUL1,dgaUL4,dgaLL5,dgaLL4,dgaLL1,dgaLR1,dgaLR4,dgaLR5,res6mcage,res6mcbetterstart
0,12465,57,10328P1,310,,comprehensive_care,3,1,,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,,,,,,,,,,,24,Yes
2,12465,57,10328P1,310,,comprehensive_care,3,2,,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,,,,,,,,,,,24,Yes
1,12465,97,10328P1,310,,comprehensive_care,3,3,39.0,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,,,,,,,,,,,96,Yes


Merged Entries:


Unnamed: 0,ChildID,AgeMths,PregnancyID,MotherID,FatherID,dgaGAType,dgaGATotal,dgaGASequence,dgaWeight,dgaUR6,dgaURE,dgaURD,dgaURC,dgaURB,dgaURA,dgaULA,dgaULB,dgaULC,dgaULD,dgaULE,dgaUL6,dgaLL6,dgaLLE,dgaLLD,dgaLLC,dgaLLB,dgaLLA,dgaLRA,dgaLRB,dgaLRC,dgaLRD,dgaLRE,dgaLR6,dgaUR4,dgaUR1,dgaUL1,dgaUL4,dgaLL5,dgaLL4,dgaLL1,dgaLR1,dgaLR4,dgaLR5,res6mcage,res6mcbetterstart
1065,12465,57.0,10328P1,310.0,,comprehensive_care,3.0,1.0,,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,,,,,,,,,,,,
1076,12465,57.0,,,,comprehensive_care,3.0,2.0,,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,,,,,,,,,,,,
1082,12465,97.0,,,,comprehensive_care,3.0,3.0,39.0,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,,,,,,,,,,,,


GA Entries:


Unnamed: 0,ChildID,AgeMths,PregnancyID,MotherID,FatherID,dgaGAType,dgaGATotal,dgaGASequence,dgaWeight,dgaUR6,dgaURE,dgaURD,dgaURC,dgaURB,dgaURA,dgaULA,dgaULB,dgaULC,dgaULD,dgaULE,dgaUL6,dgaLL6,dgaLLE,dgaLLD,dgaLLC,dgaLLB,dgaLLA,dgaLRA,dgaLRB,dgaLRC,dgaLRD,dgaLRE,dgaLR6,dgaUR4,dgaUR1,dgaUL1,dgaUL4,dgaLL5,dgaLL4,dgaLL1,dgaLR1,dgaLR4,dgaLR5,res6mcage,res6mcbetterstart
61,13149,66,11464P1,1431,,comprehensive_care,3,1,,Nil treatment,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Sealed,Sealed,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Sealed,,,,,,,,,,,66,No
63,13149,66,11464P1,1431,,comprehensive_care,3,2,,Nil treatment,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Sealed,Sealed,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Sealed,,,,,,,,,,,66,No
62,13149,97,11464P1,1431,,comprehensive_care,3,3,,Filled,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Filled,,,,,,,,,,,96,No


Merged Entries:


Unnamed: 0,ChildID,AgeMths,PregnancyID,MotherID,FatherID,dgaGAType,dgaGATotal,dgaGASequence,dgaWeight,dgaUR6,dgaURE,dgaURD,dgaURC,dgaURB,dgaURA,dgaULA,dgaULB,dgaULC,dgaULD,dgaULE,dgaUL6,dgaLL6,dgaLLE,dgaLLD,dgaLLC,dgaLLB,dgaLLA,dgaLRA,dgaLRB,dgaLRC,dgaLRD,dgaLRE,dgaLR6,dgaUR4,dgaUR1,dgaUL1,dgaUL4,dgaLL5,dgaLL4,dgaLL1,dgaLR1,dgaLR4,dgaLR5,res6mcage,res6mcbetterstart
1066,13149,66.0,11464P1,1431.0,,comprehensive_care,3.0,1.0,,Nil treatment,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Sealed,Sealed,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Sealed,,,,,,,,,,,,
1077,13149,66.0,,,,comprehensive_care,3.0,2.0,,Nil treatment,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Extracted,Sealed,Sealed,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Sealed,,,,,,,,,,,,
1083,13149,97.0,,,,comprehensive_care,3.0,3.0,,Filled,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Extracted,Extracted,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Nil treatment,Filled,,,,,,,,,,,,


It seems something has gone wrong with the kids that have been given 3 GAs - the entries for the first and second GAs have been duplicated - all the obs for entries with sequence 1 or 2 are identical (other than the sequence obs itself)

probs something went wrong with data extract or method for counting kids that appear multiple times

remove for now:

In [7]:
duplicate_ga_entries = ga_data.dgaGASequence == 2
ga_data = ga_data[~duplicate_ga_entries]

duplicate_merged_entries = merge_data.dgaGASequence == 2
merge_data = merge_data[~duplicate_merged_entries]

## Test That Epi / GA data agree with merged data

EPI data is fine (we don't care about age cos we binned that)

In [8]:
epi_test_merge = epi_data.merge(merge_data, on="ChildID", how="left")

problem_idxs = []
for col in epi_test_merge.columns:
    if col[-2:] == "_x":
        x_col = col
        y_col = col[:-1] + "y"

        match_mask = epi_test_merge[x_col] == epi_test_merge[y_col]
        both_na_mask = epi_test_merge[x_col].isna() & epi_test_merge[y_col].isna()

        n_same = sum(match_mask)
        n_both_na =  sum(both_na_mask)
        n_same += n_both_na

        if n_same != len(epi_test_merge):
            print("=" * 80)
            print(f"problem with {col[:-2]}")
            print(f"{n_same - len(epi_test_merge)} columns dont match")
            print()
            problem_idxs += list(
                epi_test_merge[~match_mask & ~both_na_mask].index
            )

problem with AgeMths
-354 columns dont match



GA data not so much:

In [9]:
ga_test_merge = ga_data.merge(merge_data,
                              on=["ChildID", "AgeMths"],
                              how="left")

problem_idxs = {}
for col in ga_test_merge.columns:
    if col[-2:] == "_x":
        x_col = col
        y_col = col[:-1] + "y"

        match_mask = ga_test_merge[x_col] == ga_test_merge[y_col]
        both_na_mask = ga_test_merge[x_col].isna() & ga_test_merge[y_col].isna()

        n_same = sum(match_mask)
        n_both_na =  sum(both_na_mask)
        n_same += n_both_na

        if n_same != len(ga_test_merge):
            print("=" * 80)
            print(f"problem with {col[:-2]}")
            print(f"{n_same - len(ga_test_merge)} columns dont match")
            print()
            problem_idxs[col[:-2]] = list(
                ga_test_merge[~match_mask & ~both_na_mask].ChildID.values
            )

problem with PregnancyID
-6 columns dont match

problem with MotherID
-6 columns dont match

problem with FatherID
-1 columns dont match

problem with dgaWeight
-768 columns dont match

problem with res6mcage
-1075 columns dont match

problem with res6mcbetterstart
-1051 columns dont match



Not too bothered about Mother / Pregnancy / Father ID at mo

* dgaWeight - has been rounded and there appear to have been rounding errors - replace with col from GA data
* res6mcage / res6mcbetterstart - like AgeMths have been overwritten in merged data and are all NaNs - just copy across from ga data where blank

In [10]:
merge_data.drop("dgaWeight", axis=1, inplace=True)
# merge AgeMths col from GA data into merged data
ga_key_cols = ["ChildID", "AgeMths"]
merge_data = merge_data.merge(ga_data[ga_key_cols + ["dgaWeight"]],
                              how="left",
                              on=ga_key_cols)

mc_cols = ["res6mcage", "res6mcbetterstart"]
merge_data = merge_data.merge(ga_data[ga_key_cols + mc_cols],
                              how="left",
                              on=ga_key_cols,
                              suffixes=("", "_ga"))
mcage_is_na = merge_data.res6mcage.isna()
merge_data.loc[mcage_is_na, "res6mcage"] = merge_data[mcage_is_na].res6mcage_ga

bs_is_na = merge_data.res6mcbetterstart.isna()
merge_data.loc[bs_is_na, "res6mcbetterstart"] = (
    merge_data[bs_is_na].res6mcbetterstart_ga
)
merge_data.drop("res6mcbetterstart_ga", axis=1, inplace=True)

# Total extraction columns

Merged data includes 3 columns that aren't in GA data but are definitely part of the GA information:

`totalnumberofprimaryextractions`

`totalnumberofsecondaryextractions`

`Totalnoextractions`

Each pretty self-explanatory. First issue - primary & secondary extractions aren't detailed in ga data, so no way to check where this info comes from / how it's calculated.

Next - primary extractions + secondary extractions != total extractions:

In [11]:
extractions_sum = (merge_data.totalnumberofprimaryextractions +
                   merge_data.totalnumberofpermanentextractions)
n_agreement = sum(merge_data.Totalnoextractions == extractions_sum)

print(f"in {n_agreement} entries primary extractions + secondary "
      "extractions = total extractions")

in 353 entries primary extractions + secondary extractions = total extractions


manually aggregating extractions from tooth level data bears more fruit

In [12]:
tooth_cols = [col for col in merge_data.columns
              if col[:4] in ["dgaU", "dgaL"]]

aggregated_extractions = np.zeros(len(merge_data))

for col in tooth_cols:
    aggregated_extractions += merge_data[col] == "Extracted"

total_agreement = sum(
    merge_data.Totalnoextractions == aggregated_extractions
)
print(f"in {total_agreement} entries aggregated extractions = total extractions")
man_sum_agreement = sum(aggregated_extractions == extractions_sum)
print(f"in {man_sum_agreement} entries aggregated extractions = primary + secondary extractions")

in 353 entries aggregated extractions = total extractions
in 1077 entries aggregated extractions = primary + secondary extractions


seems the calculations for Totalnoextractions have been done incorrectly, but the tooth-level data and the counts of primary and secondary extractions agree.

So, just delete total extractions column and replace with aggregates - primary secondary totals agree otherwise

In [13]:
merge_data["Totalnoextractions"] = aggregated_extractions

# Split Merged Data

Not much use in having GA and Epi datasets merged - split and remove any irrelevant columns:


In [17]:
merge_cols = [col for col in merge_data.columns
              if col not in ga_data.columns
              and col not in epi_data.columns]

is_epi = merge_data.Epi == 1
is_ga = ~merge_data.AgeMths.isna()

ga_data_final = merge_data[is_ga][list(ga_data.columns) + merge_cols]
epi_data_final = merge_data[is_epi][list(epi_data.columns) + merge_cols]

rename_cols = {
    'admincgender': 'gender',
    'agefm_fbqall': 'father_age_at_q',
    'agemm_mbqall': 'mother_age_at_q',
    'ben0mentst': 'on_benefits',
    'bib6a02': 'describe_health_q',
    'bib6b04': 'has_diagnosis_q',
    'bib6b05': 'hospital_admission_q',
    'deminfeth3gpcomb': 'ethnicity',
    'edcont_eal': 'english_additional_lang',
    'edcont_lac': 'looked_after_child',
    'edcont_sen': 'special_ed_needs',
    'edu0fthede': 'father_highest_ed',
    'edu0mumede': 'mother_highest_ed',
    'eth0eth9gp': 'mother_ethnicity',
    'fbqageeduc': 'age_father_complete_ed',
    'fbqcountrybirth': 'father_birthplace',
    'imd_2010_decile_nat': 'imd_2010_decile',
    'mbqlcasep5gp': 'socio_economic_pos',
    'org0agemuk': 'age_mother_moved_uk',
    'org0mmubct': 'mother_birthplace',
    'qad0langua': 'questionaire_language',
    'Totalnoextractions': 'n_extractions',
    'totalnumberofprimaryextractions': 'n_primary_extract',
    'totalnumberofpermanentextractions': 'n_perm_extract',
    'AgeMths': 'age_at_ga',
    'ddsdt': 'decayed_teeth',
    'ddsmt':'missing_teeth',
    'ddsft': 'filled_teeth',
    'ddsdmft': 'dmft',
    'dgaGAType': 'type_of_ga',
    'dgaGATotal':'total_ga',
    'dgaGASequence': 'ga_sequence',
    'dgaWeight': 'weight_at_ga',
    'dgaUR6':'ur6',
    'dgaURE':'urE',
    'dgaURD':'urd',
    'dgaURC':'urc',
    'dgaURB':'urb',
    'dgaURA':'ura',
    'dgaULA':'ula',
    'dgaULB':'ulb',
    'dgaULC':'ulc',
    'dgaULD':'uld',
    'dgaULE':'ule',
    'dgaUL6':'ul6',
    'dgaLL6':'ll6',
    'dgaLLE':'lle',
    'dgaLLD':'lld',
    'dgaLLC':'llc',
    'dgaLLB':'llb',
    'dgaLLA':'lla',
    'dgaLRA':'lra',
    'dgaLRB':'lrb',
    'dgaLRC':'lrc',
    'dgaLRD':'lrd',
    'dgaLRE':'lre',
    'dgaLR6':'lr6',
    'dgaUR4':'ur4',
    'dgaUR1':'ur1',
    'dgaUL1':'ul1',
    'dgaUL4':'ul4',
    'dgaLL5':'ll5',
    'dgaLL4':'ll4',
    'dgaLL1':'ll1',
    'dgaLR1':'lr1',
    'dgaLR4':'lr4',
    'dgaLR5':'lr5'
}
ga_drop_cols = ["GATx", "Epi"]
epi_drop_cols = ga_drop_cols + ["age_at_ga"]

epi_data_final.rename(rename_cols, axis=1, inplace=True)
ga_data_final.rename(rename_cols, axis=1, inplace=True)


In [18]:
epi_data_final

Unnamed: 0,ChildID,PregnancyID,MotherID,FatherID,age_at_ga,decayed_teeth,missing_teeth,filled_teeth,dmft,ddscareindex,res6mcage,res6mcbetterstart,gender,father_age_at_q,mother_age_at_q,on_benefits,describe_health_q,has_diagnosis_q,hospital_admission_q,ethnicity,english_additional_lang,looked_after_child,special_ed_needs,father_highest_ed,mother_highest_ed,mother_ethnicity,age_father_complete_ed,father_birthplace,has_bib1kc,imd_2010_decile,socio_economic_pos,age_mother_moved_uk,mother_birthplace,questionaire_language,GATx,Epi,n_extractions,n_primary_extract,n_perm_extract,res6mcage_ga
200,19222,12930P2,2846.0,32339.0,94.0,1.0,0.0,0.0,1.0,0.0,66.0,No,Male,339.0,271.0,Yes,,,,White British,No,No,SEN Support,5 GCSE equivalent,5 GCSE equivalent,White British,16 or under,United Kingdom,,1.0,Employed no access to money,,England,English,1.0,1.0,9.0,9.0,0.0,78.0
201,21558,17527P1,7256.0,31664.0,91.0,0.0,0.0,2.0,2.0,1.0,66.0,No,Female,272.0,269.0,No,,,,Pakistani,Yes,No,SEN Support,5 GCSE equivalent,<5 GCSE equivalent,Pakistani,17-19,United Kingdom,,5.0,Benefits but coping,19.0,Pakistan,Urdu,1.0,1.0,8.0,6.0,2.0,78.0
202,22693,19462P1,8916.0,,56.0,0.0,8.0,0.0,8.0,0.0,66.0,No,Female,,,,,,,Pakistani,Yes,No,Not SEN,,,,,,,,,,,,1.0,1.0,8.0,8.0,0.0,54.0
203,25537,22516P1,11613.0,,39.0,0.0,16.0,0.0,16.0,0.0,60.0,No,Male,,494.0,Yes,,,,Pakistani,Yes,No,Not SEN,Higher than A-level,5 GCSE equivalent,Pakistani,,,,1.0,Employed no access to money,4.0,Pakistan,English,1.0,1.0,17.0,16.0,1.0,36.0
1038,18794,15935P1,5745.0,,72.0,0.0,0.0,0.0,0.0,0.0,66.0,Yes,Female,,367.0,Yes,Very good,No,No,Pakistani,No,No,Not SEN,<5 GCSE equivalent,<5 GCSE equivalent,Pakistani,,,has,1.0,Benefits but coping,,Pakistan,English,1.0,1.0,9.0,9.0,0.0,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400,19117,18712P1,8322.0,31892.0,,10.0,0.0,0.0,10.0,0.0,66.0,Yes,Male,269.0,,,,,,Other,Yes,No,Not SEN,,,,20 or over,Other,,,,,,,2.0,1.0,0.0,,,
1401,23634,19030P1,8582.0,,,7.0,2.0,1.0,10.0,0.0,66.0,Yes,Male,,,,,,,Pakistani,No,No,Not SEN,,,,,,,,,,,,2.0,1.0,0.0,,,
1402,23747,41018P1,12417.0,,,10.0,0.0,0.0,10.0,0.0,66.0,No,Male,,,,,,,Pakistani,No,No,Not SEN,,,,,,,,,,,,2.0,1.0,0.0,,,
1403,26372,22185P1,11302.0,,,0.0,10.0,0.0,10.0,0.0,60.0,No,Male,,241.0,Yes,,,,White British,No,No,SEN Support,Don't know,<5 GCSE equivalent,White British,,,,1.0,Benefits but coping,,England,English,2.0,1.0,0.0,,,


In [21]:
ga_data_final.has_bib1kc.value_counts()

has    140
Name: has_bib1kc, dtype: int64