In [1]:
import sys
sys.path.append('/Users/samrelins/Documents/LIDA/dental_project/src/')

from data_prep import *
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from IPython.display import Image
import pandas as pd
import plotly.express as px


# Location of the input data, relative to the notebook's working directory.
data_dir = "../data/erin_data"

# build_epi_ga_data is not imported by name in this notebook, so it presumably
# arrives through the `from data_prep import *` star import — TODO confirm.
# Its result is unpacked into two objects: ga_data and epi_data.
ga_data, epi_data = build_epi_ga_data(data_dir)
# Summarise the columns, non-null counts and dtypes of ga_data.
ga_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1077 entries, 0 to 1076
Data columns (total 73 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age_at_ga                1077 non-null   float64
 1   type_of_ga               1077 non-null   object 
 2   total_ga                 1077 non-null   float64
 3   ga_sequence              1077 non-null   float64
 4   weight_at_ga             909 non-null    float64
 5   ur6                      1077 non-null   object 
 6   urE                      1077 non-null   object 
 7   urd                      1077 non-null   object 
 8   urc                      1077 non-null   object 
 9   urb                      1077 non-null   object 
 10  ura                      1077 non-null   object 
 11  ula                      1077 non-null   object 
 12  ulb                      1077 non-null   object 
 13  ulc                      1077 non-null   object 
 14  uld                     

### Simple DAG describing the number of extractions

In [None]:
# Prepare the input frame for learning a simple DAG.
dag_features = ['age_at_ga', 'type_of_ga', 'ethnicity', 'special_ed_needs',
                'imd_2010_decile', 'n_extractions']

# Keep only the DAG features and drop every row that has a missing value in
# any of them. Indexing with a list already yields a new frame, so ga_data is
# left untouched.
encoded_data = ga_data[dag_features].dropna()
encoded_data.info()

# Numerically encode the categorical (object-dtype) features as integer
# category codes. The columns are selected through pandas rather than
# `np.dtype("object")`, because numpy is never imported by name in this
# notebook and would only be available if the star import happened to leak it.
object_cols = encoded_data.select_dtypes(include="object").columns
encoded_data = encoded_data.assign(
    **{col: encoded_data[col].astype("category").cat.codes
       for col in object_cols}
)

In [None]:

# Learn a structure model from the integer-encoded frame with the NOTEARS
# implementation imported from causalnex.structure.notears.
# NOTE(review): the categorical columns reach this call as integer category
# codes, so nominal categories are treated as if they were ordered — confirm
# this is acceptable for the analysis.
sm = from_pandas(encoded_data)

# Prune edges below the 0.9 threshold before drawing the graph.
sm.remove_edges_below_threshold(0.9)

# Shared styling for the rendered graph.
plot_kwargs = dict(
    graph_attributes={"scale": "1"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz = plot_structure(sm, **plot_kwargs)

# Render the graph to PNG and show it as the cell output.
Image(viz.draw(format="png"))

In [None]:
# Boolean masks for cumulative age bands on age_at_ga.
# (Presumably months: the same 48 / 96 cut points are labelled
# "under_4" / "over_8" in the discretisation cell — TODO confirm.)
smaller = ga_data["age_at_ga"] < 48
small = ga_data["age_at_ga"] < 96
big = ga_data["age_at_ga"] < 200

# Number of rows in each disjoint band: <48, 48 to <96, 96 to <200.
print(smaller.sum())
print((small & ~smaller).sum())
print((big & ~small & ~smaller).sum())

In [None]:
from causalnex.discretiser import Discretiser

# Start from the DAG features only, remove rows with any missing value, and
# renumber the rows 0..n-1 so positional results can be aligned by index.
discretised_data = (
    ga_data[dag_features]
    .dropna()
    .reset_index(drop=True)
)


In [None]:
def _discretise_fixed(values, split_points, labels):
    """Bucket a numeric column at fixed split points and label the buckets.

    Args:
        values: numeric pandas Series to discretise.
        split_points: split points handed to
            ``Discretiser(method="fixed", numeric_split_points=...)``.
        labels: dict mapping the integer bucket code produced by the
            Discretiser to the label stored in the result.

    Returns:
        A Series of labels carrying the same index as ``values``, so assigning
        it back to the source frame aligns row-for-row even when that frame
        does not have a default 0..n-1 index.
    """
    codes = (Discretiser(method="fixed", numeric_split_points=split_points)
             .transform(values.values))
    return pd.Series(codes, index=values.index).map(labels)


# Bucket code -> label for each discretised feature.
age_map = {0: "under_4", 1: "mid", 2: "over_8"}
ext_map = {0: "less_than_7", 1: "mid", 2: "more_than_10"}
imd_map = {0: "imd_1", 1: "imd_1+"}

# NOTE: this overwrites the numeric columns with string labels, so rerun the
# cell that rebuilds discretised_data before executing this cell a second time.
discretised_data = discretised_data.assign(
    age_at_ga=_discretise_fixed(
        discretised_data.age_at_ga, [48, 96], age_map),
    n_extractions=_discretise_fixed(
        discretised_data.n_extractions, [7, 11], ext_map),
    imd_2010_decile=_discretise_fixed(
        discretised_data.imd_2010_decile, [2], imd_map),
)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the discretised rows for evaluation. random_state is fixed so
# the split, and every fitted probability and metric derived from it, is
# reproducible across kernel restarts.
train, test = train_test_split(discretised_data, train_size=0.8, test_size=0.2,
                               random_state=42)

In [None]:
from causalnex.network import BayesianNetwork
# Build the Bayesian network on the learned structure, register the node
# states from the full discretised frame, then estimate the conditional
# probability distributions from the training rows only.
bn = (
    BayesianNetwork(sm)
    .fit_node_states(discretised_data)
    .fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")
)

In [None]:
# Display the conditional probability table fitted for the n_extractions node.
bn.cpds["n_extractions"]

In [None]:
from causalnex.evaluation import classification_report
# Report how well the fitted network predicts the n_extractions node on the
# held-out test rows.
classification_report(bn, test, "n_extractions")

In [None]:
pred_cols = ['n_extractions_more_than_10', 'n_extractions_less_than_7',
             'n_extractions_mid']

# Remove probability columns left over from a previous run of this cell.
# Without this, re-executing the cell makes the join fail on overlapping
# column names. On a first run there is nothing to drop.
test_features = test.drop(columns=pred_cols, errors="ignore")
test = test_features.join(
    bn.predict_probability(test_features, "n_extractions")
)

# For each observed n_extractions band, take the mean and standard error of
# every predicted-probability column, then reshape to one row per
# (observed band, probability column) with "mean" and "sem" as columns.
preds_df = (test[pred_cols + ["n_extractions"]]
            .groupby("n_extractions").agg(["mean", "sem"])
            .reset_index()
            .melt(id_vars="n_extractions")
            .pivot(index=["n_extractions", "variable_0"],
                   columns="variable_1", values="value")
            .reset_index()
            )

# One line per predicted-probability column, with standard-error bars.
fig = px.line(preds_df,
              x="n_extractions",
              y="mean",
              color="variable_0",
              error_y="sem",
              title="Mean predicted probability by observed n_extractions band",
              labels={"n_extractions": "Observed n_extractions band",
                      "mean": "Mean predicted probability",
                      "variable_0": "Predicted probability column"})
fig.show()

In [None]:
from causalnex.inference import InferenceEngine

# Build an inference engine over the fitted Bayesian network.
ie = InferenceEngine(bn)
# Query with no observations; the result is indexed by node name below.
marginals = ie.query()
# Show the entry for the n_extractions node.
marginals["n_extractions"]

In [None]:
# Query the network conditioned on each ethnicity in turn.
marginals_uk = ie.query({"ethnicity": "White British"})
marginals_pak = ie.query({"ethnicity": "Pakistani"})

# Print each result under a header naming the condition that was actually
# queried (the Pakistani result was previously printed under a
# "White British" header).
for condition, result in (("White British", marginals_uk),
                          ("Pakistani", marginals_pak)):
    print("=" * 80)
    print(f"Marginal n_extractions | {condition}:\n")
    print(result['n_extractions'])

In [None]:
# Query the network conditioned on each ethnicity together with the
# "imd_1" deprivation band.
marginals_uk = ie.query({"ethnicity": "White British",
                         "imd_2010_decile": "imd_1"})
marginals_pak = ie.query({"ethnicity": "Pakistani",
                          "imd_2010_decile": "imd_1"})

# Print each result under a header stating the full condition that was
# queried (both headers previously read "White British" and omitted the
# imd_1 condition).
for condition, result in (("White British, imd_1", marginals_uk),
                          ("Pakistani, imd_1", marginals_pak)):
    print("=" * 80)
    print(f"Marginal n_extractions | {condition}:\n")
    print(result['n_extractions'])