In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import time


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load uci-secom.csv from the Kaggle input directory into the raw frame `df`.
df = pd.read_csv("/kaggle/input/uci-semcom/uci-secom.csv")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Keep every column except the first and the last one.
# NOTE(review): presumably these are an identifier/timestamp and the label column — confirm with df.head().
df_feat = df.iloc[:,1:-1]

In [None]:
# Drop every column that contains at least one missing value.
# Rebinding (instead of inplace=True) avoids mutating a slice of `df`
# in place and keeps the cell safe to re-run.
df_feat = df_feat.dropna(axis=1)

In [None]:
!pip install causalnex

In [None]:
from causalnex.structure.notears import from_pandas
from causalnex.structure import StructureModel

In [None]:
%%time
# Learn a structure model from all remaining feature columns.
# `from_pandas` is the one imported from causalnex.structure.notears; %%time reports the cost.
SM = from_pandas(df_feat)

In [None]:
import networkx as nx

In [None]:
# Draw the full learned graph with networkx; edge width scales with weight magnitude.
plt.figure(figsize=(18,10))
pos = nx.spring_layout(SM, k=60)

# abs(): edge weights may be negative, but a line width must not be.
edge_width = [abs(weight) * 0.3 for _, _, weight in SM.edges(data='weight')]
# Labels are drawn exactly once, here, with the custom font settings.
nx.draw_networkx_labels(SM, pos, font_family="Yu Gothic", font_weight="bold")
nx.draw_networkx(SM,
                 pos,
                 with_labels=False,  # draw_networkx would otherwise add a second, default-styled label layer
                 node_size=4000,
                 arrowsize=20,
                 alpha=0.6,
                 edge_color='b',
                 width=edge_width)

In [None]:
# Work on a copy so the pruning below does not touch SM itself.
COPY_SM = SM.copy()

# Remove edges with a weak causal relationship (below the 0.5 threshold).
COPY_SM.remove_edges_below_threshold(0.5)

# Visualise the pruned graph.
plt.figure(figsize=(18,10))
pos = nx.spring_layout(COPY_SM, k=60)

# abs(): edge weights may be negative, but a line width must not be.
edge_width = [abs(weight) * 0.3 for _, _, weight in COPY_SM.edges(data='weight')]
# Labels are drawn exactly once, here, with the custom font settings.
nx.draw_networkx_labels(COPY_SM, pos, font_family="Yu Gothic", font_weight="bold")
nx.draw_networkx(COPY_SM,
                 pos,
                 with_labels=False,  # draw_networkx would otherwise add a second, default-styled label layer
                 node_size=4000,
                 arrowsize=20,
                 alpha=0.6,
                 edge_color='b',
                 width=edge_width)

# Data test

In [None]:
# Show the feature frame that the structure learning above was run on.
df_feat

In [None]:
import warnings
# Blanket filter: silences every Python warning raised by the cells that follow.
warnings.filterwarnings("ignore")

In [None]:
# Create an empty, hand-built structure model.
# Note the lowercase name: `sm` is separate from the learned model `SM`.
sm = StructureModel()

In [None]:
# Add known relationships by hand: edges from node '20' to nodes '86' and '87'.
sm.add_edges_from([
    ('20', '86'),
    ('20', '87')
])

In [None]:
# Inspect the edges now present in the hand-built model `sm`.
sm.edges

In [None]:
!pip install --upgrade pip

In [None]:
# Install graphviz and pygraphviz without prompting; `pygraphviz` is imported in the next cell.
!conda install -y graphviz pygraphviz

In [None]:
import pygraphviz

In [None]:
from IPython.display import Image
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

In [None]:
# Render the hand-built model `sm` using the WEAK node and edge style presets.
viz = plot_structure(
    sm,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format="png"))

In [None]:
# Show the repr of the learned structure model.
SM

In [None]:
# Render the learned model `SM` as it stands before any pruning of SM itself.
viz = plot_structure(
    SM,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

In [None]:
# Prune weak connections: remove edges below the 0.8 threshold.
# This modifies SM itself (no copy is taken here, unlike the COPY_SM cell).
SM.remove_edges_below_threshold(0.8)

In [None]:
# Render `SM` again, now that edges below 0.8 have been removed.
viz = plot_structure(
    SM,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

In [None]:
%%time
# Re-learn the structure from scratch, forbidding connections that domain knowledge
# says are mistaken (tabu_edges) and passing the 0.8 weight threshold to the learner.
# This rebinds SM, replacing the pruned model from the earlier cells.
SM = from_pandas(df_feat, tabu_edges=[("250", "388"),("251","524")], w_threshold=0.8)

In [None]:
# Render the model re-learned with the tabu edges and weight threshold.
viz = plot_structure(
    SM,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

In [None]:
# Keep only the largest subgraph of the learned model.
# SM is rebound to the result, so re-running this cell operates on the already-reduced model.
SM = SM.get_largest_subgraph()

In [None]:
# Render the largest subgraph; this `viz` is also what the later `circo` cell draws.
viz = plot_structure(
    SM,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

In [None]:
# Create a Bayesian network on top of the current structure model `SM`
# (at this point: the largest subgraph of the tabu-constrained model).
from causalnex.network import BayesianNetwork

bn = BayesianNetwork(SM)

In [None]:
# Bayesian Networks in CausalNex support only discrete distributions,
# so the continuous features have to be converted to discrete values first.
# Keep the feature column labels; the discretisation loop in the next cell iterates over them.

col = df_feat.columns
col

In [None]:
# Discretiser is CausalNex's helper for turning numeric columns into discrete values.
from causalnex.discretiser import Discretiser

df_c = df_feat.copy()

# Split every feature at its own median (the 0.5 quantile) with one fixed split point.
for name in col:
    median = df_c[name].quantile(0.5)
    splitter = Discretiser(method="fixed", numeric_split_points=[median])
    df_c[name] = splitter.transform(df_c[name].values)

In [None]:
# Show the discretised copy of the features.
df_c

In [None]:
# Show the original continuous features for comparison with df_c.
df_feat

In [None]:
# Sanity check for column "86": continuous value (x) against its discretised value (y).
plt.scatter(df_feat["86"], df_c["86"])

In [None]:
from sklearn.model_selection import train_test_split

# Split the discretised rows 90% / 10% into train and test; random_state=7 makes the split repeatable.
train, test = train_test_split(df_c, train_size=0.9, test_size=0.1, random_state=7)

In [None]:
# Step 1: specify all of the states that each node can take.
# The full discretised frame `df_c` is passed here, not only the `train` split.
bn = bn.fit_node_states(df_c)

In [None]:
%%time
# Step 2: fit the conditional probability distributions on the training split only,
# using the "BayesianEstimator" method with a "K2" prior.
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

In [None]:
# Show the conditional probability table learned for node "527".
# NOTE(review): the original note ("upper: condition, lower: conditional probability")
# presumably describes the table layout — conditions on top, probabilities below; confirm against the output.
bn.cpds["527"]

In [None]:
# Show the conditional probability table learned for node "390".
bn.cpds["390"]

In [None]:
# Predict node "527" for the held-out test rows.
predictions = bn.predict(test, "527")

In [None]:
# Show the predicted values for node "527".
predictions

In [None]:
# Show the actual discretised values of "527" in the test split, to compare with the predictions.
test["527"]

In [None]:
# Classification report for predicting node "527" on the held-out test split.
from causalnex.evaluation import classification_report
classification_report(bn, test, "527")

In [None]:
from causalnex.evaluation import roc_auc
# roc_auc returns a pair, unpacked here as (roc, auc), for node "527" on the test split;
# only the AUC is printed.
roc, auc = roc_auc(bn, test, "527")
print(auc)

# various images

In [None]:
# Re-draw the current `viz` with the `circo` program, which produces a circular layout.
circo_png = viz.draw(format="png", prog="circo")
Image(circo_png, width=500)

In [None]:
def get_tree(k_list):
    """
    Build the edge list of a layered tree.

    ``k_list[d]`` is the number of children given to every node on level ``d``.
    Nodes are named ``A{level}_{index}``; the single root is ``A0_0``.

    Returns a list of ``(parent_name, child_name)`` tuples, ordered level by level.
    """
    edges = []
    parents = 1  # number of nodes on the current level; the root level has one
    for depth, fanout in enumerate(k_list):
        children = fanout * parents
        for child in range(children):
            # Consecutive runs of `fanout` children share the same parent.
            edges.append((f"A{depth}_{child // fanout}", f"A{depth + 1}_{child}"))
        parents = children
    return edges


# Each entry: (layout program name, description to print, children-per-level list for the demo tree).
layouts = [
    ("dot","Order nodes hierarchly. Great to spot the dependencies of a causal network.",[2, 3, 3]),
    ("neato", "Spring model. Great default tool if the graph is not too large", [2, 2, 4, 3, 2]),
    ("sfdp", "A different style of spring model", [2, 2, 4, 3, 2]),
    ("twopi", "Radial layout", [2, 2, 5, 3]),
]

# Draw one synthetic demo tree per layout program.
# The rendering is bound to `tree_viz`, not `viz`, so the notebook-level `viz`
# (the rendering of SM that other cells draw from) is not overwritten by a demo tree.
for layout, description, k_list in layouts:
    g_tree = StructureModel(get_tree(k_list))
    tree_viz = plot_structure(g_tree)

    print(f"{layout}: {description}")
    image_binary = tree_viz.draw(format="png", prog=layout)
    display(Image(image_binary, width=500))

In [None]:
# Build a fully customised rendering of SM (graph, node and edge attributes),
# write it to a JPEG file and display that file.
graph_attributes = {
    "splines": "spline",  # Use splines so that edges do not overlap
    "ordering": "out",
    "ratio": "fill",  # This is necessary to control the size of the image
    "size": "16,9!",  # Set the size of the final image. (this is a typical presentation size)
    "label": "The structure of our\n \t Insurance model",
    "fontcolor": "#FFFFFFD9",
    "fontname": "Helvetica",
    "fontsize": 100,
    "labeljust": "l",
    "labelloc": "t",
    "pad": "1,1",
    "dpi": 200,
    "nodesep": 0.8,
    "ranksep": ".5 equally",
}

# Making all nodes hexagonal with black coloring
node_attributes = {
    node: {
        "shape": "hexagon",
        "width": 2.2,
        "height": 2,
        "fillcolor": "#000000",
        "penwidth": "10",
        "color": "#4a90e2d9",
        "fontsize": 35,
        "labelloc": "c",
    }
    for node in SM.nodes
}

# Split CamelCase-style names in two lines at their last uppercase letter.
# Names without any uppercase letter (e.g. purely numeric ones such as "527")
# are used unchanged as the label; indexing [-1] on the empty list would
# otherwise raise IndexError.
for node in SM.nodes:
    name = str(node)
    upper_positions = [i for i, ch in enumerate(name) if ch.isupper()]
    if upper_positions:
        up_idx = upper_positions[-1]
        node_attributes[node]["label"] = name[:up_idx] + "\n" + name[up_idx:]
    else:
        node_attributes[node]["label"] = name

# Target nodes (ones with "Cost" in the name) are colored differently
for node in SM.nodes:
    if "Cost" in str(node):  # Nodes with "Cost" in the name get an orange fill.
        node_attributes[node]["fillcolor"] = "#DF5F00"

# Customising edges
# NOTE(review): these formulas look tuned for weights in [0, 1]; a weight >= 1
# gives a non-positive arrowsize and a negative weight gives a negative
# "weight"/penwidth term — confirm against the actual weights in SM.
edge_attributes = {
    (u, v): {
        "penwidth": w * 20 + 2,  # Setting edge thickness
        "weight": int(5 * w),  # Higher "weight"s mean shorter edges
        "arrowsize": 2 - 2.0 * w,  # Avoid too large arrows
        "arrowtail": "dot",
    }
    for u, v, w in SM.edges(data="weight")
}


viz = plot_structure(
    SM,
    prog="dot",
    graph_attributes=graph_attributes,
    node_attributes=node_attributes,
    edge_attributes=edge_attributes,
)
f = "supporting_files/final_insurance_model.jpg"
# No visible cell creates the output directory, so make sure it exists before writing.
os.makedirs(os.path.dirname(f), exist_ok=True)
viz.draw(f)
Image(f)