# Causal Learning

## Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from causalnex.discretiser import Discretiser
from causalnex.structure import DAGRegressor
from causalnex.inference import InferenceEngine
from causalnex.network import BayesianNetwork
from causalnex.network.sklearn import BayesianNetworkClassifier
from causalnex.structure.notears import from_pandas
from causalnex.utils.network_utils import get_markov_blanket
from causalnex.structure.notears import from_pandas, from_pandas_lasso
from causalnex.discretiser.discretiser_strategy import ( DecisionTreeSupervisedDiscretiserMethod )

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score

ModuleNotFoundError: No module named 'causalnex'

In [None]:
sns.set()
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# `os` and `sys` are not imported by the import cell at the top of this
# notebook, so import them here to keep the cell runnable on a fresh kernel.
import os
import sys

# Make the project helper modules in ../scripts importable.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
if scripts_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(scripts_dir)

In [None]:
from file_handler import FileHandler
import vis_seaborn as vs
from eda_helper import Helper
from df_overview import DfOverview
from ScalerNormalizer import ScalerNormalizer

In [None]:
# Instantiate the project helpers imported from the scripts directory.
helper = Helper()  # NOTE(review): not referenced again in the visible cells
sn = ScalerNormalizer()  # used below for scale_and_normalize

## Data reading

In [None]:
# Read the dataset through the project's FileHandler and preview the first rows.
fh = FileHandler()
df = fh.read_csv("../data/data.csv")
df.head(5)

In [None]:
# Target column first, followed by the candidate feature columns.
# The order matters: later cells select columns by position.
feat = [
    'diagnosis',
    'perimeter_worst',
    'area_worst',
    'radius_worst',
    'concave points_worst',
    'concave points_mean',
    'perimeter_mean',
    'area_mean',
    'radius_mean',
    'area_se',
    'concavity_mean',
    'concavity_worst',
    'perimeter_se',
    'radius_se',
    'compactness_worst',
    'compactness_mean',
    'texture_worst',
    'concave points_se',
    'smoothness_worst',
    'texture_mean',
    'symmetry_worst',
    'concavity_se',
    'smoothness_mean',
    'symmetry_mean',
    'compactness_se',
    'fractal_dimension_worst',
    'fractal_dimension_se',
    'texture_se',
    'fractal_dimension_mean',
    'symmetry_se',
    'smoothness_se',
]

In [None]:
# Keep the target plus the first 28 listed features. Copy explicitly so the
# encoding below writes to an independent frame, not to a slice of `df`.
x = df[feat[:29]].copy()
# Encode the target: "M" -> 1, anything else -> 0.
x['diagnosis'] = (x['diagnosis'] == "M").astype(int)
x.head(5)

In [None]:
# Number of feature columns (everything except the leading `diagnosis` column).
num_feat = x.shape[1]-1
n_cols = 4
# add_subplot needs an integer row count; round up so a feature count that is
# not a multiple of n_cols still gets a slot for every feature.
n_rows = (num_feat + n_cols - 1) // n_cols
fig = plt.figure(figsize=(25, num_feat))
fig.subplots_adjust(hspace=.5)
axs = [fig.add_subplot(n_rows, n_cols, i + 1) for i in range(num_feat)]

# Overlay the diagnosis == 1 ("M") and diagnosis == 0 ("B") distributions
# for each feature on its own axes.
for i, column in enumerate(x.columns[1:]):
    sns.distplot(x[x.diagnosis == 1][column], ax=axs[i], color='darkorange', label="M")
    sns.distplot(x[x.diagnosis == 0][column], ax=axs[i], color='darkgreen', label="B")
    axs[i].set_title('Distribution for {}'.format( column), fontweight='bold', fontsize=15)
    axs[i].legend(fontsize=20)
plt.show()

## Normalize the data

In [None]:
# Everything except the first column (the `diagnosis` target).
features = x.iloc[:, 1:]
features

In [None]:
# scale_and_normalize is a project helper; presumably it returns a scaled and
# normalized frame for the given columns — TODO confirm against ScalerNormalizer.
normal_data = sn.scale_and_normalize(features, features.columns.to_list())
# Put the target (taken from `x`, not rescaled here) back as the first column.
normal_data.insert(loc=0, column='diagnosis', value=x['diagnosis'])
normal_data

In [None]:
%%time
# Learn a structure from the first 9 columns (target + 8 features);
# `diagnosis` is passed as a tabu parent node.
sm = from_pandas(normal_data.iloc[:, :9], tabu_parent_nodes=['diagnosis'],)

In [None]:
# vis_sm comes from the project's vis_seaborn module; presumably it renders
# the structure model — TODO confirm.
vs.vis_sm(sm)

In [None]:
# Prune edges below the 0.8 threshold (mutates `sm` in place), then redraw.
sm.remove_edges_below_threshold(0.8)
vs.vis_sm(sm)

## Stability of the causal graph


In [None]:
# First 10 columns (target + 9 features) of the normalized data, used for
# every stability run below.
x_selected = normal_data.iloc[:, :10]

### 50%

In [None]:
# Take the first 50% of the rows (head, not a random sample).
portion = int(x_selected.shape[0] *.5)
x_portion = x_selected.head(portion)

In [None]:
%%time
# Structure learned on the 50% subset: prune edges below 0.8, keep the
# largest subgraph, then draw it.
sm1 = from_pandas(x_portion, tabu_parent_nodes=['diagnosis'],)
sm1.remove_edges_below_threshold(0.8)
sm1 = sm1.get_largest_subgraph()
vs.vis_sm(sm1)

In [None]:
# Edges kept in the 50% graph.
print(sm1.edges)

### 60%

In [None]:
# Take the first 60% of the rows (head, not a random sample).
portion = int(x_selected.shape[0]*.6)
x_portion = x_selected.head(portion)

In [None]:
%%time
# Structure learned on the 60% subset: prune edges below 0.8, keep the
# largest subgraph, then draw it.
sm2 = from_pandas(x_portion, tabu_parent_nodes=['diagnosis'],)
sm2.remove_edges_below_threshold(0.8)
sm2 = sm2.get_largest_subgraph()
vs.vis_sm(sm2)

In [None]:
def jaccard_similarity(g, h):
    """Return the Jaccard similarity of two collections, rounded to 3 decimals.

    Both inputs are converted to sets first, so repeated items do not inflate
    the union size.

    Args:
        g: Iterable of hashable items (e.g. the edges of a graph).
        h: Iterable of hashable items to compare against ``g``.

    Returns:
        ``|g ∩ h| / |g ∪ h|`` rounded to three decimals. Two empty
        collections are treated as identical and yield ``1.0``.
    """
    g_set, h_set = set(g), set(h)
    union = g_set | h_set
    if not union:
        # Both inputs are empty: avoid dividing by zero.
        return 1.0
    return round(len(g_set & h_set) / len(union), 3)

In [None]:
# Edge overlap between the 50% and 60% graphs.
jaccard_similarity(sm1.edges, sm2.edges)

### 70%

In [None]:
# Take the first 70% of the rows (head, not a random sample).
portion = int(x_selected.shape[0]*.7)
x_portion = x_selected.head(portion)

In [None]:
%%time
# Structure learned on the 70% subset: prune edges below 0.8, keep the
# largest subgraph, then draw it.
sm3 = from_pandas(x_portion, tabu_parent_nodes=['diagnosis'],)
sm3.remove_edges_below_threshold(0.8)
sm3 = sm3.get_largest_subgraph()
vs.vis_sm(sm3)

In [None]:
# Edge overlap between the 60% and 70% graphs.
jaccard_similarity(sm2.edges, sm3.edges)

### 80%

In [None]:
# Take the first 80% of the rows (head, not a random sample).
portion = int(x_selected.shape[0] * .8)
x_portion = x_selected.head(portion)

In [None]:
%%time
# Structure learned on the 80% subset: prune edges below 0.8, keep the
# largest subgraph, then draw it.
sm4 = from_pandas(x_portion, tabu_parent_nodes=['diagnosis'],)
sm4.remove_edges_below_threshold(0.8)
sm4 = sm4.get_largest_subgraph()
vs.vis_sm(sm4)

In [None]:
# Edge overlap between the 70% and 80% graphs.
jaccard_similarity(sm3.edges, sm4.edges)

### 90%

In [None]:
# Take the first 90% of the rows (head, not a random sample).
portion = int(x_selected.shape[0] * .9)
x_portion = x_selected.head(portion)

In [None]:
%%time
# Structure learned on the 90% subset: prune edges below 0.8, keep the
# largest subgraph, then draw it.
sm5 = from_pandas(x_portion, tabu_parent_nodes=['diagnosis'],)
sm5.remove_edges_below_threshold(0.8)
sm5 = sm5.get_largest_subgraph()
vs.vis_sm(sm5)

In [None]:
# Causal inference model for breast cancer

In [None]:
# Edge overlap between the 90% and 80% graphs.
jaccard_similarity(sm5.edges, sm4.edges)

### 100%

In [None]:
%%time
# Structure learned on all rows of x_selected. This rebinds `sm`, replacing
# the model learned earlier from the first 9 columns.
sm = from_pandas(x_selected, tabu_parent_nodes=['diagnosis'],)
sm.remove_edges_below_threshold(0.8)
sm = sm.get_largest_subgraph()
vs.vis_sm(sm)

In [None]:
# Edge overlap between the full-data graph and the 90% graph.
jaccard_similarity(sm.edges, sm5.edges)

## Reducing a graph to its Markov Blanket

Now, assume that `diagnosis` is our variable of interest. We do not need every node in the network, only the Markov blanket (MB) of `diagnosis`. To obtain it, we use the `get_markov_blanket` function from causalnex:

In [None]:
# Wrap the full-data structure in a BayesianNetwork and reduce it to the
# Markov blanket of `diagnosis`; then list the edges that remain.
bn = BayesianNetwork(sm)
blanket = get_markov_blanket(bn, 'diagnosis')
edge_list = list(blanket.structure.edges)
edge_list

In [None]:
# Draw the Markov-blanket structure with the project's vis_sm helper.
vs.vis_sm(blanket.structure)

## Discretising Numeric Features

In [None]:
# Rebinds `x_selected`: now the first 10 columns of `x` (not of `normal_data`),
# which is what gets discretised below.
x_selected = x.iloc[:, :10]
x_selected

In [None]:
# DfOverview and view_df are project helpers; presumably this shows per-column
# counts and unique-value statistics before discretising — TODO confirm.
df_overview = DfOverview(x_selected)
overview = df_overview.getOverview()
vs.view_df(overview[["count", "unique_value_count", "unique_percentage"]],
                    ["count", "unique_value_count"])

In [None]:
# Rebinds `features` (previously a DataFrame) to a list of the selected
# column names without the target.
features = list(x_selected.columns.difference(['diagnosis']))
features

In [None]:
# Supervised decision-tree discretiser (mode='single'), depth-limited and seeded.
tree_discretiser = DecisionTreeSupervisedDiscretiserMethod(
    mode='single',
    tree_params={'max_depth': 3, 'random_state': 27},
)
# `diagnosis` is a binary 0/1 class label, so it is declared as a
# non-continuous target instead of a continuous one.
# NOTE(review): the fit uses every row of `x`, including rows that end up in
# the test split later in the notebook.
tree_discretiser.fit(
    feat_names=features,
    dataframe=x,
    target_continuous=False,
    target='diagnosis',
)
tree_discretiser

In [None]:
# Work on a copy so `x_selected` keeps its original values, then replace each
# feature column with its discretised version, one column at a time.
discretised_data = x_selected.copy()
for col in features:
    discretised_data[col] = tree_discretiser.transform(x_selected[[col]])


In [None]:
# Same overview as before, now on the discretised data.
df_overview = DfOverview(discretised_data)
overview = df_overview.getOverview()
vs.view_df(overview[["count", "unique_value_count", "unique_percentage"]],
           ["count", "unique_value_count"])

## Train Test Split

In [None]:
# 80/20 split of the discretised rows with a fixed seed.
train, test = train_test_split(
    discretised_data,
    train_size=0.8,
    test_size=0.2,
    random_state=27,
)

## Bayesian Network

In [None]:
# Build the network on the Markov-blanket structure. Node states are fitted on
# all discretised rows; the CPDs are fitted on the training split only.
bn = BayesianNetwork(blanket.structure)
bn = bn.fit_node_states(discretised_data)
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

In [None]:
# Predict `diagnosis` for the held-out rows and keep the true labels alongside.
pred = bn.predict(test, 'diagnosis')
true = test['diagnosis']
pred

In [None]:
# Report each metric on the test split, one line per score.
metric_reports = (
    ('Recall: {:.2f}', recall_score),
    ('F1: {:.2f} ', f1_score),
    ('Accuracy: {:.2f} ', accuracy_score),
    ('Precision: {:.2f} ', precision_score),
)
for template, metric in metric_reports:
    print(template.format(metric(y_true=true, y_pred=pred)))

## Inference


In [None]:
# Inference engine over the fitted Bayesian network.
ie = InferenceEngine(bn)

What would the outcome be if all people actually had a healthy weight? — If everyone had a healthy weight, there would be fewer positive diagnoses (from 0.42 to 0.30).

In [None]:
# Two scenarios: all three area features observed in state 1 vs. state 2.
# NOTE(review): the states are discretised bin indices — confirm that both 1
# and 2 exist for each of these nodes.
observation_1 = {"area_mean": 1, "area_se": 1, "area_worst": 1}
observation_2 = {"area_mean": 2, "area_se": 2, "area_worst": 2}
marginals = ie.query([observation_1, observation_2])


In [None]:
# Show the `diagnosis` marginal obtained under each observation.
for marginal in marginals:
    print(marginal['diagnosis'])