# Common Code for Experiments

This notebook contains common code that is used in the various causal experiments done in AitiaExplorer.

## Contents

TBD

## References

Portions of the code that deal with selecting the optimal number of clusters are based on the article and code provided at:


https://towardsdatascience.com/gaussian-mixture-model-clusterization-how-to-select-the-number-of-components-clusters-553bef45f6e4

In [2]:
# imports

import os
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from pycausal.pycausal import pycausal
from sklearn import metrics
from sklearn import mixture
from sklearn.model_selection import train_test_split

# Resolve the directory two levels above the current working directory and
# append it to sys.path (only once) so that the `aitia_explorer` import below
# can be resolved -- presumably this is the project root; verify if the
# notebook is moved.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from aitia_explorer.app import App

# stop the warning clutter
# NOTE(review): the 'ignore' filter is installed without a category, so it
# silences every warning for the rest of the kernel session, including ones
# that may indicate real problems.
import warnings
warnings.filterwarnings('ignore')

In [1]:
def select_best(arr, x):
    """
    Return the ``x`` smallest values of ``arr`` in ascending order.

    ``arr`` is indexed with an integer index array, so it must support
    NumPy-style fancy indexing (e.g. a ``numpy.ndarray``). If ``x`` exceeds
    the number of elements, every element is returned.
    """
    ascending_order = np.argsort(arr)
    lowest_positions = ascending_order[:x]
    return arr[lowest_positions]

In [5]:
def get_bic_scores(min, max, data=None):
    """
    Compute BIC scores of Gaussian mixture models over a range of cluster counts.

    For every cluster count ``n`` in ``np.arange(min, max)`` a
    ``GaussianMixture`` is fitted ``max`` times (each fit uses ``n_init=2``).
    The score reported for ``n`` is the mean of the lowest fifth of the BIC
    values obtained (at least one value is always kept), and the reported
    error is the standard deviation of all BIC values for that ``n``.

    Parameters
    ----------
    min : int
        First cluster count to evaluate (inclusive). The name shadows the
        builtin but is kept so existing callers keep working.
    max : int
        End of the cluster-count range (exclusive). It is also used as the
        number of repeated fits per cluster count.
    data : array-like, optional
        Samples to fit the mixtures on. When omitted, the notebook-level
        variable ``X`` is used, which is what the function always did before
        this parameter existed.

    Returns
    -------
    bics : list of float
        One averaged BIC score per cluster count.
    bics_err : list of float
        One standard deviation per cluster count.
    """
    if data is None:
        # Backward compatibility: fall back to the notebook-level samples.
        data = X

    iterations = max
    # Keep at least one value: with fewer than 5 iterations the original
    # int(iterations / 5) was 0, which selected nothing and produced NaN means.
    # The builtin max() is shadowed by the parameter, hence the explicit check.
    n_best = int(iterations / 5)
    if n_best < 1:
        n_best = 1

    bics = []
    bics_err = []
    for n in np.arange(min, max):
        tmp_bic = np.array([
            mixture.GaussianMixture(n, n_init=2).fit(data).bic(data)
            for _ in range(iterations)
        ])
        bics.append(np.mean(select_best(tmp_bic, n_best)))
        bics_err.append(np.std(tmp_bic))
    return bics, bics_err

In [6]:
def plot_bic_aic_scores(bics, bics_err, cluster_counts=None):
    """
    Plot BIC scores with error bars against the number of clusters.

    Only the BIC curve is drawn, although the figure title also mentions AIC.
    Drawing goes through the pyplot interface, i.e. onto the current figure.

    Parameters
    ----------
    bics : sequence of float
        BIC score for each cluster count.
    bics_err : sequence of float
        Error-bar size for each score.
    cluster_counts : sequence of int, optional
        x-axis values, one per score. When omitted, the notebook-level
        variable ``n_clusters`` is used, which is what the function always
        did before this parameter existed.
    """
    if cluster_counts is None:
        # Backward compatibility: fall back to the notebook-level global.
        cluster_counts = n_clusters
    plt.errorbar(cluster_counts, bics, yerr=bics_err, label='BIC')
    plt.title("BIC and AIC Scores", fontsize=20)
    plt.xticks(cluster_counts)
    plt.xlabel("N. of clusters")
    plt.ylabel("Score")
    plt.legend()

In [8]:
def plot_bic_gradient(bics, bics_err, cluster_counts=None):
    """
    Plot the numerical gradient of the BIC scores against the number of clusters.

    The gradient is computed with ``np.gradient(bics)``, which needs at least
    two scores. The error bars are the unmodified ``bics_err`` values, not
    errors propagated through the gradient. Drawing goes through the pyplot
    interface, i.e. onto the current figure.

    Parameters
    ----------
    bics : sequence of float
        BIC score for each cluster count.
    bics_err : sequence of float
        Error-bar size for each score.
    cluster_counts : sequence of int, optional
        x-axis values, one per score. When omitted, the notebook-level
        variable ``n_clusters`` is used, which is what the function always
        did before this parameter existed.
    """
    if cluster_counts is None:
        # Backward compatibility: fall back to the notebook-level global.
        cluster_counts = n_clusters
    plt.errorbar(cluster_counts, np.gradient(bics), yerr=bics_err, label='BIC')
    plt.title("Gradient of BIC Scores", fontsize=20)
    plt.xticks(cluster_counts)
    plt.xlabel("N. of clusters")
    plt.ylabel("grad(BIC)")
    plt.legend()