# Initialization

Import packages and setup

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_classif, r_regression, chi2
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.neighbors import KNeighborsClassifier

import cdt
# cdt.SETTINGS.rpath = r'C:\Program Files\R\R-4.2.2\bin\Rscript'
from cdt.causality.graph import CGNN

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (deprecations, dtype
# conversions, convergence) — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Set the random seed for reproducibility.
# NOTE(review): only NumPy's global RNG is seeded here; if CGNN draws its
# randomness from another library it needs its own seed — confirm.
seed = 0
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm
No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.


Load dataset

In [2]:
def load_data(url):
    """Read a CSV file and separate the churn label from the other columns.

    Parameters
    ----------
    url : str or file-like
        Location handed to ``pandas.read_csv``; the first CSV column is used
        as the row index.

    Returns
    -------
    tuple of DataFrame
        ``(df, X, y)`` where ``df`` is the full table, ``X`` holds every
        column except ``'Churn Value'`` and ``y`` holds only the
        ``'Churn Value'`` column (still as a DataFrame, not a Series).
    """
    df = pd.read_csv(url, index_col=0)
    # One boolean mask drives both selections so they can never disagree.
    is_target = df.columns == 'Churn Value'
    X = df.loc[:, ~is_target]
    y = df.loc[:, is_target]
    return df, X, y

# Locations of the train / validation / test splits.
TRAIN_URL = 'https://raw.githubusercontent.com/prat-man/CSE-572-Data-Mining/main/data/train_data.csv'
VAL_URL = 'https://raw.githubusercontent.com/prat-man/CSE-572-Data-Mining/main/data/val_data.csv'
TEST_URL = 'https://raw.githubusercontent.com/prat-man/CSE-572-Data-Mining/main/data/test_data.csv'

# Each split yields the full frame plus its feature / label views.
df_train, X_train, y_train = load_data(TRAIN_URL)
df_val, X_val, y_val = load_data(VAL_URL)
df_test, X_test, y_test = load_data(TEST_URL)

# Causal Discovery

Define features and outcomes

In [3]:
# Column names are used later as the node labels looked up in the graph.
features = list(X_train.columns)
outcomes = list(y_train.columns)

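Discover causal structures with CGNN. This step is computationally expensive (the saved run below was interrupted), and the cells that use `model_graph` cannot run until it completes.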

In [4]:
# Fit cdt's CGNN on the training frame, using `skeleton` as the starting graph.
# NOTE(review): the skeleton is built from the covariance matrix, so presumably
# every column pair with non-zero covariance (self-pairs included) becomes a
# candidate edge — confirm this near-complete skeleton is intended.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so
# `model_graph` was never produced in the recorded session.
model = CGNN()
skeleton = nx.Graph(df_train.cov())
model_graph = model.predict(df_train, skeleton)

KeyboardInterrupt: 

Detect relevant nodes and edges

In [None]:
# Every (feature, outcome) pair that the discovered graph links with an edge.
relevant_edges = {
    (feature, outcome)
    for feature in features
    for outcome in outcomes
    if model_graph.has_edge(feature, outcome)
}

# Features are the edge sources; relevant nodes are both endpoints.
relevant_features = {source for source, _ in relevant_edges}
relevant_nodes = relevant_features | {target for _, target in relevant_edges}

for feature in sorted(relevant_features):
    print(feature)

Draw the causal graph

In [None]:
# Draw the discovered graph with nodes arranged in shells, listed in the
# order: outcomes, features linked to an outcome, remaining features.
fig, ax = plt.subplots(figsize=(10, 8))
for side in ['right', 'top', 'bottom', 'left']:
    ax.spines[side].set_visible(False)

# Set iteration order is arbitrary, so build each shell as an ordered list;
# otherwise node positions could change from one kernel session to the next.
shell_relevant = sorted(relevant_features)
shell_other = [name for name in features if name not in relevant_features]
pos = nx.shell_layout(model_graph, [outcomes, shell_relevant, shell_other])

degrees = model_graph.degree()
node_colors = ['pink' if node in outcomes else 'bisque' if node in relevant_features else 'oldlace' for node in model_graph.nodes]
# Scale markers by degree; isolated nodes still get a visible minimum size.
node_sizes = [max(degree, 1) * 100 for _, degree in degrees]
node_edge_colors = ['lightcoral' if node in relevant_nodes else 'gainsboro' for node in model_graph.nodes]
edge_colors = ['lightcoral' if edge in relevant_edges else 'gainsboro' for edge in model_graph.edges]

nx.draw_networkx(model_graph,
                 pos=pos,
                 ax=ax,
                 connectionstyle="arc3,rad=0.1",
                 node_color=node_colors,
                 node_size=node_sizes,
                 edgecolors=node_edge_colors,
                 edge_color=edge_colors,
                 font_size=10,
                 font_color='dimgray',
                 arrowsize=15,
                 clip_on=False)

plt.tight_layout()
# To export the figure, call fig.savefig('graph.png', dpi=300) before plt.show().
plt.show()

# Evaluation

In [None]:
def evaluate(clf, x, y):
    """Print the accuracy and the per-class report of a fitted classifier.

    Parameters
    ----------
    clf : estimator
        Fitted object exposing ``predict``.
    x : array-like
        Samples to classify.
    y : array-like
        True labels for ``x``.

    Returns
    -------
    None
        The results are written to standard output only.
    """
    predictions = clf.predict(x)

    accuracy = accuracy_score(y, predictions)
    print(f'Overall accuracy: {accuracy}\n')

    # zero_division=0 reports 0 for classes that were never predicted.
    report = classification_report(y, predictions, zero_division=0)
    print(report)

In [None]:
def evaluateKNN(x_train, x_val, x_test, k_values=range(5, 41, 5)):
    """Tune ``n_neighbors`` on the validation split, then report test metrics.

    For every candidate in ``k_values`` a KNN classifier is fitted on
    ``x_train`` and scored by log loss on ``x_val``. The losses are plotted,
    the candidate with the lowest loss is refitted and passed to ``evaluate``
    together with ``x_test``.

    The labels are not parameters: the function reads the module-level
    ``y_train``, ``y_val`` and ``y_test``.

    Parameters
    ----------
    x_train, x_val, x_test : array-like
        Feature matrices for the three splits, row-aligned with the
        module-level label frames.
    k_values : iterable of int, optional
        Candidate neighbour counts. Defaults to 5, 10, ..., 40.

    Raises
    ------
    ValueError
        If ``k_values`` is empty.
    """
    k_values = list(k_values)
    if not k_values:
        raise ValueError('k_values must contain at least one candidate')

    # The labels are single-column frames; flatten them to 1-D arrays, which
    # is the shape the estimator and the metric expect.
    train_labels = np.ravel(y_train)
    val_labels = np.ravel(y_val)

    # hyperparameter tuning
    loss = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, train_labels)
        # log_loss is defined on predicted probabilities, not on hard labels.
        val_proba = knn.predict_proba(x_val)
        loss.append(log_loss(val_labels, val_proba, labels=knn.classes_))

    # plot
    fig, ax = plt.subplots(figsize=(5, 3))
    ax.plot(k_values, loss)
    ax.set_xticks(k_values)
    ax.set_xlabel('n_neighbors')
    ax.set_ylabel('Validation log loss')
    plt.show()

    # find optimal neighbors: read the winner straight from the grid instead
    # of reconstructing it from its position.
    n_neighbors = k_values[int(np.argmin(loss))]
    print(f'\nOptimal neighbors: {n_neighbors}\n')

    # test
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x_train, train_labels)
    evaluate(knn, x_test, y_test)

## All Features

In [None]:
# Baseline: tune and score KNN using every feature column.
evaluateKNN(X_train, X_val, X_test)

## Causality Based Features

In [None]:
# relevant_features is a set, whose iteration order is arbitrary; sort it so
# the three subsets share one reproducible column order.
causal_columns = sorted(relevant_features)
# Fail here, with a clear message, rather than later inside the classifier.
assert causal_columns, 'no feature has an edge to an outcome in model_graph'

X_train_causal = X_train[causal_columns]
X_val_causal = X_val[causal_columns]
X_test_causal = X_test[causal_columns]

In [None]:
# Tune and score KNN using only the features linked to an outcome in the graph.
evaluateKNN(X_train_causal, X_val_causal, X_test_causal)

## ANOVA

In [None]:
# Keep the six features with the highest ANOVA F-scores. The label frame has
# a single column, so flatten it to the 1-D shape the selector expects.
anova = SelectKBest(f_classif, k=6).fit(X_train, np.ravel(y_train))

# Ask the selector which columns it kept. Re-deriving them with an ascending
# argsort of the scores would list the six lowest-scoring features instead.
anova_features = X_train.columns[anova.get_support()].values

for feature in sorted(anova_features):
    print(feature)

In [None]:
# Project each split onto the columns kept by the fitted ANOVA selector.
X_train_anova, X_val_anova, X_test_anova = (
    anova.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Tune and score KNN using the ANOVA-selected features.
evaluateKNN(X_train_anova, X_val_anova, X_test_anova)

## Pearson

In [None]:
def _abs_pearson(X, y):
    """Score each feature by the magnitude of its Pearson correlation with y."""
    return np.abs(r_regression(X, y))


# r_regression is signed, and SelectKBest keeps the highest scores, so raw r
# would rank strongly negative correlations last. Score by |r| so association
# strength decides, whatever its sign.
pearson = SelectKBest(_abs_pearson, k=6).fit(X_train, np.ravel(y_train))

# Ask the selector which columns it kept. An ascending argsort of the scores
# would list the six weakest features instead.
pearson_features = X_train.columns[pearson.get_support()].values

for feature in sorted(pearson_features):
    print(feature)

In [None]:
# Project each split onto the columns kept by the fitted Pearson selector.
X_train_pearson, X_val_pearson, X_test_pearson = (
    pearson.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Tune and score KNN using the Pearson-selected features.
evaluateKNN(X_train_pearson, X_val_pearson, X_test_pearson)

## PCA

In [None]:
# Fit a 6-component PCA on the training features only; the validation and test
# splits are projected with this same fitted transform in the next cell.
# NOTE(review): the features are passed in as loaded, with no scaling step
# visible in this notebook — confirm they are already on comparable scales.
pca = PCA(n_components=6).fit(X_train)

In [None]:
# Project each split onto the principal components learned from training data.
X_train_pca, X_val_pca, X_test_pca = (
    pca.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Tune and score KNN using the PCA-projected features.
evaluateKNN(X_train_pca, X_val_pca, X_test_pca)