In [41]:
import pandas as pd
import numpy as np
import math
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [58]:
def element_entropy(C):
    """Return the mean element-wise entropy term of every row of ``C``.

    Each entry ``p`` contributes ``-p * log2(p)`` when ``p > 0`` and ``0``
    otherwise (zero, negative and NaN entries all contribute 0). The
    contributions are then averaged -- not summed -- along each row.

    Parameters
    ----------
    C : array-like, 2-D
        Matrix of values; one result is produced per row.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``C.shape[0]``.

    Raises
    ------
    ValueError
        If ``C`` is not two-dimensional.
    """
    # Always work in floating point: allocating the output with the input's
    # dtype would silently truncate the entropy terms for integer matrices.
    P = np.asarray(C, dtype=float)
    if P.ndim != 2:
        raise ValueError('element_entropy expects a 2-D matrix, got {0} dimension(s)'.format(P.ndim))

    E = np.zeros_like(P)
    positive = P > 0  # False for NaN as well, so those entries stay at 0
    # Only take the logarithm where it is defined; this avoids log(0) warnings.
    E[positive] = -P[positive] * np.log2(P[positive])
    return E.mean(axis=1)

In [59]:
# Folder holding the per-mu LPA feature CSVs. File names are appended to it by
# plain string concatenation, so the trailing slash is required.
root_folder = 'LPA_Data/by_mu/'
# Placeholder feature table; populated by the loading loop that follows.
x = pd.DataFrame()

In [60]:
# Load the test and train feature tables for every mu and stack them into `x`.
# The parts are collected in a list and concatenated once, so re-running this
# cell rebuilds `x` from scratch instead of appending a second copy of every
# row, and the frame is not re-copied on each iteration.
feature_parts = []
for mu in [2, 3, 4]:
    test = pd.read_csv(root_folder + 'node_x_test_mu_{0}.csv'.format(mu), index_col=0)
    train = pd.read_csv(root_folder + 'node_x_train_mu_{0}.csv'.format(mu), index_col=0)
    # Row order: test rows first, then train rows, for each mu in turn.
    feature_parts += [test, train]
x = pd.concat(feature_parts)

In [61]:
# Co-association matrices of all 15 LPA graphs, ordered by mu and then by
# graph number within each mu.
co_mat_template = 'Community_Data/LPA/Coassociation/graph_0{0}_mu_0_{1}_coassociation.npy'
co_mat = [
    np.load(co_mat_template.format(graph_id, mu_value))
    for mu_value in (2, 3, 4)
    for graph_id in (1, 2, 3, 4, 5)
]

In [62]:
# Flatten the per-row entropies of every matrix into one list, keeping the
# order of `co_mat`.
node_entropies = []
for mat in co_mat:
    entropies = element_entropy(mat)
    node_entropies.extend(entropies)

In [63]:
# Threshold for the median-based labels: the median over all collected entropies.
median_cutoff = np.median(np.asarray(node_entropies))

In [64]:
# Cluster the entropies into two groups and use the midpoint between the two
# centroids as the threshold for the k-means-based labels.
# KMeans expects a 2-D (n_samples, n_features) array, hence the single column.
entropy_column = np.array(node_entropies).reshape(-1, 1)
# random_state pins the centroid initialisation so the cutoff is reproducible
# under Restart & Run All. n_init is spelled out because scikit-learn's default
# for it has changed between releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(entropy_column)
kmeans_cutoff = np.mean(kmeans.cluster_centers_)

In [65]:
# Binary labels: 0 where the entropy is at or below the cutoff, 1 otherwise.
# The list is converted to an array once so the comparison is explicitly
# element-wise.
entropy_values = np.asarray(node_entropies)
at_or_below_median = entropy_values <= median_cutoff
at_or_below_kmeans = entropy_values <= kmeans_cutoff
median_y = np.where(at_or_below_median, 0, 1)
kmeans_y = np.where(at_or_below_kmeans, 0, 1)

In [66]:
# Row labels 'graph_<mu>_<graph>_node_<i>' for 1000 nodes per graph, ordered by
# mu, then graph number, then node number.
indices = [
    'graph_{0}_{1}_node_{2}'.format(mu_value, graph_id, node_id)
    for mu_value in (2, 3, 4)
    for graph_id in (1, 2, 3, 4, 5)
    for node_id in range(1000)
]

In [67]:
# Wrap each label vector in a single-column 'Stability' frame indexed by the
# node labels.
median_y_df = pd.DataFrame({'Stability': median_y}, index=indices)
kmeans_y_df = pd.DataFrame({'Stability': kmeans_y}, index=indices)

In [68]:
# Stratified 80/20 split on the k-means stability labels, written to LPA_Data/all/.
#
# train_test_split pairs features and labels by row position, not by index
# label. `x` is stacked from per-mu test/train files while the labels are in
# mu/graph/node order, so the features are first reordered to the label index.
# NOTE(review): assumes `x` is indexed by the same 'graph_<mu>_<graph>_node_<i>'
# labels (the per-graph Infomap cell relies on the same convention); a KeyError
# here means it is not.
x_for_kmeans = x.loc[kmeans_y_df.index]
# random_state makes the written split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_for_kmeans, kmeans_y_df, stratify=kmeans_y_df, test_size=0.2, random_state=42)
final_folder = 'LPA_Data/all/'
X_train.to_csv(final_folder + 'node_x_train_kmeans.csv')
X_test.to_csv(final_folder + 'node_x_test_kmeans.csv')
y_train.to_csv(final_folder + 'node_y_train_kmeans.csv')
y_test.to_csv(final_folder + 'node_y_test_kmeans.csv')

In [69]:
# Stratified 80/20 split on the median stability labels, written to LPA_Data/all/.
#
# train_test_split pairs features and labels by row position, not by index
# label. `x` is stacked from per-mu test/train files while the labels are in
# mu/graph/node order, so the features are first reordered to the label index.
# NOTE(review): assumes `x` is indexed by the same 'graph_<mu>_<graph>_node_<i>'
# labels (the per-graph Infomap cell relies on the same convention); a KeyError
# here means it is not.
x_for_median = x.loc[median_y_df.index]
# random_state makes the written split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_for_median, median_y_df, stratify=median_y_df, test_size=0.2, random_state=42)
final_folder = 'LPA_Data/all/'
X_train.to_csv(final_folder + 'node_x_train.csv')
X_test.to_csv(final_folder + 'node_x_test.csv')
y_train.to_csv(final_folder + 'node_y_train.csv')
y_test.to_csv(final_folder + 'node_y_test.csv')

In [78]:
# Rebuild `x` from the per-mu Infomap feature CSVs. File names are appended to
# root_folder by string concatenation, so the trailing slash is required.
root_folder = 'Infomap_Data/by_mu/'
# Collect the parts and concatenate once: growing a frame with concat inside
# the loop re-copies it every iteration, and seeding it with an empty frame
# adds a pointless empty operand to every concat.
feature_parts = []
for mu in [2, 3, 4]:
    test = pd.read_csv(root_folder + 'node_x_test_mu_{0}.csv'.format(mu), index_col=0)
    train = pd.read_csv(root_folder + 'node_x_train_mu_{0}.csv'.format(mu), index_col=0)
    # Row order: test rows first, then train rows, for each mu in turn.
    feature_parts += [test, train]
x = pd.concat(feature_parts)

In [79]:
# Per-graph Infomap labels and splits. For every (mu, graph) pair: compute the
# row entropies of that graph's co-association matrix, derive median-based and
# k-means-based binary 'Stability' labels, and write a stratified 80/20
# train/test split for each label set to Infomap_Data/by_graph/.
final_folder = 'Infomap_Data/by_graph/'
for mu in [2, 3, 4]:
    for graph in [1, 2, 3, 4, 5]:
        co_mat_file = 'Community_Data/Infomap/Coassociation/graph_0{0}_mu_0_{1}_coassociation.npy'.format(graph, mu)
        # The entropies must come from the matrix loaded for THIS graph, not
        # from a `mat` variable left over by an earlier cell. The matrix is
        # kept local rather than appended to the shared `co_mat` list.
        mat = np.load(co_mat_file)
        node_entropies = element_entropy(mat)

        # Two thresholds: the median entropy, and the midpoint between the
        # centroids of a 2-cluster k-means fit.
        median_cutoff = np.median(node_entropies)
        # random_state / n_init are fixed so the cutoff is reproducible.
        kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(
            np.array(node_entropies).reshape(-1, 1))
        kmeans_cutoff = np.mean(kmeans.cluster_centers_)

        # One label per matrix row; sizing by the entropy vector keeps labels
        # and index the same length whatever the matrix size.
        indices = ['graph_{0}_{1}_node_{2}'.format(mu, graph, i) for i in range(len(node_entropies))]
        median_y = np.where(node_entropies <= median_cutoff, 0, 1)
        kmeans_y = np.where(node_entropies <= kmeans_cutoff, 0, 1)
        median_y_df = pd.DataFrame(median_y, index=indices, columns=['Stability'])
        kmeans_y_df = pd.DataFrame(kmeans_y, index=indices, columns=['Stability'])

        # Select this graph's feature rows in label order, because
        # train_test_split pairs X and y by position.
        x_df = x.loc[median_y_df.index]

        # Same split-and-save procedure for both label sets. The k-means files
        # carry a 'kmeans_' tag in their name, the median files carry none.
        for tag, labels in (('kmeans_', kmeans_y_df), ('', median_y_df)):
            X_train, X_test, y_train, y_test = train_test_split(
                x_df, labels, stratify=labels, test_size=0.2, random_state=42)
            X_train.to_csv(final_folder + 'node_x_train_{0}graph_{1}_{2}.csv'.format(tag, mu, graph))
            X_test.to_csv(final_folder + 'node_x_test_{0}graph_{1}_{2}.csv'.format(tag, mu, graph))
            y_train.to_csv(final_folder + 'node_y_train_{0}graph_{1}_{2}.csv'.format(tag, mu, graph))
            y_test.to_csv(final_folder + 'node_y_test_{0}graph_{1}_{2}.csv'.format(tag, mu, graph))