In [1]:
import pandas as pd
import numpy as np
import pickle
import glob
import scipy.stats as st
from scipy.special import softmax
import datetime
from tslearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, confusion_matrix
from tqdm import tqdm

In [2]:
import pickle

def save_obj(obj, name):
    """Pickle ``obj`` to disk.

    The target file is ``name`` with the suffix ``'.pkl'`` appended, so
    ``name`` should be given without that extension. The highest pickle
    protocol available is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load a pickled object from disk.

    ``name`` is tried as given first. If no such file exists and ``name``
    does not already end in ``'.pkl'``, ``name + '.pkl'`` is tried as well,
    so a base name previously passed to ``save_obj`` (which appends
    ``'.pkl'``) can be reused here unchanged.

    Raises:
        FileNotFoundError: if neither candidate file exists.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    try:
        handle = open(name, 'rb')
    except FileNotFoundError:
        if str(name).endswith('.pkl'):
            raise
        # Fall back to the file name that save_obj() would have written.
        handle = open(str(name) + '.pkl', 'rb')
    with handle:
        return pickle.load(handle)

In [3]:
# Load the pickled topology table. It is later passed to
# THRESDESCEND.validation(), which merges it on the 'node_1'/'node_2' columns.
df_topos = load_obj('df_topos.pkl')

# Load the pickled PM time-series table. The next cell reads its 'node',
# 'fac' and 'pm' columns and treats the columns from position 4 onward as
# the series samples.
df_pmvalues = load_obj('df_pmvalues.pkl')

In [4]:
# Build one record per row of df_pmvalues: the identifying fields plus the
# series both as stored ('raw') and z-score normalised ('z_score').
list_ts = []
for i in range(df_pmvalues.shape[0]):
    # Columns from position 4 onward hold the time-series samples.
    series = df_pmvalues.iloc[i][4:]
    z_score = st.zscore(np.asarray(series, dtype='float32'))
    list_ts.append(dict(
        node=df_pmvalues['node'].iloc[i],
        fac=df_pmvalues['fac'].iloc[i],
        pm=df_pmvalues['pm'].iloc[i],
        raw=np.asarray(series),
        z_score=z_score,
    ))
df_ts = pd.DataFrame(list_ts)
df_ts

Unnamed: 0,node,fac,pm,raw,z_score
0,000-5acae-94_1_2_6,AMP,OPIN-OTS,"[1.6000000238418581, 1.5, 1.7999999523162842, ...","[-0.94524956, -1.31824, -0.19926906, -0.199269..."
1,000-5acae-94_1_2_6,AMP,OPOUT-OTS,"[17.5, 17.700000762939453, 17.600000381469727,...","[-0.7392669, 0.04285886, -0.34820402, -1.52139..."
2,000-5acae-94_1_2_8,AMP,OPIN-OTS,"[5.699999809265138, 5.5, 5.599999904632568, 5....","[-0.6214219, -1.2086554, -0.91503865, -1.20865..."
3,000-5acae-94_1_2_8,AMP,OPOUT-OTS,"[17.100000381469727, 16.899999618530273, 16.89...","[-0.19719592, -0.8827008, -0.8827008, -0.88270..."
4,000-5acae-94_2_2_6,AMP,OPIN-OTS,"[-2.200000047683716, -2.0, -1.8999999761581419...","[-0.37543544, 0.59471846, 1.0797955, 0.5947184..."
...,...,...,...,...,...
1805,ffc-d8aec-b9_1_2_8,AMP,OPOUT-OTS,"[2.700000047683716, 2.799999952316284, 2.29999...","[-0.25424948, 0.050015945, -1.4713126, -1.1670..."
1806,ffc-d8aec-b9_2_2_6,AMP,OPIN-OTS,"[-4.5, -4.400000095367432, -4.199999809265137,...","[-1.3617972, -1.037454, -0.38876632, -1.361797..."
1807,ffc-d8aec-b9_2_2_6,AMP,OPOUT-OTS,"[15.100000381469727, 15.300000190734865, 15.5,...","[-2.1660929, -1.6359189, -1.1057447, -1.105744..."
1808,ffc-d8aec-b9_2_2_8,AMP,OPIN-OTS,"[-6.9000000953674325, -6.699999809265138, -6.5...","[-2.5600693, -1.0282226, -0.26230112, -1.02822..."


In [5]:
# Distinct node identifiers, keeping the first occurrence of each.
df_ts['node'].drop_duplicates()

0       000-5acae-94_1_2_6
2       000-5acae-94_1_2_8
4       000-5acae-94_2_2_6
6       000-5acae-94_2_2_8
8       011-40a0f-05_2_2_6
               ...        
1800    ff9-4a40e-8e_3_2_8
1802    ffc-d8aec-b9_1_2_6
1804    ffc-d8aec-b9_1_2_8
1806    ffc-d8aec-b9_2_2_6
1808    ffc-d8aec-b9_2_2_8
Name: node, Length: 913, dtype: object

In [14]:
class THRESDESCEND:
    """Threshold-descending correlation algorithm for pairing nodes.

    Every pair of nodes is scored with the best correlation found between
    any of their time series. Nodes are then paired greedily over several
    randomised iterations while the acceptance cutoff is lowered step by
    step, and the pairs are ranked by how many iterations produced them.

    Parameters:
        n_inter: number of randomised pairing iterations.
        threshold: the cutoff keeps descending while it is still
            ``>= threshold``. It is lowered *before* each pass, so the last
            pass may accept scores up to one ``step`` below ``threshold``.
        corr_algor: 'Spearman', 'Kendall' or 'Pearson'.
        normalize: use the 'z_score' column when True, otherwise 'raw'.
        cormax: when True the score matrix is loaded from
            'score_matrix.pkl' instead of being computed (and saved).
        step: amount the cutoff is lowered per pass (default 0.05).
        random_state: optional seed for the node shuffling; ``None`` uses
            NumPy's global random state.
    """

    MODEL_FILENAME = "THRESDESCEND_model"

    # Correlation functions selectable through ``corr_algor``.
    _CORR_FUNCS = {
        'Spearman': st.spearmanr,
        'Kendall': st.kendalltau,
        'Pearson': st.pearsonr,
    }

    def __init__(self, n_inter=20, threshold=0.5, corr_algor='Spearman', normalize=True, cormax=False,
                 step=0.05, random_state=None):
        if step <= 0:
            # A non-positive step would never bring the cutoff below the threshold.
            raise ValueError("step must be positive, got {!r}".format(step))
        self.n_inter = n_inter
        self.threshold = threshold
        self.corr = corr_algor
        self.normalize = normalize # whether z-score normalization is applied
        self.cormax = cormax # whether correlation matrix is available
        self.step = step
        self.random_state = random_state

    def correlated_value(self, ts_node_A, ts_node_B):
        """Correlate every series of node A with every series of node B.

        Returns:
            ``(corr_values, best)`` where ``corr_values`` lists the
            coefficient of each (A series, B series) combination and
            ``best`` is the largest non-NaN coefficient (NaN when every
            coefficient is NaN).

        Raises:
            ValueError: for an unsupported correlation name, or when there
                is no series pair to correlate.
        """
        try:
            corr_func = self._CORR_FUNCS[self.corr]
        except KeyError:
            raise ValueError(
                "unknown correlation algorithm {!r}; expected one of {}".format(
                    self.corr, sorted(self._CORR_FUNCS))) from None

        corr_values = [corr_func(ts_a, ts_b)[0] for ts_a in ts_node_A for ts_b in ts_node_B]

        values = np.asarray(corr_values, dtype=float)
        if values.size == 0:
            raise ValueError("no time-series pairs to correlate")
        # A NaN coefficient must not hide the valid coefficients of the
        # other series combinations.
        valid = values[~np.isnan(values)]
        best = np.max(valid) if valid.size else np.nan
        return corr_values, best

    def _compute_score_matrix(self, df_ts, nodes):
        """Return the symmetric matrix of best pairwise correlations between nodes."""
        column = 'z_score' if self.normalize else 'raw'
        # Select each node's series once instead of once per node pair.
        series_by_node = {
            node: df_ts.loc[df_ts.node.isin([node]), column].to_list()
            for node in nodes
        }

        n = len(nodes)
        score_matrix = np.zeros(shape=(n, n))
        for i in range(n):
            ts_node_A = series_by_node[nodes[i]]
            for j in range(i, n):
                _, corr_value_max = self.correlated_value(ts_node_A, series_by_node[nodes[j]])
                score_matrix[i, j] = corr_value_max
                score_matrix[j, i] = corr_value_max
        return score_matrix

    def _pair_nodes(self, nodes, score_matrix, node_index, shuffle):
        """Run one randomised threshold-descending pass and return its (node_1, node_2) pairs."""
        pending = list(nodes)
        shuffle(pending)
        # True for nodes still waiting in ``pending``; paired nodes and nodes
        # deferred within the current pass are not eligible as partners.
        available = np.ones(len(nodes), dtype=bool)
        pairs = []
        thres = 1
        while thres >= self.threshold:
            thres = thres - self.step
            deferred = []
            while pending:
                node_1 = pending.pop(0)
                ind = node_index[node_1]
                row = score_matrix[ind, :].copy()
                row[~available] = float('-inf')
                row[ind] = float('-inf')
                # NaN sorts last in argsort and would be picked as "best",
                # blocking every valid candidate.
                row[np.isnan(row)] = float('-inf')
                best = np.argsort(row)[-1]
                available[ind] = False
                if row[best] > thres:
                    node_2 = nodes[best]
                    pairs.append((node_1, node_2))
                    pending.remove(node_2)
                    available[best] = False
                else:
                    deferred.append(node_1)
            # Deferred nodes get another chance at the next, lower cutoff.
            pending = deferred
            for node in deferred:
                available[node_index[node]] = True
        return pairs

    @staticmethod
    def _count_connections(pairs):
        """Order each pair as (smaller, larger) and count how often each pair occurs."""
        ordered = [(min(a, b), max(a, b)) for a, b in pairs]
        df_sorted = pd.DataFrame(ordered, columns=['node_1', 'node_2']).sort_values(by='node_1')
        # reset_index(name=...) names the count column explicitly; the default
        # name differs between pandas versions (0 vs 'count').
        return df_sorted.value_counts().reset_index(name='counts')

    def con_rescon(self, df_ts): # Connections Reconstruction
        """Reconstruct node-to-node connections from the time series in ``df_ts``.

        ``df_ts`` needs a 'node' column and a 'z_score' or 'raw' column
        (depending on ``normalize``). The result, a DataFrame with columns
        'node_1', 'node_2' and 'counts' sorted by descending count, is
        stored in ``self.df_connections_counted`` and returned.

        Raises:
            ValueError: if a loaded score matrix does not match the number
                of nodes in ``df_ts``.
        """
        nodes = df_ts.node.drop_duplicates().to_list()

        # Calculate the score matrix of nodes
        if self.cormax:
            print('score maxtrix is calculated in advanced')
            score_matrix = np.asarray(load_obj('score_matrix.pkl'))
            if score_matrix.shape != (len(nodes), len(nodes)):
                raise ValueError(
                    "loaded score matrix has shape {} but df_ts contains {} nodes".format(
                        score_matrix.shape, len(nodes)))
        else:
            print('calculating score matrix...')
            score_matrix = self._compute_score_matrix(df_ts, nodes)
            print('saving score matrix...')
            save_obj( score_matrix, 'score_matrix')

        # From the score matrix, reconstruct connections
        if self.random_state is None:
            shuffle = np.random.shuffle
        else:
            shuffle = np.random.RandomState(self.random_state).shuffle
        node_index = {node: idx for idx, node in enumerate(nodes)}

        all_pairs = []
        for n in range(self.n_inter):
            print("Iteration: ", n)
            all_pairs.extend(self._pair_nodes(nodes, score_matrix, node_index, shuffle))

        self.df_connections_counted = self._count_connections(all_pairs)

        return self.df_connections_counted

    def validation(self, df_topos):
        """Print precision/recall/F1 of the reconstructed connections.

        The ``len(df_topos)`` most frequent reconstructed pairs are compared
        with ``df_topos`` by an exact match on 'node_1' and 'node_2'.
        Reconstructed pairs are stored with the smaller name as 'node_1', so
        ``df_topos`` must use the same ordering for a pair to match.
        Metrics whose denominator is zero are reported as 0.0.
        """
        n = len(df_topos)
        df_connections = (self.df_connections_counted[:n]
                          .drop('counts', axis=1)
                          .sort_values(by='node_1'))

        df_TP = pd.merge(df_topos, df_connections, on=['node_1', 'node_2'])
        TP = df_TP.shape[0]
        FP = df_connections.shape[0] - TP
        FN = df_topos.shape[0] - TP
        Precision = TP/(TP+FP) if (TP+FP) else 0.0
        Recall = TP/(TP+FN) if (TP+FN) else 0.0
        F1_score = 2*Precision*Recall/(Precision+Recall) if (Precision+Recall) else 0.0
        print("Performance metric:\n{:20}{}\n{:20}{}\n{:20}{}\n{:20}{}\n{:20}{}\n{:20}{}"
              .format('TruePositive', TP, 'FalsePositive', FP, "FalseNegative", FN,
                      'Precision', Precision, 'Recall', Recall, "F1_score", F1_score))


In [15]:
# Configure the model: 20 randomised iterations, cutoff descending to 0.7,
# Kendall correlation on z-scored series, score matrix computed from scratch.
model = THRESDESCEND(
    n_inter=20,
    threshold=0.7,
    corr_algor='Kendall',
    normalize=True,
    cormax=False,
)
# Reconstruct the connections, then score them against the known topology.
df_connections = model.con_rescon(df_ts)
model.validation(df_topos)

calculating score matrix...
saving score matrix...
Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Performance metric:
TruePositive        412
FalsePositive       39
FalseNegative       55
Precision           0.9135254988913526
Recall              0.8822269807280514
F1_score            0.89760348583878


In [17]:
# Show the counted connection table that con_rescon() stored on the model.
model.df_connections_counted

Unnamed: 0,node_1,node_2,counts
0,000-5acae-94_1_2_6,9c3-bf9d8-4b_1_2_8,20
1,5f2-0a2bc-d2_1_2_8,ab1-3cc1f-2b_1_2_6,20
2,641-c78fd-3b_2_2_8,e46-30657-f9_3_2_6,20
3,641-c78fd-3b_2_2_6,e46-30657-f9_3_2_8,20
4,641-c78fd-3b_1_2_8,7f5-8ae37-d4_1_2_6,20
...,...,...,...
446,afe-b9f62-22_3_2_6,e17-276ce-2d_1_2_8,1
447,1a4-facc5-f6_1_2_8,c41-9afee-85_2_2_6,1
448,1a4-facc5-f6_1_14_8,1a4-facc5-f6_1_2_8,1
449,1d3-ddf5d-3f_2_2_6,821-6d9a7-60_2_2_8,1
