In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
#from scipy.stats import mannwhitneyu
#import collections
#from statistics import mean
#from statistics import median
from collections import defaultdict
from Levenshtein import distance

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn import metrics
#from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, adjusted_mutual_info_score, silhouette_score

import copy

In [2]:
##############################
#
# The Dataset will only be available for the review
#
##############################

log = pd.read_csv("eventsWithPhases.csv")

In [3]:
#create log with Case ID based on currentQuestion + participant

#first change data type of currentQuestion from int to str
log = log.astype({'currentQuestion': str})
log.dtypes

#combine two columns
log['case_id'] = pd.factorize(log.participant+log.currentQuestion)[0]

#there are 614 cases, although 46 participants * 14 questions = 616 --> 2 cases are missing 
#print(len(log.case_id.unique()))

In [4]:
# Define conversion function
def convert_ms_to_date(milliseconds):
    date_obj = datetime.fromtimestamp(milliseconds / 1000.0)
    date_string = date_obj.strftime('%Y-%m-%d %H:%M:%S.%f')
    return date_string

# Apply conversion function to 'milliseconds' column
log['fixation_start'] = log['Fixation Start'].apply(convert_ms_to_date)
log['fixation_end'] = log['Fixation End'].apply(convert_ms_to_date)

In [5]:
# Initialize LabelEncoder
le = LabelEncoder()

# Fit and transform the 'tabName_element' column
log['activity'] = le.fit_transform(log['tabName_element'])

#Number of unique activity values
#log['activity'].value_counts()

In [6]:
#Select only Control-flow questions
log_select = log[['case_id', 'fixation_start', 'activity', 'Phase', 'Type1', 'Type2', 'Type3', 'Fixation Duration']]
log_tasks = log_select.loc[log_select['Type2'] == 'Control-flow']

In [7]:
#The experiment session with case_id 106 only contains 2 fixations and is therefore removed from the evaluation
#The error porbably occured since the participant involuntarily clicked on the 'next' button to go to the next task
log_tasks[log_tasks['case_id'] == 106]

Unnamed: 0,case_id,fixation_start,activity,Phase,Type1,Type2,Type3,Fixation Duration
31794,106,1970-01-01 01:38:40.855554,50,,Local,Control-flow,Ordering,83.316
31795,106,1970-01-01 01:38:47.950454,412,,Local,Control-flow,Ordering,66.624


In [8]:
log_tasks = log_tasks.drop(log_tasks[log_tasks['case_id'] == 106].index)
log_tasks

Unnamed: 0,case_id,fixation_start,activity,Phase,Type1,Type2,Type3,Fixation Duration
0,0,1970-01-01 02:35:15.800704,82,search,Local,Control-flow,Exclusiveness,83.4080
1,0,1970-01-01 02:35:16.708983,82,search,Local,Control-flow,Exclusiveness,124.9670
2,0,1970-01-01 02:35:16.883977,82,search,Local,Control-flow,Exclusiveness,66.6000
3,0,1970-01-01 02:35:17.900487,82,search,Local,Control-flow,Exclusiveness,83.3050
4,0,1970-01-01 02:35:18.375424,82,search,Local,Control-flow,Exclusiveness,108.3100
...,...,...,...,...,...,...,...,...
173748,610,1970-01-01 03:15:17.900596,88,,Global,Control-flow,Ordering,191.6515
173749,610,1970-01-01 03:15:18.142225,179,,Global,Control-flow,Ordering,283.3150
173750,610,1970-01-01 03:15:18.442194,179,,Global,Control-flow,Ordering,141.6530
173751,610,1970-01-01 03:15:18.642171,50,,Global,Control-flow,Ordering,191.6490


In [9]:
#log_tasks['taskType'] = log_tasks['Type1'] + log_tasks['Type3']
log_group = log_tasks.groupby(['case_id'])['Type1'].apply(list).reset_index()
log_group['task'] = log_group['Type1'].apply(lambda x: str(set(x)))

In [10]:
#Create trace log
logVar = log_tasks.groupby(['case_id'])['activity'].apply(list).reset_index()
#len(logVar["activity"][310])
logVar

Unnamed: 0,case_id,activity
0,0,"[82, 82, 82, 82, 82, 177, 82, 82, 82, 82, 82, ..."
1,4,"[82, 82, 82, 181, 82, 82, 82, 82, 82, 82, 181,..."
2,5,"[90, 82, 183, 82, 82, 82, 82, 82, 183, 77, 75,..."
3,6,"[82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 8..."
4,7,"[86, 82, 82, 82, 82, 178, 178, 82, 82, 82, 82,..."
...,...,...
344,606,"[82, 59, 120, 122, 183, 120, 120, 183, 59, 183..."
345,607,"[82, 82, 82, 102, 175, 175, 175, 175, 175, 175..."
346,608,"[86, 86, 82, 108, 109, 86, 178, 86, 178, 108, ..."
347,609,"[82, 195, 195, 195, 195, 195, 195, 195, 195, 1..."


In [11]:
logVar["length"] = logVar["activity"].apply(lambda x: len(x))
logVar['length'].max()

1152

In [12]:
logVar['length'].min()

27

In [13]:
log_group

Unnamed: 0,case_id,Type1,task
0,0,"[Local, Local, Local, Local, Local, Local, Loc...",{'Local'}
1,4,"[Global, Global, Global, Global, Global, Globa...",{'Global'}
2,5,"[Global, Global, Global, Global, Global, Globa...",{'Global'}
3,6,"[Local, Local, Local, Local, Local, Local, Loc...",{'Local'}
4,7,"[Local, Local, Local, Local, Local, Local, Loc...",{'Local'}
...,...,...,...
344,606,"[Global, Global, Global, Global, Global, Globa...",{'Global'}
345,607,"[Local, Local, Local, Local, Local, Local, Loc...",{'Local'}
346,608,"[Local, Local, Local, Local, Local, Local, Loc...",{'Local'}
347,609,"[Local, Local, Local, Local, Local, Local, Loc...",{'Local'}


In [14]:
trace_dis = log_group.groupby(['task'])['case_id'].apply(list).reset_index()
trace_dis['case_id'].str.len()

0    175
1    174
Name: case_id, dtype: int64

## Evaluation based on Nearest Neighbor

In [15]:
def NearestNeighbor(matrix, labelDict):
    
    clusterLabel = set(labelDict.values())
    clusterMeasuredCount = dict.fromkeys(list(clusterLabel), 0)
    
    for i in range(len(matrix)):
        #delete 0 in array matrix[i] for distance between (i,i)
        x = np.delete(matrix[i], i) 
        #identify min distance/value in array
        y = min(x)
        
        #identify position of y AND select first position/pair appearing in the array in case there are muliple pairs with identical min distance
        nearestNeighbor = np.where(x == y)[0] #problem if multiple positions??
        nearestNeighbor = int(nearestNeighbor[0])
        
        #true Label for i
        trueLabel = labelDict[i]
        
        #for label comparison add +1 to dict position if position occurs after i (because of the deletion of 0 at the beginning)
        if nearestNeighbor >= i:
            if trueLabel == labelDict[nearestNeighbor + 1]:
                clusterMeasuredCount[trueLabel] += 1         
        else:
            if trueLabel == labelDict[nearestNeighbor]:
                clusterMeasuredCount[trueLabel] += 1
    
    #Count the (true) number of traces per label/attribute
    clusterTrueCount = {}
    for i in clusterLabel:
        clusterTrueCount[i] = list(labelDict.values()).count(i)
    
    #Divide number of nearest neighbours with identical label by the respective (true number) of traces with this label
    metric = 0
    for i in clusterLabel:
        metric += clusterMeasuredCount[i] / clusterTrueCount[i]
        
    #print(clusterMeasuredCount, clusterTrueCount, metric)
    
    #print(clusterMeasuredCount)
    #print(clusterTrueCount)
    #print(clusterLabel)
    #print(metric)
    return metric / len(clusterLabel)


'''
x = np.array([[ 0, 12,  8, 10, 12],
       [5,  0,  9, 14, 17],
       [ 8,  9,  0, 12, 10],
       [ 8,  9,  1, 0, 10],
       [ 8,  9,  5, 4, 0]])

labelDict2 = {0:1,1:2,2:1,3:3,4:3}
NearestNeighbor(x, labelDict2)
'''

'\nx = np.array([[ 0, 12,  8, 10, 12],\n       [5,  0,  9, 14, 17],\n       [ 8,  9,  0, 12, 10],\n       [ 8,  9,  1, 0, 10],\n       [ 8,  9,  5, 4, 0]])\n\nlabelDict2 = {0:1,1:2,2:1,3:3,4:3}\nNearestNeighbor(x, labelDict2)\n'

## Evaluation based on Precision@k

In [16]:
def get_key_by_value(dictionary, value):
    for key, val in dictionary.items():
        if val == value:
            return key
    #return None  # Return None if the value is not found in the dictionary




def PrecisionAtK(matrix, labelDict, k):
    
    clusterLabel = set(labelDict.values())
    clusterMeasuredCount = dict.fromkeys(list(clusterLabel), 0)
    
    
    for i in range(len(matrix)):
        
        #true Label for i
        trueLabel = labelDict[i]

        
        #delete 0 in array matrix[i] for distance between (i,i)   
        x = np.delete(matrix[i], i)
        
        #get all k minimum values
        nnValues = []
        for l in range(k):
            y = min(x)
            nnValues.append(y)
            x = np.delete(x, np.where(x == y)[0][0])
        
        
        #get location of minimum values
        z = np.delete(matrix[i], i)
        
        #create dict from array with {position:value}
        my_dict = {}
        for m in range(len(z)):
            my_dict[m] = z[m]
        
        key_list = []
        for n in nnValues:
            key = get_key_by_value(my_dict, n)
            key_list.append(key)
            del my_dict[key]
        
        for o in range(k):
            position = list(key_list)[o]

            if position >= i:
                if trueLabel == labelDict[position + 1]:
                    clusterMeasuredCount[trueLabel] += 1

            else:
                if trueLabel == labelDict[position]:
                    clusterMeasuredCount[trueLabel] += 1

            #transform array to dict
            #get key value from dict
            #compare label
            #remove key+value from dict

            #Need exception in case y == 0 ??
            
    
    #Count the (true) number of traces per label/attribute
    clusterTrueCount = {}
    for i in clusterLabel:
        clusterTrueCount[i] = list(labelDict.values()).count(i)
    
    #Divide number of nearest neighbours with identical label by the respective (true number) of traces with this label
    metric = 0
    for i in clusterLabel:
        metric += clusterMeasuredCount[i] / k / clusterTrueCount[i]
        
    #print(clusterMeasuredCount, clusterTrueCount, metric)
    
    #print(clusterLabel)
    #print(metric)
    return metric / len(clusterLabel)


'''
x3 = np.array([[ 0, 12,  8, 10, 12],
       [5,  0,  9, 14, 17],
       [ 8,  9,  0, 12, 10],
       [ 8,  9,  1, 0, 10],
       [ 4,  8,  4, 4, 0]]) # --> Issue: How to select NN when identical distance values !!!

labelDict3 = {0:1,1:2,2:1,3:3,4:3}
PrecisionAtK(x3, labelDict3, 2)
'''

'\nx3 = np.array([[ 0, 12,  8, 10, 12],\n       [5,  0,  9, 14, 17],\n       [ 8,  9,  0, 12, 10],\n       [ 8,  9,  1, 0, 10],\n       [ 4,  8,  4, 4, 0]]) # --> Issue: How to select NN when identical distance values !!!\n\nlabelDict3 = {0:1,1:2,2:1,3:3,4:3}\nPrecisionAtK(x3, labelDict3, 2)\n'

## Triplet

see: https://towardsdatascience.com/triplet-loss-advanced-intro-49a07b7d8905

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [18]:
def get_triplet_mask(labels):
    
    # step 1 - get a mask for distinct indices
    ###print('labels', labels)
    # shape: (batch_size, batch_size)
    indices_equal = torch.eye(labels.size()[0], dtype=torch.bool, device=labels.device)
    ###print('equal', indices_equal)
    indices_not_equal = torch.logical_not(indices_equal)
    ###print('not_equal', indices_not_equal)

    # shape: (batch_size, batch_size, 1)
    i_not_equal_j = indices_not_equal.unsqueeze(2)
    ###print('i_not_j - unsqueeze2', i_not_equal_j)
    # shape: (batch_size, 1, batch_size)
    i_not_equal_k = indices_not_equal.unsqueeze(1)
    ###print('i_not_k - unsqueeze1', i_not_equal_k)
    # shape: (1, batch_size, batch_size)
    j_not_equal_k = indices_not_equal.unsqueeze(0)
    ###print('j_not_k - unsqueeze0', i_not_equal_k)
    # Shape: (batch_size, batch_size, batch_size)
    distinct_indices = torch.logical_and(torch.logical_and(i_not_equal_j, i_not_equal_k), j_not_equal_k)
    ###print('distinct!!!!', distinct_indices)

    # step 2 - get a mask for valid anchor-positive-negative triplets
    # shape: (batch_size, batch_size)
    labels_equal = labels.unsqueeze(0) == labels.unsqueeze(1)
    ###print('labels_equal', labels_equal)
    # shape: (batch_size, batch_size, 1)
    i_equal_j = labels_equal.unsqueeze(2)
    ###print('i_equal_j', i_equal_j)
    # shape: (batch_size, 1, batch_size)
    i_equal_k = labels_equal.unsqueeze(1)
    ###print('i_equal_k', i_equal_k)
    # shape: (batch_size, batch_size, batch_size)
    valid_indices = torch.logical_and(i_equal_j, torch.logical_not(i_equal_k))
    ###print('valid_indices!!!', valid_indices)
    

    # step 3 - combine two masks
    mask = torch.logical_and(distinct_indices, valid_indices)
    ###print('mask!!', mask)
    return mask

    """compute a mask for valid triplets
    Args:
        labels: Batch of integer labels. shape: (batch_size,)
    Returns:
        Mask tensor to indicate which triplets are actually valid. Shape: (batch_size, batch_size, batch_size)
        A triplet is valid if:
        `labels[i] == labels[j] and labels[i] != labels[k]`
        and `i`, `j`, `k` are different.
    """
    
class custom_activation(nn.Module):
    def __init__(self):
        super(custom_activation, self).__init__()
    
    def forward(self, x):
        x[x>0] = 1
        x[x<=0] = 0
        return x


class BatchAllTtripletLoss(nn.Module):
  """Uses all valid triplets to compute Triplet loss

  Args:
    margin: Margin value in the Triplet Loss equation
  """
  def __init__(self, margin=0): #default margin = 0
    super().__init__()
    self.margin = margin
    self.relu = nn.ReLU() #new
    self.custom = custom_activation()
    
  def forward(self, distance_matrix, labels):
    """computes loss value.

    Args:
      embeddings: Batch of embeddings, e.g., output of the encoder. shape: (batch_size, embedding_dim)
      labels: Batch of integer labels associated with embeddings. shape: (batch_size,)

    Returns:
      Scalar loss value.
    """
    # step 1 - convert to tensor format
    distance_matrix = torch.tensor(distance_matrix)
    labels = torch.tensor(list(labels.values()))


    # step 2 - compute loss values for all triplets by applying broadcasting to distance matrix

    # shape: (batch_size, batch_size, 1)
    anchor_positive_dists = distance_matrix.unsqueeze(2)
    # shape: (batch_size, 1, batch_size)
    anchor_negative_dists = distance_matrix.unsqueeze(1)
    # get loss values for all possible n^3 triplets
    # shape: (batch_size, batch_size, batch_size)
    triplet_loss = anchor_negative_dists - anchor_positive_dists + self.margin
    ###print('tl0',triplet_loss)

    # step 3 - filter out invalid or easy triplets by setting their loss values to 0

    # shape: (batch_size, batch_size, batch_size)
    mask = get_triplet_mask(labels)
    ###print('mask', mask)
    triplet_loss *= mask
    ###print(triplet_loss)
    ###print('tl1:', triplet_loss)
    # easy triplets have negative loss values
    
    triplet_loss = self.custom(triplet_loss)
    ###print(triplet_loss)
    #triplet_loss = F.relu(triplet_loss)

    # step 4 - compute scalar loss value by averaging positive losses
    
    triLossNonZero = (triplet_loss != 0).nonzero(as_tuple=True)
    labelTorchUnique = torch.unique(labels, return_counts=True)
    
    nonZero = len(triLossNonZero[0])
    triLossSum = []
    for i in range(nonZero):
        #Identify L_a --> In Class
        t1 = triLossNonZero[0][i]
        labelIn = labels[t1]
        positionIn = int((labelTorchUnique[0] == labelIn).nonzero(as_tuple=False))
        countIn = labelTorchUnique[1][positionIn]

        #Identify L_b --> Out Class
        t3 = triLossNonZero[2][i]
        labelOut = labels[t3]
        positionOut = int((labelTorchUnique[0] == labelOut).nonzero(as_tuple=False))
        countOut = labelTorchUnique[1][positionOut]

        #Calculate loss
        value = (1/countIn)*(1/countIn)*(1/countOut)  
        ###print(countIn)
        ###print(countOut)
        triLossSum.append(value)
    
    #finally divide by |A|^2-|A|
    A = len(labelTorchUnique[0])  
    lossValue = sum(triLossSum) / (A*A-A)
        
    #OLD
    #E_triplet = (1 / (A^2 - A)) *
    #num_positive_losses = (triplet_loss > eps).float().sum()
    #print(num_positive_losses)
    #print(triplet_loss.sum())
    #triplet_loss = triplet_loss.sum() / (num_positive_losses + eps)
    

    return lossValue

In [19]:
#distance_matrix = torch.tensor(Matrix)
#labels = torch.tensor(list(labelDict1.values()))
#distance_matrix
#labels
#triLossNonZero = (triplet_loss != 0).nonzero(as_tuple=True)
#labelTorchUnique = torch.unique(labels, return_counts=True)
#len(labelTorchUnique[0])

## Silhouette

In [20]:
from sklearn import metrics

def Silhouette(distMatrix, labelDict):
    labelDictList = list(labelDict.values())
    return metrics.silhouette_score(distMatrix, labelDictList)

## Ground truth comparison

In [21]:
#Create dictionary with true labels
log_group['label'] = le.fit_transform(log_group['task'])
labelDict1 = log_group['label'].to_dict()
#labelDict1

In [22]:
logVar["c:n_chr"] = logVar["activity"].apply(lambda x: [chr(i) for i in x])
logVar["strings"] = logVar["c:n_chr"].apply(lambda x: ''.join(x))

In [23]:
def matrix_calc(features, distance):
    n = len(features)
    dist_matrix = np.zeros((n,n))
    
    for i in range(n):
        for j in range(i, n):
            dist_matrix[i,j] = distance(features[i], features[j])
            dist_matrix[j,i] = dist_matrix[i,j]
    
    return dist_matrix

In [24]:
def results(DistMatrix):
    print('NN:   ' + str(NearestNeighbor(DistMatrix, labelDict1)))
    print('P@10: ' + str(PrecisionAtK(DistMatrix, labelDict1, 10)))

    triplet = BatchAllTtripletLoss()
    print('Tri:  ' + str(triplet.forward(DistMatrix,labelDict1)))
    print('Sil:  ' + str(Silhouette(DistMatrix, labelDict1)))

### Levenshtein Distance

In [25]:
#Levenshtein Distance

List = list(logVar["strings"])

dist_matrix = np.zeros((len(List),len(List)),dtype=int)

for i in range(0,len(List)):
    for j in range(0,len(List)):
        dist_matrix[i,j] = distance(List[i],List[j])

lev_dis = dist_matrix
lev_dis

array([[  0, 267, 316, ..., 263, 274, 332],
       [267,   0, 311, ..., 188, 180, 311],
       [316, 311,   0, ..., 333, 307, 319],
       ...,
       [263, 188, 333, ...,   0, 194, 345],
       [274, 180, 307, ..., 194,   0, 351],
       [332, 311, 319, ..., 345, 351,   0]])

In [26]:
results(lev_dis)

NN:   0.939983579638752
P@10: 0.8513070607553366
Tri:  tensor(0.5836)
Sil:  0.21810578792076235


In [18]:
###OLD
'''
Matrix = lev_dis

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.937142857142857
P@10: 0.850857142857143
Tri:  tensor(0.5911)
Sil:  0.21990849399232068


### Normalized Levenshtein Distance

In [27]:
List = logVar["strings"]

n = len(List)
dist_matrix = np.zeros((n,n))    # initialize distance matrix to a square of zeros

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = distance(List[i], List[j]) / max(len(List[i]),len(List[j]))
        dist_matrix[j,i] = dist_matrix[i,j]       # for the symmetric part, no computation

lev_dis_norm = dist_matrix

In [28]:
results(lev_dis_norm)

NN:   0.9714121510673235
P@10: 0.9167783251231527
Tri:  tensor(0.6128)
Sil:  0.06023860382402081


In [33]:
###OLD
'''
Matrix = lev_dis_norm

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.9685714285714286
P@10: 0.9157142857142857
Tri:  tensor(0.6184)
Sil:  0.05932236364558842


### Cosine based on 1-gram

In [29]:
#Create 1-gram

def createVector(charList):
    #dtype = [('structure', 'S10'), ('relfrequ', float)]
    arrayList = np.array(charList)
    unique, counts = np.unique(arrayList, return_counts=True)
    #calculate relative frequency
    relFrequList = np.array((unique, counts)).T
    uniqueList = list(unique)
    return relFrequList[relFrequList[:, 0].argsort()]
    #check completeness
    #if 'tree' not in uniqueList:
        #relFrequList = np.append(relFrequList, np.array([['tree', 0]]), axis=0)
        #print(relFrequList)

        
#Change data format from string to list of unique characters
logVar["1-gram"] = logVar["c:n_chr"].apply(lambda x: createVector(tuple(x)))
#logVar


In [30]:
def alignArrays(array1, array2):
    commonSet = set(array1[:,0]).union(array2[:,0])
    
    for i in commonSet:
        if i not in array1[:,0]:
            array1 = np.append(array1, np.array([[i, '0']]), axis=0)
        if i not in array2[:,0]:
            array2 = np.append(array2, np.array([[i, '0']]), axis=0)
    return array1[array1[:, 0].argsort()], array2[array2[:, 0].argsort()]

In [29]:
#One-hot embedding
'''
def alignArrays_OneHotEmbedding(array1, array2):
    
    for i in range(len(array1)):
        if int(array1[:,1][i]) > 1:
            array1[:,1][i] = 1
    
    for i in range(len(array2)):
        if int(array2[:,1][i]) > 1:
            array2[:,1][i] = 1
               
    commonSet = set(array1[:,0]).union(array2[:,0])
    #print(commonSet)
        
    for i in commonSet:
        if i not in array1[:,0]:
            array1 = np.append(array1, np.array([[i, '0']]), axis=0)
        if i not in array2[:,0]:
            array2 = np.append(array2, np.array([[i, '0']]), axis=0)
    return array1[array1[:, 0].argsort()], array2[array2[:, 0].argsort()]
'''

In [31]:
from scipy.spatial import distance

def cosineDist(frequVector1, frequVector2):
    Vector1, Vector2 = alignArrays(frequVector1, frequVector2)
    a = Vector1[:,1].astype(int)
    b = Vector2[:,1].astype(int)
    dist_matrix = distance.cosine(a, b)
    return dist_matrix

In [40]:
#Cosine distance based on 1-gram

#cos1_dis = matrix_calc(logVar["1-gram"],cosineDist)
results(cos1_dis)

NN:   0.9799507389162561
P@10: 0.9570607553366174
Tri:  tensor(0.5652)
Sil:  0.07706030961601126


In [31]:
#Cosine distance based on 1-gram
'''
listVec = logVar["1-gram"]

n = len(listVec)
dist_matrix = np.zeros((n,n))

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = cosineDist(listVec[i], listVec[j])
        dist_matrix[j,i] = dist_matrix[i,j]  
        
cos1_dis = dist_matrix

In [44]:
'''
Matrix = cos1_dis

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.9885714285714285
P@10: 0.96
Tri:  tensor(0.5966)
Sil:  0.07511495833799511


### Cosine based on 2-gram

In [33]:
#Change data format from string to list of unique characters
#logVar["charList"] = logVar["trace_variant"].apply(lambda x: list(x))
#logVar

def df_list(list_of_char):
    extList = list_of_char.copy()
    extList.insert(0, '*') 
    extList.append('$')
    list_new = []
    for i in range(len(extList)):
        new = ''.join(extList[i:i+2])
        list_new.append(new)
    del list_new[-1]
    return list_new

#Change data format from string to list of unique characters
logVar["dfList"] = logVar["c:n_chr"].apply(lambda x: df_list(x))
#logVar

logVar["2-gram"] = logVar["dfList"].apply(lambda x: createVector(x))
#logVar

In [34]:
#Cosine distance based on 2-gram

cos2_dis = matrix_calc(logVar["2-gram"],cosineDist)
results(cos2_dis)

NN:   0.965632183908046
P@10: 0.900376026272578
Tri:  tensor(0.5557)
Sil:  0.04190006376468889


In [25]:
#Cosine distance based on 2-gram
'''
listVec = logVar["2-gram"]

n = len(listVec)
dist_matrix = np.zeros((n,n))

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = cosineDist(listVec[i], listVec[j])
        dist_matrix[j,i] = dist_matrix[i,j]  
        
cos2_dis = dist_matrix

In [46]:
'''
Matrix = cos2_dis

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.9914285714285714
P@10: 0.9825714285714285
Tri:  tensor(0.6086)
Sil:  0.07824155855586741


In [33]:
#Aggregation
'''
aggregate = cos1_dis + cos2_dis

Matrix = aggregate

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.9971428571428571
P@10: 0.978
Tri:  tensor(0.6072)
Sil:  0.08113738485276242


In [35]:
#Create 3-gram

#Change data format from string to list of unique characters
logVar["charList"] = logVar["c:n_chr"].apply(lambda x: list(x))

def df_list2(list_of_char):
    extList = list_of_char.copy()
    extList.insert(0, '*') 
    extList.append('$')
    list_new = []
    for i in range(len(extList) - 1):
        new = ''.join(extList[i:i+3])
        list_new.append(new)
    del list_new[-1]
    return list_new

In [36]:
logVar["dfList2"] = logVar["charList"].apply(lambda x: df_list2(x))
logVar["3-gram"] = logVar["dfList2"].apply(lambda x: createVector(x))

In [37]:
#Cosine distance based on 3-gram

cos3_dis = matrix_calc(logVar["3-gram"],cosineDist)
results(cos3_dis)

NN:   0.9170607553366175
P@10: 0.8345566502463053
Tri:  tensor(0.5331)
Sil:  0.021983474242203347


In [38]:
#Aggregation

aggregate1 = cos1_dis + cos2_dis
results(aggregate1)

NN:   0.9770935960591133
P@10: 0.9381658456486042
Tri:  tensor(0.5611)
Sil:  0.05977036300162498


In [39]:
#Aggregation

aggregate2 = cos1_dis + cos2_dis + cos3_dis
results(aggregate2)

NN:   0.9684893267651888
P@10: 0.9109753694581282
Tri:  tensor(0.5561)
Sil:  0.04699491340323087


## Euclidean Distance

In [41]:
# Euclidean distance
# see https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy

def euclidDist(frequVector1, frequVector2):
    Vector1, Vector2 = alignArrays(frequVector1, frequVector2)
    a = Vector1[:,1].astype(float)
    b = Vector2[:,1].astype(float)
    euclidean_dist = np.linalg.norm(a-b)
    return euclidean_dist

In [42]:
#Euclidean distance based on 1-gram

euc1_dis = matrix_calc(logVar["1-gram"],euclidDist)
results(euc1_dis)

NN:   0.9742528735632183
P@10: 0.9156026272577997
Tri:  tensor(0.5799)
Sil:  0.12313737064490321


In [43]:
#Euclidean distance based on 2-gram

euc2_dis = matrix_calc(logVar["2-gram"],euclidDist)
results(euc2_dis)

NN:   0.9456321839080459
P@10: 0.800888341543514
Tri:  tensor(0.5402)
Sil:  0.07896608635287823


In [44]:
#Euclidean distance based on 3-gram

euc3_dis = matrix_calc(logVar["3-gram"],euclidDist)
results(euc3_dis)

NN:   0.8110344827586207
P@10: 0.672704433497537
Tri:  tensor(0.5131)
Sil:  0.05959731752325882


In [45]:
#Aggregation

agg_euc = euc1_dis + euc2_dis
results(agg_euc)

NN:   0.9685221674876847
P@10: 0.8864302134646962
Tri:  tensor(0.5662)
Sil:  0.1069272854571777


### Jaccard based on 1-gram

In [46]:
#Distance based on activity type

def jaccard_similarity(list1, list2):
    s1, s2 = set(list1), set(list2)
    return 1 - len(s1 & s2) / len(s1 | s2)

In [47]:
#Jaccard based on 1-gram
Jacc1_dis = matrix_calc(logVar["charList"],jaccard_similarity)
results(Jacc1_dis)

NN:   0.979983579638752
P@10: 0.9519261083743842
Tri:  tensor(0.6072)
Sil:  0.07999095734358981


In [48]:
#Jaccard based on 2-gram
Jacc2_dis = matrix_calc(logVar["dfList"],jaccard_similarity)
results(Jacc2_dis)

NN:   0.9913957307060756
P@10: 0.9816600985221675
Tri:  tensor(0.6266)
Sil:  0.04940525630137409


In [49]:
#Jaccard based on 3-gram
Jacc3_dis = matrix_calc(logVar["dfList2"],jaccard_similarity)
results(Jacc3_dis)

NN:   0.9942528735632183
P@10: 0.9868259441707719
Tri:  tensor(0.6294)
Sil:  0.018640554706811936


In [51]:
#Aggregation

agg_jacc1 = Jacc1_dis + Jacc2_dis
results(agg_jacc1)

NN:   0.9971428571428571
P@10: 0.9682249589490969
Tri:  tensor(0.6185)
Sil:  0.0714694571964214


In [52]:
agg_jacc2 = Jacc1_dis + Jacc2_dis + Jacc3_dis
results(agg_jacc2)

NN:   0.9942857142857142
P@10: 0.9762331691297209
Tri:  tensor(0.6242)
Sil:  0.05754668243539865


In [42]:
'''
ListChar = list(logVar["activity"])

n = len(ListChar)
dist_matrix = np.zeros((n,n))    # initialize distance matrix to a square of zeros

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = jaccard_similarity(ListChar[i], ListChar[j])
        dist_matrix[j,i] = dist_matrix[i,j]       # for the symmetric part, no computation
        
jacc1_dis = dist_matrix

In [37]:
'''
Matrix = jacc1_dis

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.98
P@10: 0.9511428571428571
Tri:  tensor(0.6128)
Sil:  0.07995137880520391


### Jaccard based on 2-gram

In [43]:
#Distance based on adjacency relations

#Create list of directly follow relations
'''
def df_list(list_of_char):
    extList = list_of_char.copy()
    extList.insert(0, '*') 
    extList.append('$')
    list_new = []
    for i in range(len(extList)):
        new = ''.join(extList[i:i+2])
        list_new.append(new)
    del list_new[-1]
    return list_new

In [44]:
#Change data format from string to list of unique characters
#logVar["dfList"] = logVar["c:n_chr"].apply(lambda x: df_list(x))
#logVar

In [45]:
'''
ListChar = list(logVar["dfList"])

n = len(ListChar)
dist_matrix = np.zeros((n,n))    # initialize distance matrix to a square of zeros

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = jaccard_similarity(ListChar[i], ListChar[j])
        dist_matrix[j,i] = dist_matrix[i,j]       # for the symmetric part, no computation
        
jacc2_dis = dist_matrix

In [41]:
'''
Matrix = jacc2_dis

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.9885714285714287
P@10: 0.9805714285714286
Tri:  tensor(0.6300)
Sil:  0.04925171804531398


In [46]:
#Aggregation
'''
aggregate = jacc1_dis + jacc2_dis

Matrix = aggregate

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.9971428571428571
P@10: 0.9674285714285713
Tri:  tensor(0.6240)
Sil:  0.07137964234907106


## Graph based measures

In [None]:
#Now consider edge types

In [53]:
def intEncoder(character_List):
    return [np.where(np.array(list(dict.fromkeys(character_List)))==e)[0][0]for e in character_List]

logVar["intList"] = logVar["activity"].apply(lambda x: intEncoder(x))

In [54]:
# 2. transfer intList to int_tupleList

#Create tuple lists
def tuple_list(list_of_encodedActivities):
    #list.insert(0, '*')
    #list.append('*')
    list_new = []
    last_element = list_of_encodedActivities[-1]
    for i in range(len(list_of_encodedActivities)):
        new = tuple(list_of_encodedActivities[i:i+2])
        list_new.append(new)
    del list_new[-1]
    if list_of_encodedActivities.count(last_element) == 1: #check wether last activity in trace has some adjancency relation
        list_new.append((last_element,)) ### NOT Correct
    return list_new

#q = [0,0,0,0,1,1,2,3,4,5,3,2,4,0,5,6]
#tuple_list(q)

logVar["int_tupleList"] = logVar["intList"].apply(lambda x: tuple_list(x))
#logVar["int_tupleList"]

In [55]:
# 3. generate Adjacency List

def adj_list(list_of_tuples):
    adj_list_new = {}
    try:
        for node1, node2 in list_of_tuples:
            #print(node1, node2)
            if node1 not in adj_list_new:
                newlist = []
                newlist.append(node2)
                adj_list_new[node1] = newlist
                #print(adj_list3)
        
            else:
                if node2 not in adj_list_new[node1]:
                    #mylist.extend(adj_list3[node1])
                    adj_list_new[node1].append(node2)
                    #print(adj_list3)
                    #adj_list3[node1] = mylist
    
    #in case activity has no adjacent activity - only possible for last activity --> tuple: (lastAct,)
    except ValueError as ve:
        lastValue = list_of_tuples[-1][0] 
        adj_list_new[lastValue] = list()
    return list(adj_list_new.values())

#q = [0,0,0,0,1,1,2,3,4,5,3,2,4,0,5,6]
#l = tuple_list(q)
#adj_list(l)

logVar["int_adjList"] = logVar["int_tupleList"].apply(lambda x: adj_list(x))
#logVar["int_adjList"]

In [56]:
#Now consider length

from collections import deque

def bfs_4(graph, start, end):
    
    graph = {v: k for v, k in enumerate(graph)}
    #print(start, end)
    queue = deque([(start, 0)])
    seen = set()
    while queue:
        #print(queue)
        node, distance = queue.popleft()
        #if not node:
            #print(start, end, queue)
            #print("GRAPH LIST", graph)
        if node in seen:
            continue
        seen.add(node)
        if node == end:
            return distance 
        for adjacent in graph.get(node, []):
            queue.append((adjacent, distance + 1))
        
#x = {0: [0, 1], 1: [2, 1, 0, int], 2:[2], [3: [1, 5, 3, 7], 4: [3], 5: [6, 5], 6: [1, 7], 7: [8, 9, 7], 8: [5, 8, 10], 9: [3]}
#y = [[0, 1, 5], [1, 2], [3, 4], [4, 2], [5, 0], [3, 6], []]
#bfs_4(y, 1, 6)

In [57]:
from collections import defaultdict, deque

def reverse_graph(graph):
    reversed_graph = defaultdict(list)
    for node in graph:
        for neighbor in graph[node]:
            reversed_graph[neighbor].append(node)
    return reversed_graph


def bfs_5(graph, start, end):
    queue = deque([(start, 0)])
    seen = set()
    visited = {}
    while queue:
        node, distance = queue.popleft()
        if node in seen:
            continue
        seen.add(node)
        if node == end: # maybe quicker if adjacent directly checked
            return visited
        for adjacent in graph.get(node, []):
            queue.append((adjacent, distance + 1))
            if adjacent not in visited:
                visited.update({adjacent:distance})

            
def common_ancestors(graph, node1, node2): 
    #remove cross type edge between node1 and node2
    graph[node1].remove(node2) #maybe issue, if graph is changed and not copy of graph?
    graph = {v: k for v, k in enumerate(graph)}
    graphReverse = reverse_graph(graph)
    setNode1 = bfs_5(graphReverse, node1, 0)
    setNode2 = bfs_5(graphReverse, node2, 0)
    if next((a for a in list(setNode1) if a in list(setNode2)), None) == None:
        firstCommonAnces = next((a for a in list(setNode2) if a in list(setNode1)), None)
    else:
        firstCommonAnces = next((a for a in list(setNode1) if a in list(setNode2)), 0)
    
    #uses a hash map to identify the first common ancestor in both lists
    #looks for the first common ancestor in setNode1, which can also be found in setNode2 
    #--> this might not be the closest distance between setNode1 and setNode2
    #--> e.g., for x= [0,1,3,7,5,6] and y= [4,5,7,8,3] 7 might be closest ancestor, although algo detects 3 !
    #distance = setNode1[firstCommonAnces] + setNode2[firstCommonAnces]
    
    
    if firstCommonAnces != None:   # ISSUE: in some cases the firstCommonAnces cannot be detected!
        ancesDistNode1 =  setNode1[firstCommonAnces] + 1 #the edge from node1 to first parent is counted as 0 by algorithm, therefore +1
        ancesDistNode2 =  setNode2[firstCommonAnces] + 1
        numberSkips = abs(ancesDistNode1 - ancesDistNode2)
        numberCross = min(ancesDistNode1, ancesDistNode2)
    else:
        numberSkips, numberCross = (0,1)
    return numberSkips, numberCross
    #if all(x in crossType for x in i):
    

    

#graphList = [[1], [2, 4, 1], [3, 2, 1], [], [5, 4], [5, 4, 6], [7], []]
#c = [[1, 4], [2], [3], [0, 5], [3, 5], []]
#c2 = {v: k for v, k in enumerate(c)}
#common_ancestors(c, 4, 5)
#reverse_graph(c2)

In [58]:
#Create List for decoding traces
from collections import OrderedDict
logVar["indexList"] = logVar["activity"].apply(lambda x: list(OrderedDict.fromkeys(x)))

### Cosine Edge Type + length (no df relations)

In [61]:
class Graph1:
    # instance variables
    def __init__(self, graph_list2, indexList):
        # v is the number of nodes/vertices
        self.time = 0
        self.traversal_array = []
        self.structural_array = [['tree', 0]]
        #self.structural_array = []
        self.graph_list = graph_list2
        self.v = len(graph_list2)
        self.indexList = indexList

    # function for dfs
    def dfs(self):
        self.start_time = [-1]*self.v
        self.end_time = [-1]*self.v
 
        for node in range(self.v):
            if self.start_time[node] == -1:
                self.traverse_dfs(node)
                
        return np.array(self.structural_array)
        #print()
        #print("DFS Traversal: ", self.traversal_array)
        #print()
 
    def traverse_dfs(self, node):
        self.traversal_array.append(node)
        # get the starting time
        self.start_time[node] = self.time
        self.time += 1
        # traverse through the neighbours
        for neighbour in self.graph_list[node]:

            # when the neighbor was not yet visited
            if self.start_time[neighbour] == -1:                
                self.structural_array[0][1] += 0
                self.traverse_dfs(neighbour)
                
            # otherwise when the neighbour's visit is still ongoing:
            elif self.end_time[neighbour] == -1:
                if node == neighbour:
                    self.structural_array.append(['1back ',1])
                    #self.structural_array.append(['back ',1])
                    #self.structural_array.append([str(1)+'b'])
                
                elif node in self.graph_list[neighbour]:
                    self.structural_array.append(['2back ',2])
                    #self.structural_array.append(['back ',2])
                    #self.structural_array.append(str(2)+'b')
                    
                else:
                    x = bfs_4(self.graph_list, neighbour, node)
                    self.structural_array.append([str(x+1)+'back ',x+1])
                    #self.structural_array.append(['back ',x+1])
                    #self.structural_array.append(str(x+1)+'b')
                
            # otherwise when the neighbour's visit started before the current node's visit:
            elif self.start_time[node] < self.start_time[neighbour]:
                graph_list_copy = copy.deepcopy(self.graph_list)
                graph_list_copy[node].remove(neighbour)
                y = bfs_4(graph_list_copy, node, neighbour)
                self.structural_array.append([str(y-1)+'forward ',y-1])
                #self.structural_array.append(['forward ' ,y-1])
                #self.structural_array.extend((y-1)*['forward']) # -1 to exclude one edge: (A:B,C;B:C;C:[]) ...the dist A --> C is 2 without forward edge, but we are skipping only one activity
                #self.structural_array.append(str(y-1)+'f')
                
            else:
                #Possibly first check, whether two nodes connected by cross-type have identical parent
                numberSkips, numberCross = common_ancestors(self.graph_list, node, neighbour)
                #self.structural_array.append(['forward ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]),numberSkips])
                self.structural_array.append([str(numberCross)+'cross ',numberCross])
                #self.structural_array.append(['cross ' ,numberCross])
                #self.structural_array.append(str(numberSkips)+'f')
                #self.structural_array.append(str(numberCross)+'c')
    
        # Indentation corrected:
        self.end_time[node] = self.time
        self.time += 1

In [62]:
'''
logVar["int_strucLengthList2"] = logVar.apply(lambda x: Graph1(x.int_adjList, x.indexList).dfs(), axis =1)

logVar["relFrequVec2"] = logVar["int_strucLengthList2"].apply(lambda x: pd.DataFrame(x, columns=['String', 'Value']))
logVar["relFrequVec2"] = logVar["relFrequVec2"].apply(lambda x: x.groupby('String', as_index=False)['Value'].sum())
logVar["relFrequVec2"] = logVar["relFrequVec2"].apply(lambda x: x.to_numpy())
'''

'\nlogVar["int_strucLengthList2"] = logVar.apply(lambda x: Graph1(x.int_adjList, x.indexList).dfs(), axis =1)\n\nlogVar["relFrequVec2"] = logVar["int_strucLengthList2"].apply(lambda x: pd.DataFrame(x, columns=[\'String\', \'Value\']))\nlogVar["relFrequVec2"] = logVar["relFrequVec2"].apply(lambda x: x.groupby(\'String\', as_index=False)[\'Value\'].sum())\nlogVar["relFrequVec2"] = logVar["relFrequVec2"].apply(lambda x: x.to_numpy())\n'

In [63]:
from collections import Counter

def transform_list_of_pairs(pairs):
    return [pair[0] for pair in pairs]



def count_entries(input_list):
    # Count the occurrences of each unique entry in the list
    counter = Counter(input_list)
    
    # Create a NumPy array from the counter dictionary
    result = np.array([[key, count] for key, count in counter.items()], dtype=object)
    
    return result

# Example usage
#input_list = ['sequ', '2back', '2back']
#result = count_entries(input_list)
#print(result)

logVar["int_strucLengthList2"] = logVar.apply(lambda x: Graph1(x.int_adjList, x.indexList).dfs(), axis =1)
logVar["relFrequVec1"] = logVar["int_strucLengthList2"].apply(lambda x: transform_list_of_pairs(x))
logVar["relFrequVec1"] = logVar["relFrequVec1"].apply(lambda x: count_entries(x))

In [65]:
#Cosine distance based on edge types
cos_graph_dis = matrix_calc(logVar["relFrequVec1"],cosineDist)
results(cos_graph_dis)

NN:   0.6532348111658457
P@10: 0.6224614121510673
Tri:  tensor(0.5769)
Sil:  0.09711318446931942


In [66]:
#Cosine distance based on edge types
agg_cos = cos1_dis + cos2_dis + cos_graph_dis
results(agg_cos)

NN:   0.9713628899835797
P@10: 0.942175697865353
Tri:  tensor(0.5821)
Sil:  0.0674574899731124


In [61]:
'''
listVec = list(logVar["relFrequVec2"])

n = len(listVec)
dist_matrix = np.zeros((n,n))

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = cosineDist(listVec[i], listVec[j])
        dist_matrix[j,i] = dist_matrix[i,j]  
        
cos_graph_dis = dist_matrix

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [62]:
'''
Matrix = cos_graph_dis

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.49142857142857144
P@10: 0.6080000000000001
Tri:  tensor(0.5488)
Sil:  0.06517649082551862


In [63]:
#Aggregation
'''
aggregate = cos1_dis + cos2_dis + cos_graph_dis

Matrix = aggregate

print('NN:   ' + str(NearestNeighbor(Matrix, labelDict1)))
print('P@10: ' + str(PrecisionAtK(Matrix, labelDict1, 10)))

triplet = BatchAllTtripletLoss()
print('Tri:  ' + str(triplet.forward(Matrix,labelDict1)))
print('Sil:  ' + str(Silhouette(Matrix, labelDict1)))
'''

NN:   0.9914285714285714
P@10: 0.9560000000000001
Tri:  tensor(0.6395)
Sil:  0.0926845569582354


### Jaccard Edge Type and length + df relation

In [67]:
class Graph2:
    # instance variables
    def __init__(self, graph_list2, indexList):
        # v is the number of nodes/vertices
        self.time = 0
        self.traversal_array = []
        self.structural_array = []
        #self.structural_array = []
        self.graph_list = graph_list2
        self.v = len(graph_list2)
        self.indexList = indexList

    # function for dfs
    def dfs(self):
        self.start_time = [-1]*self.v
        self.end_time = [-1]*self.v
 
        for node in range(self.v):
            if self.start_time[node] == -1:
                self.traverse_dfs(node)
                
        return self.structural_array
        #print()
        #print("DFS Traversal: ", self.traversal_array)
        #print()
 
    def traverse_dfs(self, node):
        self.traversal_array.append(node)
        # get the starting time
        self.start_time[node] = self.time
        self.time += 1
        # traverse through the neighbours
        for neighbour in self.graph_list[node]:

            # when the neighbor was not yet visited
            if self.start_time[neighbour] == -1:                
                #self.structural_array[0][1] += 0
                #self.structural_array.append('tree ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ')
                self.structural_array.append('tree')
                self.traverse_dfs(neighbour)
                
            # otherwise when the neighbour's visit is still ongoing:
            elif self.end_time[neighbour] == -1:
                if node == neighbour:
                    self.structural_array.append('back ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(1))
                    #self.structural_array.append([str(1)+'b'])
                
                elif node in self.graph_list[neighbour]:
                    self.structural_array.append('back ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(2))
                    #self.structural_array.append(str(2)+'b')
                    
                else:
                    x = bfs_4(self.graph_list, neighbour, node)
                    self.structural_array.append('back ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(x+1))
                    #self.structural_array.append(str(x+1)+'b')
                
            # otherwise when the neighbour's visit started before the current node's visit:
            elif self.start_time[node] < self.start_time[neighbour]:
                graph_list_copy = copy.deepcopy(self.graph_list)
                graph_list_copy[node].remove(neighbour)
                y = bfs_4(graph_list_copy, node, neighbour)
                self.structural_array.append('forward ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(y-1))
                #self.structural_array.extend((y-1)*['forward']) # -1 to exclude one edge: (A:B,C;B:C;C:[]) ...the dist A --> C is 2 without forward edge, but we are skipping only one activity
                #self.structural_array.append(str(y-1)+'f')
                
            else:
                #Possibly first check, whether two nodes connected by cross-type have identical parent
                numberSkips, numberCross = common_ancestors(self.graph_list, node, neighbour)
                #self.structural_array.append('forward ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(numberSkips))
                self.structural_array.append('cross ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(numberCross))
                #self.structural_array.append(str(numberSkips)+'f')
                #self.structural_array.append(str(numberCross)+'c')
    
        # Indentation corrected:
        self.end_time[node] = self.time
        self.time += 1

In [68]:
logVar["int_strucLengthList3"] = logVar.apply(lambda x: Graph2(x.int_adjList, x.indexList).dfs(), axis =1)
#logVar

In [69]:
#Jacc similarity based on edge types

jacc_graph = matrix_calc(logVar["int_strucLengthList3"],jaccard_similarity)
results(jacc_graph)

NN:   0.9799671592775041
P@10: 0.962216748768473
Tri:  tensor(0.6632)
Sil:  0.04593634271628139


In [70]:
#Jacc sim based on edge types
agg_jacc = Jacc1_dis + Jacc2_dis + jacc_graph
results(agg_cos)

NN:   0.9713628899835797
P@10: 0.942175697865353
Tri:  tensor(0.5821)
Sil:  0.0674574899731124


## Eventually Follows

In [71]:
#Spatial distance between strings


from scipy.spatial import distance


def distanceSpatial(traceString, char1, char2):
    positions_letter1 = [pos for pos, char in enumerate(traceString) if char == char1]
    positions_letter2 = [pos for pos, char in enumerate(traceString) if char == char2]
    
    distList = []
    

    for i in range(len(positions_letter1)):
        for j in range(len(positions_letter2)):
            dist = positions_letter2[j] - positions_letter1[i]
            if dist > 0:
                    #print(dist)
                distList.append(dist)
                    
    
    if not distList: #distList.append(0) #in the case the char1 is after char2 asign dist 0, i.e. char2 cannot be reached from char1
        return 0
    else:
        return 1/min(distList)





def commonDistance(trace1, trace2):
    
    commonSet = set(trace1) & set(trace2)

    commonList = list(commonSet)
    commonList.sort()
    #print(commonList)

    n = len(commonSet)
    dist_matrix1 = np.zeros((n,n))
    dist_matrix2 = np.zeros((n,n))

    for i in range(n):
        for j in range(i, n):
            dist_matrix1[i,j] = distanceSpatial(trace1, commonList[i], commonList[j])
        
    for i in range(n):
        for j in range(i, n):
            dist_matrix2[i,j] = distanceSpatial(trace2, commonList[i], commonList[j])
    
    #print(dist_matrix1, dist_matrix2)
    return distance.cosine(dist_matrix1.ravel(), dist_matrix2.ravel())



#x = 'ABCDEF'
#y = 'ABCDEBCDEBCDEF'
#z = 'ABCDEBCDEF'
#print(dist_matrix)
#distanceSpatial(x, 'A', 'E')
#listVec = logVar["strings"]
#x= listVec[0]
#y= listVec[1]
#commonDistance(x, y)

In [73]:
dist_matrix_evFollows = matrix_calc(logVar["strings"],commonDistance)
agg_evFollows = 0.7*dist_matrix_evFollows + 0.3*cos1_dis 
results(agg_evFollows)

KeyboardInterrupt: 

## Maximal Repeat

In [None]:
from suffix_tree import Tree

In [None]:
#tree.maximal_repeats
def maxRepeat(tree):
    mrList=[]
    for C, path in sorted(tree.maximal_repeats()):
        mrList.append(str(path))
    return mrList

#test_tree = Tree({"A": "aaacdcdcbedbccbadbdebdc"})
#maxRepeat(test_tree)

In [None]:
#create vector based on maximal repeats
logVar["mrList"] = logVar["strings"].apply(lambda x: maxRepeat(Tree({"A": x})))
logVar["mrVector"] = logVar["mrList"].apply(lambda x: createVector(tuple(x)))

In [None]:
#Cosine distance based on maxR

cos_mr = matrix_calc(logVar["mrVector"],cosineDist)
results(cos_mr)

In [None]:
#Euclidean distance based on maxR

euc_mr = matrix_calc(logVar["mrVector"],euclidDist)
results(euc_mr)

In [None]:
#Jaccard similarity based on maxR

jacc_mr = matrix_calc(logVar["mrList"],jaccard_similarity)
results(jacc_mr)

## Optimal Alignments

In [None]:
from Bio import pairwise2
import math


# Define sequences
#seq1 = "ACCGTTTTT"
#seq2 = "ACG"

# Perform global alignment with scoring:
# match = +2, mismatch = -1, gap open = -2, gap extend = 0
#alignments = pairwise2.align.globalms(seq1, seq2, 2, -1, -2, 0)

#alignments
#score --> alignments[0][2]

def optAlign1(string1, string2):
    alignments = pairwise2.align.globalms(string1, string2, 1, 0, 0, 0)    
    return 1 - (alignments[0][2]/max(len(string1),len(string2)))


def optAlign2(string1, string2):
    alignments = pairwise2.align.globalms(string1, string2, 2, -1, -2, 0)    
    return 1 / (1 + math.exp(0.05*alignments[0][2]))

In [None]:
Align_dis1 = matrix_calc(logVar["strings"],optAlign1)
results(Align_dis1)

In [None]:
Align_dis2 = matrix_calc(logVar["strings"],optAlign2)
results(Align_dis2)