In [1]:
import pandas as pd
from datetime import datetime
import pm4py
from sklearn.preprocessing import LabelEncoder
import numpy as np
from scipy.stats import mannwhitneyu
import collections
import copy
from collections import defaultdict
from Levenshtein import distance

In [2]:
d={'trace_variant': 
  [['A','B','C','D'],
  ['C','D','A','B'],
  ['A','X','Y','D'],
  ['C','A','D','B'],
  ['A','B','B','C','C','D'],
  ['A','B','C','D','A','D'],
  ['A','B','D','A','C','D']]} 
logVar = pd.DataFrame(data=d)

In [3]:
logVar

Unnamed: 0,trace_variant
0,"[A, B, C, D]"
1,"[C, D, A, B]"
2,"[A, X, Y, D]"
3,"[C, A, D, B]"
4,"[A, B, B, C, C, D]"
5,"[A, B, C, D, A, D]"
6,"[A, B, D, A, C, D]"


In [4]:
#Change data format from tuple to string
logVar["strings"] = logVar["trace_variant"].apply(lambda x: ''.join(x))
logVar

Unnamed: 0,trace_variant,strings
0,"[A, B, C, D]",ABCD
1,"[C, D, A, B]",CDAB
2,"[A, X, Y, D]",AXYD
3,"[C, A, D, B]",CADB
4,"[A, B, B, C, C, D]",ABBCCD
5,"[A, B, C, D, A, D]",ABCDAD
6,"[A, B, D, A, C, D]",ABDACD


## Levenshtein Distance

In [5]:
#from Levenshtein import distance

List = logVar["strings"]

n = len(List)
dist_matrix = np.zeros((n,n))    # initialize distance matrix to a square of zeros

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = distance(List[i], List[j])
        dist_matrix[j,i] = dist_matrix[i,j]       # for the symmetric part, no computation
        

lev_dis = dist_matrix[0:1]
lev_dis

array([[0., 4., 2., 4., 2., 2., 2.]])

## Normalized Levenshtein Distance 

In [6]:
List = logVar["strings"]

n = len(List)
dist_matrix = np.zeros((n,n))    # initialize distance matrix to a square of zeros

for i in range(n):
    for j in range(i, n):
        dist_matrix[i,j] = distance(List[i], List[j]) / max(len(List[i]),len(List[j]))
        dist_matrix[j,i] = dist_matrix[i,j]       # for the symmetric part, no computation

lev_dis = dist_matrix[0:1]
lev_dis

array([[0.        , 1.        , 0.5       , 1.        , 0.33333333,
        0.33333333, 0.33333333]])

## Cosine Similarity

In [7]:
#Create 1-gram

def createVector(charList):
    #dtype = [('structure', 'S10'), ('relfrequ', float)]
    arrayList = np.array(charList)
    unique, counts = np.unique(arrayList, return_counts=True)
    #calculate relative frequency
    relFrequList = np.array((unique, counts)).T
    uniqueList = list(unique)
    return relFrequList[relFrequList[:, 0].argsort()]
    #check completeness
    #if 'tree' not in uniqueList:
        #relFrequList = np.append(relFrequList, np.array([['tree', 0]]), axis=0)
        #print(relFrequList)

        
#Change data format from string to list of unique characters
logVar["1-gram"] = logVar["trace_variant"].apply(lambda x: createVector(x))
logVar

Unnamed: 0,trace_variant,strings,1-gram
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]"


In [8]:
def alignArrays(array1, array2):
    commonSet = set(array1[:,0]).union(array2[:,0])
    #print(commonSet)
    
    for i in commonSet:
        if i not in array1[:,0]:
            array1 = np.append(array1, np.array([[i, '0']]), axis=0)
        if i not in array2[:,0]:
            array2 = np.append(array2, np.array([[i, '0']]), axis=0)
    return array1[array1[:, 0].argsort()], array2[array2[:, 0].argsort()]


#v = np.array([['*A', '1'],
# ['AB', '1'],
# ['BC', '1'],
# ['CD', '1'],
# ['DE', '1'],
# ['EF', '1'],
# ['F$', '1']])
#w = np.array([['*A', '1'],
# ['AB', '1'],
# ['BC', '2'],
# ['CD', '2'],
# ['DE', '2'],
# ['EB', '1'],
# ['EF', '1'],
# ['F$', '1']])
#alignArrays(v, w)
#

In [9]:
#Create 2-gram

#Change data format from string to list of unique characters
logVar["charList"] = logVar["trace_variant"].apply(lambda x: list(x))

def df_list(list_of_char):
    extList = list_of_char.copy()
    extList.insert(0, '*') 
    extList.append('$')
    list_new = []
    for i in range(len(extList)):
        new = ''.join(extList[i:i+2])
        list_new.append(new)
    del list_new[-1]
    return list_new


In [10]:
#Change data format from string to list of unique characters
logVar["dfList"] = logVar["charList"].apply(lambda x: df_list(x))
logVar["2-gram"] = logVar["dfList"].apply(lambda x: createVector(x))
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ..."
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ..."
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ..."


In [11]:
#Create 3-gram

#Change data format from string to list of unique characters
logVar["charList"] = logVar["trace_variant"].apply(lambda x: list(x))

def df_list2(list_of_char):
    extList = list_of_char.copy()
    extList.insert(0, '*') 
    extList.append('$')
    list_new = []
    for i in range(len(extList) - 1):
        new = ''.join(extList[i:i+3])
        list_new.append(new)
    del list_new[-1]
    return list_new


In [12]:
logVar["dfList2"] = logVar["charList"].apply(lambda x: df_list2(x))
logVar["3-gram"] = logVar["dfList2"].apply(lambda x: createVector(x))
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,..."
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,..."
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,..."


# New

In [13]:
from scipy.spatial import distance

def cosineDist(frequVector1, frequVector2):
    #Vector1, Vector2 = alignArrays_OneHotEmbedding(frequVector1, frequVector2) ################OLD
    Vector1, Vector2 = alignArrays(frequVector1, frequVector2)
    a = Vector1[:,1].astype(int)
    b = Vector2[:,1].astype(int)
    cosine_dist = distance.cosine(a, b)
    return cosine_dist


#cosineDist(v,w)

In [14]:
def matrix_calc(features, distance):
    n = len(features)
    dist_matrix = np.zeros((n,n))
    
    for i in range(n):
        for j in range(i, n):
            dist_matrix[i,j] = distance(features[i], features[j])
            dist_matrix[j,i] = dist_matrix[i,j]
            
    return dist_matrix[0:1]

In [15]:
#Cosine distance based on 1-gram

cos1_dis = matrix_calc(logVar["1-gram"],cosineDist)
cos1_dis

array([[0.       , 0.       , 0.5      , 0.       , 0.0513167, 0.0513167,
        0.0513167]])

In [16]:
#Cosine distance based on 2-gram

cos2_dis = matrix_calc(logVar["2-gram"],cosineDist)
cos2_dis

array([[0.        , 0.6       , 0.6       , 1.        , 0.15484575,
        0.15484575, 0.3238766 ]])

In [17]:
#Cosine distance based on 3-gram

matrix_calc(logVar["3-gram"],cosineDist)

array([[0.        , 1.        , 1.        , 1.        , 0.59175171,
        0.38762756, 0.59175171]])

In [18]:
#Aggregated Cosine Distance
cos1_dis + cos2_dis

array([[0.        , 0.6       , 1.1       , 1.        , 0.20616245,
        0.20616245, 0.3751933 ]])

# NEW

## Euclidean Distance

In [19]:
# Euclidean distance
# see https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy

def euclidDist(frequVector1, frequVector2):
    Vector1, Vector2 = alignArrays(frequVector1, frequVector2)
    a = Vector1[:,1].astype(float)
    b = Vector2[:,1].astype(float)
    euclidean_dist = np.linalg.norm(a-b)
    return euclidean_dist

In [20]:
#Euclidean distance based on 1-gram

euc1_dis = matrix_calc(logVar["1-gram"],euclidDist)
euc1_dis

array([[0.        , 0.        , 2.        , 0.        , 1.41421356,
        1.41421356, 1.41421356]])

In [21]:
#Euclidean distance based on 2-gram

euc2_dis = matrix_calc(logVar["2-gram"],euclidDist)
euc2_dis

array([[0.        , 2.44948974, 2.44948974, 3.16227766, 1.41421356,
        1.41421356, 2.        ]])

In [22]:
#Euclidean distance based on 3-gram

euc3_dis = matrix_calc(logVar["3-gram"],euclidDist)
euc3_dis

array([[0.        , 2.82842712, 2.82842712, 2.82842712, 2.44948974,
        2.        , 2.44948974]])

## Jaccard Distance

In [23]:
def jaccard_similarity(list1, list2):
    s1, s2 = set(list1), set(list2)
    return 1 - len(s1 & s2) / len(s1 | s2)

In [24]:
#Based on 1-gram

Jacc1_dis = matrix_calc(logVar["charList"],jaccard_similarity)
Jacc1_dis

array([[0.        , 0.        , 0.66666667, 0.        , 0.        ,
        0.        , 0.        ]])

In [25]:
#Based on 2-gram

Jacc2_dis = matrix_calc(logVar["dfList"],jaccard_similarity)
Jacc2_dis

array([[0.        , 0.75      , 0.75      , 1.        , 0.28571429,
        0.28571429, 0.5       ]])

In [26]:
#Based on 3-gram

Jacc3_dis = matrix_calc(logVar["dfList2"],jaccard_similarity)
Jacc3_dis

array([[0.        , 1.        , 1.        , 1.        , 0.75      ,
        0.57142857, 0.75      ]])

In [27]:
#Aggregated Jaccard Distance

Jacc1_dis + Jacc2_dis

array([[0.        , 0.75      , 1.41666667, 1.        , 0.28571429,
        0.28571429, 0.5       ]])

## Graph based measures

In [28]:
#Now consider edge types

In [29]:
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,..."
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,..."
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,..."


# NEW Encoding

In [30]:
# 1. encode strings as integers


# Step 1: Collect all unique elements across all lists
unique_elements = set(element for sublist in logVar['charList'] for element in sublist)


# Step 2: Create a mapping from each unique element to a unique integer
mapping = {value: idx + 1 for idx, value in enumerate(unique_elements)}

           
# Step 3: Apply the mapping to each list
logVar["intList"] = logVar["charList"].apply(lambda lst: [mapping[item] for item in lst])



In [31]:
mapping

{'B': 1, 'D': 2, 'C': 3, 'X': 4, 'Y': 5, 'A': 6}

In [32]:
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram,intList
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]","[6, 1, 3, 2]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]","[3, 2, 6, 1]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]","[6, 4, 5, 2]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]","[3, 6, 2, 1]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,...","[6, 1, 1, 3, 3, 2]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,...","[6, 1, 3, 2, 6, 2]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,...","[6, 1, 2, 6, 3, 2]"


# NEW

In [33]:
# 2. transfer intList to int_tupleList

#Create tuple lists
def tuple_list(list_of_encodedActivities):
    #list.insert(0, '*')
    #list.append('*')
    list_new = []
    last_element = list_of_encodedActivities[-1]
    for i in range(len(list_of_encodedActivities)):
        new = tuple(list_of_encodedActivities[i:i+2])
        list_new.append(new)
    del list_new[-1]
    if list_of_encodedActivities.count(last_element) == 1: #check wether last activity in trace has some adjancency relation
        list_new.append((last_element,)) ### Indicate that las element has no neighbor
    return list_new

#q = [0,0,0,0,1,1,2,3,4,5,3,2,4,0,5,6]
#tuple_list(q)

logVar["int_tupleList"] = logVar["intList"].apply(lambda x: tuple_list(x))
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram,intList,int_tupleList
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]","[6, 1, 3, 2]","[(6, 1), (1, 3), (3, 2), (2,)]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]","[3, 2, 6, 1]","[(3, 2), (2, 6), (6, 1), (1,)]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]","[6, 4, 5, 2]","[(6, 4), (4, 5), (5, 2), (2,)]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]","[3, 6, 2, 1]","[(3, 6), (6, 2), (2, 1), (1,)]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,...","[6, 1, 1, 3, 3, 2]","[(6, 1), (1, 1), (1, 3), (3, 3), (3, 2), (2,)]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,...","[6, 1, 3, 2, 6, 2]","[(6, 1), (1, 3), (3, 2), (2, 6), (6, 2)]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,...","[6, 1, 2, 6, 3, 2]","[(6, 1), (1, 2), (2, 6), (6, 3), (3, 2)]"


In [34]:
# 3. generate Adjacency List

def adj_list(list_of_tuples):
    adj_list_new = {}
    try:
        for node1, node2 in list_of_tuples:
            #print(node1, node2)
            if node1 not in adj_list_new:
                newlist = []
                newlist.append(node2)
                adj_list_new[node1] = newlist
                #print(adj_list3)
        
            else:
                if node2 not in adj_list_new[node1]:
                    #mylist.extend(adj_list3[node1])
                    adj_list_new[node1].append(node2)
                    #print(adj_list3)
                    #adj_list3[node1] = mylist
    
    #in case activity has no adjacent activity - only possible for last activity --> tuple: (lastAct,)
    except ValueError as ve:
        lastValue = list_of_tuples[-1][0] 
        adj_list_new[lastValue] = list()
    return list(adj_list_new.values())

#q = [0,0,0,0,1,1,2,3,4,5,3,2,4,0,5,6]
#l = tuple_list(q)
#adj_list(l)

logVar["int_adjList"] = logVar["int_tupleList"].apply(lambda x: adj_list(x))
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram,intList,int_tupleList,int_adjList
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]","[6, 1, 3, 2]","[(6, 1), (1, 3), (3, 2), (2,)]","[[1], [3], [2], []]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]","[3, 2, 6, 1]","[(3, 2), (2, 6), (6, 1), (1,)]","[[2], [6], [1], []]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]","[6, 4, 5, 2]","[(6, 4), (4, 5), (5, 2), (2,)]","[[4], [5], [2], []]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]","[3, 6, 2, 1]","[(3, 6), (6, 2), (2, 1), (1,)]","[[6], [2], [1], []]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,...","[6, 1, 1, 3, 3, 2]","[(6, 1), (1, 1), (1, 3), (3, 3), (3, 2), (2,)]","[[1], [1, 3], [3, 2], []]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,...","[6, 1, 3, 2, 6, 2]","[(6, 1), (1, 3), (3, 2), (2, 6), (6, 2)]","[[1, 2], [3], [2], [6]]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,...","[6, 1, 2, 6, 3, 2]","[(6, 1), (1, 2), (2, 6), (6, 3), (3, 2)]","[[1, 3], [2], [6], [2]]"


# NEW

In [35]:
#Derive minimum distance between nodes for loops and skips

from collections import deque

def bfs_4(adjacency_list, start, target):
    if start == target:
        return 0

    visited = [False] * len(adjacency_list)
    queue = deque([(start, 0)])  # (current_node, current_distance)

    while queue:
        current, distance = queue.popleft()
        if visited[current]:
            continue
        visited[current] = True

        for neighbor in adjacency_list[current]:
            if neighbor == target:
                return distance + 1
            if not visited[neighbor]:
                queue.append((neighbor, distance + 1))

    return float('nan')  # Return -1 if target is not reachable

# Example usage
#adj_list = [[1], [2], [3], []]
#start_node = 0
#target_node = 3
#print(bfs_4(adj_list, start_node, target_node))


In [36]:
#Derive minimum distance between nodes for choice

from collections import defaultdict, deque
import copy

def reverse_graph(graph):
    reversed_graph = defaultdict(list)
    for node in graph:
        for neighbor in graph[node]:
            reversed_graph[neighbor].append(node)
    return reversed_graph


def bfs_5(graph, start, end):
    queue = deque([(start, 0)])
    seen = set()
    visited = {}
    while queue:
        node, distance = queue.popleft()
        if node in seen:
            continue
        seen.add(node)
        if node == end: # maybe quicker if adjacent directly checked
            return visited
        for adjacent in graph.get(node, []):
            queue.append((adjacent, distance + 1))
            if adjacent not in visited:
                visited.update({adjacent:distance})

            
def common_ancestors(adj_graph, node1, node2): 
    #remove choice type edge between node1 and node2
    
    ##########################
    # CHANGE
    ##########################
    graph = copy.deepcopy(adj_graph)
    graph[node1].remove(node2) #maybe issue, if graph is changed and not copy of graph?
    graph = {v: k for v, k in enumerate(graph)}
    graphReverse = reverse_graph(graph)
    setNode1 = bfs_5(graphReverse, node1, 0)
    setNode2 = bfs_5(graphReverse, node2, 0)
    if next((a for a in list(setNode1) if a in list(setNode2)), None) == None:
        firstCommonAnces = next((a for a in list(setNode2) if a in list(setNode1)), None)
    else:
        firstCommonAnces = next((a for a in list(setNode1) if a in list(setNode2)), 0)
    
    #uses a hash map to identify the first common ancestor in both lists
    #looks for the first common ancestor in setNode1, which can also be found in setNode2 
    #--> this might not be the closest distance between setNode1 and setNode2
    #--> e.g., for x= [0,1,3,7,5,6] and y= [4,5,7,8,3] 7 might be closest ancestor, although algo detects 3 !
    #distance = setNode1[firstCommonAnces] + setNode2[firstCommonAnces]
    
    
    if firstCommonAnces != None:   
        ancesDistNode1 =  setNode1[firstCommonAnces] + 1 #the edge from node1 to first parent is counted as 0 by algorithm, therefore +1
        ancesDistNode2 =  setNode2[firstCommonAnces] + 1
        numberSkips = abs(ancesDistNode1 - ancesDistNode2)
        numberCross = min(ancesDistNode1, ancesDistNode2)
    else:
        numberSkips, numberCross = (0,1)
    return numberSkips, numberCross
    #if all(x in crossType for x in i):
    

    

#graphList = [[1], [2, 4, 1], [3, 2, 1], [], [5, 4], [5, 4, 6], [7], []]
#c = [[1, 4], [2], [3], [0, 5], [3, 5], []]
#c2 = {v: k for v, k in enumerate(c)}
#common_ancestors(c, 4, 5)
#reverse_graph(c2)

In [37]:
#Create List for decoding traces
from collections import OrderedDict
logVar["indexList"] = logVar["intList"].apply(lambda x: list(OrderedDict.fromkeys(x)))

In [38]:
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram,intList,int_tupleList,int_adjList,indexList
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]","[6, 1, 3, 2]","[(6, 1), (1, 3), (3, 2), (2,)]","[[1], [3], [2], []]","[6, 1, 3, 2]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]","[3, 2, 6, 1]","[(3, 2), (2, 6), (6, 1), (1,)]","[[2], [6], [1], []]","[3, 2, 6, 1]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]","[6, 4, 5, 2]","[(6, 4), (4, 5), (5, 2), (2,)]","[[4], [5], [2], []]","[6, 4, 5, 2]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]","[3, 6, 2, 1]","[(3, 6), (6, 2), (2, 1), (1,)]","[[6], [2], [1], []]","[3, 6, 2, 1]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,...","[6, 1, 1, 3, 3, 2]","[(6, 1), (1, 1), (1, 3), (3, 3), (3, 2), (2,)]","[[1], [1, 3], [3, 2], []]","[6, 1, 3, 2]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,...","[6, 1, 3, 2, 6, 2]","[(6, 1), (1, 3), (3, 2), (2, 6), (6, 2)]","[[1, 2], [3], [2], [6]]","[6, 1, 3, 2]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,...","[6, 1, 2, 6, 3, 2]","[(6, 1), (1, 2), (2, 6), (6, 3), (3, 2)]","[[1, 3], [2], [6], [2]]","[6, 1, 2, 3]"


### Cosine Edge Type + length (no df relations)

In [39]:
class Graph1:
    # instance variables
    def __init__(self, graph_list2, indexList):
        # v is the number of nodes/vertices
        self.time = 0
        self.traversal_array = []
        self.structural_array = [['sequ', 1]]
        #self.structural_array = []
        self.graph_list = graph_list2
        self.v = len(graph_list2)
        self.indexList = indexList

    # depth-first search
    def dfs(self):
        self.start_time = [-1]*self.v
        self.end_time = [-1]*self.v
 
        for node in range(self.v):
            if self.start_time[node] == -1:
                self.traverse_dfs(node)
                
        return np.array(self.structural_array)
        #print("DFS Traversal: ", self.traversal_array)

 
    def traverse_dfs(self, node):
        self.traversal_array.append(node)
        # get the starting time
        self.start_time[node] = self.time
        self.time += 1
        # traverse through the neighbours
        for neighbour in self.graph_list[node]:

            # when the neighbor was not yet visited
            if self.start_time[neighbour] == -1:                
                self.structural_array[0][1] += 0
                self.traverse_dfs(neighbour)
                
            # otherwise when the neighbour's visit is still ongoing:
            elif self.end_time[neighbour] == -1:
                #if node == neighbour:
                #    self.structural_array.append(['1back ',1])

                
                #elif node in self.graph_list[neighbour]:
                #    self.structural_array.append(['2back ',2])

                    
                #else:
                x = bfs_4(self.graph_list, neighbour, node)
                self.structural_array.append([str(x+1)+'back ',x+1]) # x=1 is considered as self-loop that's why x+1

            # otherwise when the neighbour's visit started before the current node's visit:
            elif self.start_time[node] < self.start_time[neighbour]:
                graph_list_copy = copy.deepcopy(self.graph_list)
                graph_list_copy[node].remove(neighbour)
                y = bfs_4(graph_list_copy, node, neighbour)
                self.structural_array.append([str(y-1)+'forward ',y-1]) # -1 to exclude one edge: (A:B,C;B:C;C:[]) ...the dist A --> C is 2 without forward edge, but we are skipping only one activity

                
            else:
                numberSkips, numberCross = common_ancestors(self.graph_list, node, neighbour)
                self.structural_array.append([str(numberCross)+'cross ',numberCross])

    
        # Indentation corrected:
        self.end_time[node] = self.time
        self.time += 1



In [40]:
#Graph1(logVar['int_adjList'][0],logVar['indexList'][0]).dfs()

In [41]:
# Create new encoding

def intEncoder(character_List):
    return [np.where(np.array(list(dict.fromkeys(character_List)))==e)[0][0]for e in character_List]

logVar["intList2"] = logVar["charList"].apply(lambda x: intEncoder(x))

In [42]:
logVar["int_tupleList2"] = logVar["intList2"].apply(lambda x: tuple_list(x))
logVar["int_adjList2"] = logVar["int_tupleList2"].apply(lambda x: adj_list(x))

# NEW

In [43]:
from collections import Counter

def transform_list_of_pairs(pairs):
    return [pair[0] for pair in pairs]



def count_entries(input_list):
    # Count the occurrences of each unique entry in the list
    counter = Counter(input_list)
    
    # Create a NumPy array from the counter dictionary
    result = np.array([[key, count] for key, count in counter.items()], dtype=object)
    
    return result

# Example usage
#input_list = ['sequ', '2back', '2back']
#result = count_entries(input_list)
#print(result)

logVar["int_strucLengthList2"] = logVar.apply(lambda x: Graph1(x.int_adjList2, x.indexList).dfs(), axis =1)
logVar["relFrequVec1"] = logVar["int_strucLengthList2"].apply(lambda x: transform_list_of_pairs(x))
logVar["relFrequVec1"] = logVar["relFrequVec1"].apply(lambda x: count_entries(x))

In [44]:
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram,intList,int_tupleList,int_adjList,indexList,intList2,int_tupleList2,int_adjList2,int_strucLengthList2,relFrequVec1
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]","[6, 1, 3, 2]","[(6, 1), (1, 3), (3, 2), (2,)]","[[1], [3], [2], []]","[6, 1, 3, 2]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]","[3, 2, 6, 1]","[(3, 2), (2, 6), (6, 1), (1,)]","[[2], [6], [1], []]","[3, 2, 6, 1]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]","[6, 4, 5, 2]","[(6, 4), (4, 5), (5, 2), (2,)]","[[4], [5], [2], []]","[6, 4, 5, 2]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]","[3, 6, 2, 1]","[(3, 6), (6, 2), (2, 1), (1,)]","[[6], [2], [1], []]","[3, 6, 2, 1]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,...","[6, 1, 1, 3, 3, 2]","[(6, 1), (1, 1), (1, 3), (3, 3), (3, 2), (2,)]","[[1], [1, 3], [3, 2], []]","[6, 1, 3, 2]","[0, 1, 1, 2, 2, 3]","[(0, 1), (1, 1), (1, 2), (2, 2), (2, 3), (3,)]","[[1], [1, 2], [2, 3], []]","[[sequ, 1], [1back , 1], [1back , 1]]","[[sequ, 1], [1back , 2]]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,...","[6, 1, 3, 2, 6, 2]","[(6, 1), (1, 3), (3, 2), (2, 6), (6, 2)]","[[1, 2], [3], [2], [6]]","[6, 1, 3, 2]","[0, 1, 2, 3, 0, 3]","[(0, 1), (1, 2), (2, 3), (3, 0), (0, 3)]","[[1, 3], [2], [3], [0]]","[[sequ, 1], [2back , 2], [2forward , 2]]","[[sequ, 1], [2back , 1], [2forward , 1]]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,...","[6, 1, 2, 6, 3, 2]","[(6, 1), (1, 2), (2, 6), (6, 3), (3, 2)]","[[1, 3], [2], [6], [2]]","[6, 1, 2, 3]","[0, 1, 2, 0, 3, 2]","[(0, 1), (1, 2), (2, 0), (0, 3), (3, 2)]","[[1, 3], [2], [0], [2]]","[[sequ, 1], [3back , 3], [1cross , 1]]","[[sequ, 1], [3back , 1], [1cross , 1]]"


In [45]:
#Cosine distance based on edge types

cos_graph = matrix_calc(logVar["relFrequVec1"],cosineDist)
cos_graph

array([[0.        , 0.        , 0.        , 0.        , 0.5527864 ,
        0.42264973, 0.42264973]])

In [46]:
#Aggregated Cosine Distance
cos1_dis + cos2_dis + cos_graph

array([[0.        , 0.6       , 1.1       , 1.        , 0.75894885,
        0.62881218, 0.79784303]])

### Jaccard Edge Type and length + df relation

In [47]:
#NEW

class Graph2:
    # instance variables
    def __init__(self, graph_list2, indexList):
        # v is the number of nodes/vertices
        self.time = 0
        self.traversal_array = []
        self.structural_array = []
        self.graph_list = graph_list2
        self.v = len(graph_list2)
        self.indexList = indexList

    # depth-first search
    def dfs(self):
        self.start_time = [-1]*self.v
        self.end_time = [-1]*self.v
 
        for node in range(self.v):
            if self.start_time[node] == -1:
                self.traverse_dfs(node)
                
        return self.structural_array
        #print("DFS Traversal: ", self.traversal_array)

 
    def traverse_dfs(self, node):
        self.traversal_array.append(node)
        # get the starting time
        self.start_time[node] = self.time
        self.time += 1
        # traverse through the neighbours
        for neighbour in self.graph_list[node]:

            # when the neighbor was not yet visited
            if self.start_time[neighbour] == -1:                
                #self.structural_array[0][1] += 0
                self.structural_array.append('tree')
                self.traverse_dfs(neighbour)
                
            # otherwise when the neighbour's visit is still ongoing:
            elif self.end_time[neighbour] == -1:
                #if node == neighbour:
                #    self.structural_array.append(['1back ',1])

                
                #elif node in self.graph_list[neighbour]:
                #    self.structural_array.append(['2back ',2])

                    
                #else:
                x = bfs_4(self.graph_list, neighbour, node)
                self.structural_array.append('back ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(x+1)) # x=1 is considered as self-loop that's why x+1

            # otherwise when the neighbour's visit started before the current node's visit:
            elif self.start_time[node] < self.start_time[neighbour]:
                graph_list_copy = copy.deepcopy(self.graph_list)
                graph_list_copy[node].remove(neighbour)
                y = bfs_4(graph_list_copy, node, neighbour)
                self.structural_array.append('forward ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(y-1)) # -1 to exclude one edge: (A:B,C;B:C;C:[]) ...the dist A --> C is 2 without forward edge, but we are skipping only one activity

                
            else:
                numberSkips, numberCross = common_ancestors(self.graph_list, node, neighbour)
                self.structural_array.append('cross ' + str(self.indexList[node]) + ' ' + str(self.indexList[neighbour]) + ' ' + str(numberCross))

    
        # Indentation corrected:
        self.end_time[node] = self.time
        self.time += 1



In [48]:
logVar["int_strucLengthList3"] = logVar.apply(lambda x: Graph2(x.int_adjList2, x.indexList).dfs(), axis =1)
#logVar["relFrequVec3"] = logVar["int_strucLengthList3"].apply(lambda x: createVector(x))
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram,intList,int_tupleList,int_adjList,indexList,intList2,int_tupleList2,int_adjList2,int_strucLengthList2,relFrequVec1,int_strucLengthList3
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]","[6, 1, 3, 2]","[(6, 1), (1, 3), (3, 2), (2,)]","[[1], [3], [2], []]","[6, 1, 3, 2]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]"
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]","[3, 2, 6, 1]","[(3, 2), (2, 6), (6, 1), (1,)]","[[2], [6], [1], []]","[3, 2, 6, 1]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]"
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]","[6, 4, 5, 2]","[(6, 4), (4, 5), (5, 2), (2,)]","[[4], [5], [2], []]","[6, 4, 5, 2]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]"
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]","[3, 6, 2, 1]","[(3, 6), (6, 2), (2, 1), (1,)]","[[6], [2], [1], []]","[3, 6, 2, 1]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]"
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,...","[6, 1, 1, 3, 3, 2]","[(6, 1), (1, 1), (1, 3), (3, 3), (3, 2), (2,)]","[[1], [1, 3], [3, 2], []]","[6, 1, 3, 2]","[0, 1, 1, 2, 2, 3]","[(0, 1), (1, 1), (1, 2), (2, 2), (2, 3), (3,)]","[[1], [1, 2], [2, 3], []]","[[sequ, 1], [1back , 1], [1back , 1]]","[[sequ, 1], [1back , 2]]","[tree, back 1 1 1, tree, back 3 3 1, tree]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,...","[6, 1, 3, 2, 6, 2]","[(6, 1), (1, 3), (3, 2), (2, 6), (6, 2)]","[[1, 2], [3], [2], [6]]","[6, 1, 3, 2]","[0, 1, 2, 3, 0, 3]","[(0, 1), (1, 2), (2, 3), (3, 0), (0, 3)]","[[1, 3], [2], [3], [0]]","[[sequ, 1], [2back , 2], [2forward , 2]]","[[sequ, 1], [2back , 1], [2forward , 1]]","[tree, tree, tree, back 2 6 2, forward 6 2 2]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,...","[6, 1, 2, 6, 3, 2]","[(6, 1), (1, 2), (2, 6), (6, 3), (3, 2)]","[[1, 3], [2], [6], [2]]","[6, 1, 2, 3]","[0, 1, 2, 0, 3, 2]","[(0, 1), (1, 2), (2, 0), (0, 3), (3, 2)]","[[1, 3], [2], [0], [2]]","[[sequ, 1], [3back , 3], [1cross , 1]]","[[sequ, 1], [3back , 1], [1cross , 1]]","[tree, tree, back 2 6 3, tree, cross 3 2 1]"


In [49]:
#Jacc similarity based on edge types

jacc_graph = matrix_calc(logVar["int_strucLengthList3"],jaccard_similarity)
jacc_graph

array([[0.        , 0.        , 0.        , 0.        , 0.66666667,
        0.66666667, 0.66666667]])

In [50]:
Jacc1_dis + Jacc2_dis + jacc_graph

array([[0.        , 0.75      , 1.41666667, 1.        , 0.95238095,
        0.95238095, 1.16666667]])

## Eventually Follows Distance

In [51]:
#Spatial distance between strings


from scipy.spatial import distance


def distanceSpatial(traceString, char1, char2):
    positions_letter1 = [pos for pos, char in enumerate(traceString) if char == char1]
    positions_letter2 = [pos for pos, char in enumerate(traceString) if char == char2]
    
    distList = []
    

    for i in range(len(positions_letter1)):
        for j in range(len(positions_letter2)):
            dist = positions_letter2[j] - positions_letter1[i]
            if dist > 0:
                    #print(dist)
                distList.append(dist)
                    
    
    if not distList: #distList.append(0) #in the case the char1 is after char2 asign dist 0, i.e. char2 cannot be reached from char1
        return 0
    else:
        return 1/min(distList)





def commonDistance(trace1, trace2):
    
    commonSet = set(trace1) & set(trace2)

    commonList = list(commonSet)
    commonList.sort()
    #print(commonList)

    n = len(commonSet)
    dist_matrix1 = np.zeros((n,n))
    dist_matrix2 = np.zeros((n,n))

    for i in range(n):
        for j in range(i, n):
            dist_matrix1[i,j] = distanceSpatial(trace1, commonList[i], commonList[j])
        
    for i in range(n):
        for j in range(i, n):
            dist_matrix2[i,j] = distanceSpatial(trace2, commonList[i], commonList[j])
    
    #print(dist_matrix1, dist_matrix2)
    return distance.cosine(dist_matrix1.ravel(), dist_matrix2.ravel())



#x = 'ABCDEF'
#y = 'ABCDEBCDEBCDEF'
#z = 'ABCDEBCDEF'
#print(dist_matrix)
#distanceSpatial(x, 'A', 'E')
#listVec = logVar["strings"]
#x= listVec[0]
#y= listVec[1]
#commonDistance(x, y)

In [52]:
#Calculate cosine similarity based on Spatial distance

listVec = logVar["strings"]

n = len(listVec)
dist_matrix = np.zeros((n,n))    # initialize distance matrix to a square of zeros

for i in range(n):
    for j in range(i, n):
        #print(listVec[i], listVec[j])
        dist_matrix[i,j] = commonDistance(listVec[i], listVec[j])
        #print(dist_matrix[i,j])
        dist_matrix[j,i] = dist_matrix[i,j]       # for the symmetric part, no computation
        
dist_matrix_evFollow = dist_matrix[0:1]

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [53]:
0.7*dist_matrix_evFollow + 0.3*cos1_dis

array([[0.        , 0.17905411, 0.15      , 0.29897567, 0.16942104,
        0.07171677, 0.11317534]])

## Optimal Alignments

In [54]:
from Bio import pairwise2
import math


# Define sequences
#seq1 = "ACCGTTTTT"
#seq2 = "ACG"

# Perform global alignment with scoring:
# match = +2, mismatch = -1, gap open = -2, gap extend = 0
#alignments = pairwise2.align.globalms(seq1, seq2, 2, -1, -2, 0)

#alignments
#score --> alignments[0][2]

def optAlign1(string1, string2):
    alignments = pairwise2.align.globalms(string1, string2, 1, 0, 0, 0)    
    return 1 - (alignments[0][2]/max(len(string1),len(string2)))


def optAlign2(string1, string2):
    alignments = pairwise2.align.globalms(string1, string2, 2, -1, -2, 0)    
    return 1 / (1 + math.exp(0.05*alignments[0][2]))


In [55]:
Align_dis1 = matrix_calc(logVar["strings"],optAlign1)
Align_dis1

array([[0.        , 0.5       , 0.5       , 0.5       , 0.33333333,
        0.33333333, 0.33333333]])

In [56]:
Align_dis2 = matrix_calc(logVar["strings"],optAlign2)
Align_dis2

array([[0.40131234, 0.5       , 0.47502081, 0.52497919, 0.42555748,
        0.42555748, 0.42555748]])

## Maximal Repeat

In [57]:
from suffix_tree import Tree

In [58]:
#tree.maximal_repeats
def maxRepeat(tree):
    mrList=[]
    for C, path in sorted(tree.maximal_repeats()):
        mrList.append(str(path))
    return mrList

test_tree = Tree({"A": "aaacdcdcbedbccbadbdebdc"})
maxRepeat(test_tree)

['a', 'a a', 'b', 'b d', 'c', 'c b', 'c d c', 'd', 'd b', 'd c', 'e']

In [59]:
#create vector based on maximal repeats
logVar["mrList"] = logVar["strings"].apply(lambda x: maxRepeat(Tree({"A": x})))
logVar["mrVector"] = logVar["mrList"].apply(lambda x: createVector(tuple(x)))

In [60]:
logVar

Unnamed: 0,trace_variant,strings,1-gram,charList,dfList,2-gram,dfList2,3-gram,intList,int_tupleList,int_adjList,indexList,intList2,int_tupleList2,int_adjList2,int_strucLengthList2,relFrequVec1,int_strucLengthList3,mrList,mrVector
0,"[A, B, C, D]",ABCD,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[A, B, C, D]","[*A, AB, BC, CD, D$]","[[*A, 1], [AB, 1], [BC, 1], [CD, 1], [D$, 1]]","[*AB, ABC, BCD, CD$]","[[*AB, 1], [ABC, 1], [BCD, 1], [CD$, 1]]","[6, 1, 3, 2]","[(6, 1), (1, 3), (3, 2), (2,)]","[[1], [3], [2], []]","[6, 1, 3, 2]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]",[],[]
1,"[C, D, A, B]",CDAB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, D, A, B]","[*C, CD, DA, AB, B$]","[[*C, 1], [AB, 1], [B$, 1], [CD, 1], [DA, 1]]","[*CD, CDA, DAB, AB$]","[[*CD, 1], [AB$, 1], [CDA, 1], [DAB, 1]]","[3, 2, 6, 1]","[(3, 2), (2, 6), (6, 1), (1,)]","[[2], [6], [1], []]","[3, 2, 6, 1]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]",[],[]
2,"[A, X, Y, D]",AXYD,"[[A, 1], [D, 1], [X, 1], [Y, 1]]","[A, X, Y, D]","[*A, AX, XY, YD, D$]","[[*A, 1], [AX, 1], [D$, 1], [XY, 1], [YD, 1]]","[*AX, AXY, XYD, YD$]","[[*AX, 1], [AXY, 1], [XYD, 1], [YD$, 1]]","[6, 4, 5, 2]","[(6, 4), (4, 5), (5, 2), (2,)]","[[4], [5], [2], []]","[6, 4, 5, 2]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]",[],[]
3,"[C, A, D, B]",CADB,"[[A, 1], [B, 1], [C, 1], [D, 1]]","[C, A, D, B]","[*C, CA, AD, DB, B$]","[[*C, 1], [AD, 1], [B$, 1], [CA, 1], [DB, 1]]","[*CA, CAD, ADB, DB$]","[[*CA, 1], [ADB, 1], [CAD, 1], [DB$, 1]]","[3, 6, 2, 1]","[(3, 6), (6, 2), (2, 1), (1,)]","[[6], [2], [1], []]","[3, 6, 2, 1]","[0, 1, 2, 3]","[(0, 1), (1, 2), (2, 3), (3,)]","[[1], [2], [3], []]","[[sequ, 1]]","[[sequ, 1]]","[tree, tree, tree]",[],[]
4,"[A, B, B, C, C, D]",ABBCCD,"[[A, 1], [B, 2], [C, 2], [D, 1]]","[A, B, B, C, C, D]","[*A, AB, BB, BC, CC, CD, D$]","[[*A, 1], [AB, 1], [BB, 1], [BC, 1], [CC, 1], ...","[*AB, ABB, BBC, BCC, CCD, CD$]","[[*AB, 1], [ABB, 1], [BBC, 1], [BCC, 1], [CCD,...","[6, 1, 1, 3, 3, 2]","[(6, 1), (1, 1), (1, 3), (3, 3), (3, 2), (2,)]","[[1], [1, 3], [3, 2], []]","[6, 1, 3, 2]","[0, 1, 1, 2, 2, 3]","[(0, 1), (1, 1), (1, 2), (2, 2), (2, 3), (3,)]","[[1], [1, 2], [2, 3], []]","[[sequ, 1], [1back , 1], [1back , 1]]","[[sequ, 1], [1back , 2]]","[tree, back 1 1 1, tree, back 3 3 1, tree]","[B, C]","[[B, 1], [C, 1]]"
5,"[A, B, C, D, A, D]",ABCDAD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, C, D, A, D]","[*A, AB, BC, CD, DA, AD, D$]","[[*A, 1], [AB, 1], [AD, 1], [BC, 1], [CD, 1], ...","[*AB, ABC, BCD, CDA, DAD, AD$]","[[*AB, 1], [ABC, 1], [AD$, 1], [BCD, 1], [CDA,...","[6, 1, 3, 2, 6, 2]","[(6, 1), (1, 3), (3, 2), (2, 6), (6, 2)]","[[1, 2], [3], [2], [6]]","[6, 1, 3, 2]","[0, 1, 2, 3, 0, 3]","[(0, 1), (1, 2), (2, 3), (3, 0), (0, 3)]","[[1, 3], [2], [3], [0]]","[[sequ, 1], [2back , 2], [2forward , 2]]","[[sequ, 1], [2back , 1], [2forward , 1]]","[tree, tree, tree, back 2 6 2, forward 6 2 2]","[A, D]","[[A, 1], [D, 1]]"
6,"[A, B, D, A, C, D]",ABDACD,"[[A, 2], [B, 1], [C, 1], [D, 2]]","[A, B, D, A, C, D]","[*A, AB, BD, DA, AC, CD, D$]","[[*A, 1], [AB, 1], [AC, 1], [BD, 1], [CD, 1], ...","[*AB, ABD, BDA, DAC, ACD, CD$]","[[*AB, 1], [ABD, 1], [ACD, 1], [BDA, 1], [CD$,...","[6, 1, 2, 6, 3, 2]","[(6, 1), (1, 2), (2, 6), (6, 3), (3, 2)]","[[1, 3], [2], [6], [2]]","[6, 1, 2, 3]","[0, 1, 2, 0, 3, 2]","[(0, 1), (1, 2), (2, 0), (0, 3), (3, 2)]","[[1, 3], [2], [0], [2]]","[[sequ, 1], [3back , 3], [1cross , 1]]","[[sequ, 1], [3back , 1], [1cross , 1]]","[tree, tree, back 2 6 3, tree, cross 3 2 1]","[A, D]","[[A, 1], [D, 1]]"


In [61]:
#Cosine distance based on maxR

cos_mr = matrix_calc(logVar["mrVector"],cosineDist)
cos_mr

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)


array([[0., 0., 0., 0., 0., 0., 0.]])

In [62]:
#Cosine distance based on maxR

euc_mr = matrix_calc(logVar["mrVector"],euclidDist)
euc_mr

array([[0.        , 0.        , 0.        , 0.        , 1.41421356,
        1.41421356, 1.41421356]])

In [65]:
#Jaccard similarity based on maxR

jacc_mr = matrix_calc(logVar["mrList"],jaccard_similarity)
jacc_mr

#all similarities are zero