In [228]:
import pandas as pd
import networkx as nx

traceRoot = "./traces/"
traceToAnalyze = 4 # EDIT THIS TO MATCH YOUR TRACE
traceNames = {
    1: "trace1_09-06-2024_03.17.29.711", 
    2: "trace2_09-06-2024_03.17.41.719", 
    3: "trace3_09-06-2024_03.19.12.999",
    4: "aSIOTmm_30-07-2024_18.39.23.577", # 0.9
    5: "aSIOTmm_30-07-2024_18.44.34.013" # 0.1
    }
contactsPath = traceRoot + traceNames[traceToAnalyze] + "/contacts_traceNoHumansNoDupsFixed.csv"
metricsPath = traceRoot + traceNames[traceToAnalyze] + "/metrics/"
cachePath = traceRoot + traceNames[traceToAnalyze] + "/cached/"
plotsPath = traceRoot + traceNames[traceToAnalyze] + "/plots/"

In [218]:
def createValidSegmentList(df):
    df['2groups'] = df[['id1', 'id2']].values.tolist()
    df['segmentlist'] = df[['tStart', 'tEnd']].apply(lambda x: [list(x)], axis=1)
    df.drop(columns=['id1', 'id2', 'tStart', 'tEnd'], inplace=True)
    return df

# From the pseudocode given by Wang et al. 
def construct_vg_graph(df):
    
    vg_graph = nx.DiGraph()

    # Iterate over rows
    for index, row in df.iterrows():
        vertices = row['2groups']
        segment_list = row['segmentlist']
        
        # Ensure vertices exist in the graph with an empty 'segments' attribute
        for vertex in vertices:
            if not vg_graph.has_node(vertex):
                vg_graph.add_node(vertex, segments=[])

        # Add directed edge with segment list
        if len(vertices) == 2:
            vg_graph.add_edge(vertices[0], vertices[1], segments=segment_list)
    
    return vg_graph

def VG_growth(graph, alpha, min_dur, min_wei, valid_groups):
    for u in graph.nodes():
        beta = set(alpha)
        beta.add(u)
        
        #
        V_beta = list(graph.predecessors(u))
        
        if V_beta:
            for v in V_beta:
                # Calculate duration of the new group
                segments = graph.edges[v, u]['segments']
                group_duration = sum(seg[1] - seg[0] for seg in segments if seg[1] - seg[0] >= min_dur)
                valid_groups.append((beta.union({v}), group_duration))
                
            # Get edges of V_beta and filter by segment conditions
            E_V_beta = graph.in_edges(V_beta, data=True)
            
            if E_V_beta:
                for vi, vj, data in list(E_V_beta):
                    segments = data['segments']
                    u_segments = graph.nodes[u]['segments']
                    if u_segments:
                        segments = adjust_segments(segments, u_segments)
                    if not satisfies_conditions(segments, min_dur, min_wei):
                        graph.remove_edge(vi, vj)
                
                # After filtering, check if there are still edges
                E_V_beta_filtered = graph.in_edges(V_beta, data=True)
                if E_V_beta_filtered:
                    conditional_graph = graph.subgraph(V_beta)
                    VG_growth(conditional_graph, beta, min_dur, min_wei, valid_groups)

def adjust_segments(segments, u_segments):
    adjusted_segments = []
    for seg in segments:
        new_seg = [max(seg[0], u_segments[0][0]), min(seg[1], u_segments[0][1])]
        if new_seg[0] <= new_seg[1]:
            adjusted_segments.append(new_seg)
    return adjusted_segments

def satisfies_conditions(segments, min_dur, min_wei):
    total_duration = sum(seg[1] - seg[0] for seg in segments)
    return total_duration >= min_dur and len(segments) >= min_wei


In [65]:
# # Test data from Wang et al.
# data = {
#     'user': ['u1']*10 + ['u2']*10 + ['u3']*10 + ['u4']*10 + ['u5']*10 + ['u6']*10,
#     't': list(range(10)) * 6,
#     'x': [
#         92.6, 93.44, 97.12, 94.52, 94.52, 95.12, 95.68, 97.72, 94.16, 90.2,
#         97.36, 94.6, 92.08, 93.8, 96.72, 97.32, 93.84, 92.92, 95.68, 95.56,
#         94.36, 95.4, 92.52, 96.04, 96.04, 97.68, 98.56, 100.16, 98.92, 96.8,
#         91.2, 90, 91.16, 92.36, 91.6, 95.28, 92.44, 97.44, 98.92, 96.64,
#         102.16, 102.16, 101.96, 99.48, 98.04, 101.36, 101.36, 100.6, 99.88, 98.84,
#         93.96, 90.48, 91.32, 91.04, 89.68, 90.64, 89.84, 86.84, 86.84, 87.28,
#     ],
#     'y': [
#         22.36, 23.12, 19.88, 20.2, 20.2, 22.84, 22.44, 26.16, 27.88, 31.72,
#         28.56, 24.68, 22.88, 25.88, 29.8, 30, 32.48, 30.52, 29.12, 29.2,
#         12.96, 13.28, 13.04, 14.56, 13.84, 12.04, 15.64, 17.04, 17.32, 17.04,
#         31.12, 31.2, 29.36, 29.32, 26.24, 26.24, 29.04, 29.04, 25.4, 25.4,
#         27.32, 27.32, 26.44, 28.64, 31.8, 28.32, 28.32, 30, 30.92, 33.48,
#         30.52, 29.6, 29.8, 29.56, 26.72, 29, 32.84, 33.44, 33.44, 33.24,
#     ],
#     'z': [
#         7.92, 7.92, 7.92, 9.06, 9.06, 9.06, 9.06, 9.06, 9.06, 9.06,
#         5.66, 5.66, 5.66, 5.66, 5.66, 5.66, 5.66, 5.66, 5.66, 5.66,
#         10.4, 10.4, 10.4, 10.4, 10.4, 10.4, 10.4, 10.4, 10.4, 10.4,
#         9.06, 9.06, 7.92, 7.92, 7.92, 7.92, 7.92, 7.92, 7.92, 7.92,
#         7.92, 7.92, 7.92, 7.92, 7.92, 7.92, 7.92, 7.92, 7.92, 7.92,
#         12.46, 12.46, 12.46, 12.46, 12.46, 12.46, 12.46, 12.46, 12.46, 12.46,
#     ]
# }


# df = pd.DataFrame(data)
# df


### Usage: 
- import your contacts (make sure that id1 is always the smaller id), which implicitly obeys the distance constraint
- set the min_dur and min_wei parameters
- create a valid segment list with createValidSegmentsList(sortedContacts)
- construct a vg_graph with the help of the 2groups valid segments
- let the vg_growth algorithm run
- all valid groups will be stored in the valid_groups variable

In [None]:
contacts = pd.read_csv(contactsPath, delimiter=',', names=['id1', 'id2', 'tStart', 'tEnd', "id1_start_pos_x","id1_start_pos_y","id2_start_pos_x","id2_start_pos_y","id1_end_pos_x","id1_end_pos_y","id2_end_pos_x","id2_end_pos_y"], skiprows=1)
contacts = contacts.astype({'id1': int, 'id2': int, 'tStart': int, 'tEnd': int})

# drop columns
contacts = contacts.drop(columns=["id1_start_pos_x","id1_start_pos_y","id2_start_pos_x","id2_start_pos_y","id1_end_pos_x","id1_end_pos_y","id2_end_pos_x","id2_end_pos_y"])
sortedContacts = contacts.sort_values(by=['tEnd'])
sortedContacts
# rename id1 to id2 and id2 to id1
sortedContacts.rename(columns={'id1':'id2', 'id2':'id1'}, inplace=True)
sortedContacts
# swap values from id1 and id2
sortedContacts['id1'], sortedContacts['id2'] = sortedContacts['id2'], sortedContacts['id1']
sortedContacts

min_dur = 0
min_wei = 0

valid_groups = []
G2ValidSegments = createValidSegmentList(sortedContacts)
vg_graph = construct_vg_graph(G2ValidSegments)
VG_growth(vg_graph, set(), min_dur, min_wei, valid_groups)

# 
# sorted valid groups by duration
valid_groups = sorted(valid_groups, key=lambda x: x[1], reverse=True)
# if duration is 0 then remove it from the list
valid_groups = [(group, duration) for group, duration in valid_groups if duration > 0]




In [None]:
print(f"Number of valid groups: {len(valid_groups)}")
# for group, duration in valid_groups:
#     print(f"{group}: {duration}")
# print amount of times the same group appears

# convert valid groups to a dataframe
valid_groups_df = pd.DataFrame(valid_groups, columns=['group', 'duration'])
valid_groups_df['group'] = valid_groups_df['group'].apply(lambda x: sorted(list(x)))
# get the groups that have len(group) == 2 
groups2 = valid_groups_df[valid_groups_df['group'].apply(lambda x: len(x) == 2)]
groups3 = valid_groups_df[valid_groups_df['group'].apply(lambda x: len(x) == 3)]
groups5 = valid_groups_df[valid_groups_df['group'].apply(lambda x: len(x) == 5)]
# make group to a string

groups2['group'] = groups2['group'].apply(lambda x: str(x))
groups2 = groups2.groupby('group').size().reset_index(name='count').sort_values(by=['count'], ascending=True)
groups2

print(groups5)

In [183]:
# plot a scatterplot of the group's ids with the leftmost id on the x-axis and the rightmost id on the y-axis
from plotly import express as px
from plotly import graph_objects as go

def getLeftID(string):
    return int(string.split(",")[0][1:])
def getRightID(string):
    return int(string.split(",")[1][:-1])
copy = groups2.copy()
copy["id1"] = copy["group"].apply(lambda x: getLeftID(x))
copy["id2"] = copy["group"].apply(lambda x: getRightID(x))

fig1 = px.scatter(copy, x="id1", y="id2", title="Group IDs for group size = 2", labels={"id1": "id1", "id2": "id2"})

# Update the marker properties uniformly
fig1.update_traces(marker=dict(size=8))  # Set uniform marker opacity

# Add the red line trace so it appears on top
fig1.add_shape(
    type="line",
    x0=0, y0=0,
    x1=560, y1=560,
    line=dict(color="red", width=2),
)

# fig size
fig1.update_layout(width=600, height=560)
# title in center
fig1.update_layout(title_x=0.5)
fig1.show()
fig = go.Figure(data=[go.Scatter(x=copy["id1"], y=copy["id2"], mode='markers')])
fig.write_image(plotsPath + "dummyplot.pdf") # Dummy plot needed because otherwise the real plot has a loading bar in the pdf
# fig1.write_image(plotsPath + "groupIDsForGroupSize2MinDur74520.pdf")

In [184]:
# count the number of groups where the difference in leftmost id and rightmost id is <=4
copy["diff"] = copy["id2"] - copy["id1"]
copy
# count number of groups where the diff is <=4
smalldiff = copy[copy["diff"] <= 4].shape[0]
copy
print(str(smalldiff) + " of " + str(groups2.shape[0]) + " groups have ids close to each other (" + str(round(smalldiff/groups2.shape[0]*100, 1)) + "%)")

108 of 111 groups have ids close to each other (97.3%)


In [141]:
min_dur = range(0, 259200, 3240)
min_wei = 0

contacts = pd.read_csv(contactsPath, delimiter=',', names=['id1', 'id2', 'tStart', 'tEnd', "id1_start_pos_x","id1_start_pos_y","id2_start_pos_x","id2_start_pos_y","id1_end_pos_x","id1_end_pos_y","id2_end_pos_x","id2_end_pos_y"], skiprows=1)
contacts = contacts.astype({'id1': int, 'id2': int, 'tStart': int, 'tEnd': int})

# drop columns
contacts = contacts.drop(columns=["id1_start_pos_x","id1_start_pos_y","id2_start_pos_x","id2_start_pos_y","id1_end_pos_x","id1_end_pos_y","id2_end_pos_x","id2_end_pos_y"])
sortedContacts = contacts.sort_values(by=['tEnd'])
# rename id1 to id2 and id2 to id1
sortedContacts.rename(columns={'id1':'id2', 'id2':'id1'}, inplace=True)
# swap values from id1 and id2
sortedContacts['id1'], sortedContacts['id2'] = sortedContacts['id2'], sortedContacts['id1']

dic2groups = dict()

G2ValidSegments = createValidSegmentList(sortedContacts)
vg_graph = construct_vg_graph(G2ValidSegments)
# valid_groups = []
# G2ValidSegments = createValidSegmentList(sortedContacts)
# vg_graph = construct_vg_graph(G2ValidSegments)
# VG_growth(vg_graph, set(), min_dur, min_wei, valid_groups)
for duration in min_dur:
    valid_groups = []

    VG_growth(vg_graph, set(), duration, min_wei, valid_groups)

    # sorted valid groups by duration
    valid_groups = sorted(valid_groups, key=lambda x: x[1], reverse=True)
    # if duration is 0 then remove it from the list
    valid_groups = [(group, duration) for group, duration in valid_groups if duration > 0]
    
    valid_groups_df = pd.DataFrame(valid_groups, columns=['group', 'duration'])
    valid_groups_df['group'] = valid_groups_df['group'].apply(lambda x: sorted(list(x)))
    # get the groups that have len(group) == 2 
    groups2 = valid_groups_df[valid_groups_df['group'].apply(lambda x: len(x) == 2)]

    dic2groups[duration] = groups2

    

In [142]:
df = pd.DataFrame(columns=['duration', 'numGroups'])
for time in dic2groups.keys():
    # print(str(time) + " " + str(dic2groups[time].shape[0]))
    df = pd.concat([df, pd.DataFrame([[time, dic2groups[time].shape[0]]], columns=['duration', 'numGroups'])])

df

Unnamed: 0,duration,numGroups
0,0,1764
0,3240,1138
0,6480,902
0,9720,714
0,12960,599
...,...,...
0,243000,8
0,246240,4
0,249480,4
0,252720,0


In [None]:
# plot the number of groups for each duration in a histogram
fig = px.bar(df, x='duration', y='numGroups', title="Number of groups given min_dur", labels={"duration": "min_dur", "numGroups": "Number of groups"})
# adjust the size of the plot
# title
fig.update_layout(title_x=0.5)
fig.update_layout(width=600, height=560)
fig.show()