In [5]:
#import all necessary packages
import pandas as pd
import hypernetx as hnx
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, fpmax
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
from collections import defaultdict
from matplotlib.ticker import MaxNLocator
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Step 1. Generating Transactions from Mobility Dataset

In [2]:
# Path to the mobility dataframe with movements of 100,000 individuals over 75-day business as usual period
dataset_path = 'YJMob100k/yjmob100k-dataset2.csv' 
# The original dataset has 200x200 grid (area of a grid cell = 1/2 km x1/2 km = 0.25 km^2)
# Scaling_factor 10 (200/10 x 200/10) aggregates it to a 20x20 grid (area of a grid cell = 5 km x 5 km = 25 km^2)
scaling_factor = 10
# List of minimum support thresholds considered [1%, 1.5%, 2%]
min_sups = [0.005, 0.01, 0.015] #minimum support values corresponding to 1% and 1.5% respectively
deltaT_vals = [1, 3, 7] #Sliding window lengths
details = 'emergency'
#(0, 59) for business-as-usual, and (60, 74) for emergency
start_day = 60
end_day = 74

In [3]:
def generate_set_of_transactions_by_deltaT(dataset_path, scaling_factor, deltaT, start_day, end_day):
    mob_df = pd.read_csv(dataset_path)
    print("Delta T = "+str(deltaT))
    
    #Spatial aggregation of 200x200 grid to 20x20 grid
    mob_df['grouped_x'] = (mob_df['x'] - 1) // scaling_factor + 1
    mob_df['grouped_y'] = (mob_df['y'] - 1) // scaling_factor + 1   
    mob_df = mob_df.loc[(mob_df['d'] >= start_day) & (mob_df['d'] <= end_day)]
    
    #start and end days in the dataset
    min_day = mob_df['d'].min()
    max_day = mob_df['d'].max()
    print(min_day, max_day)
    
    #sliding window of deltaT length over min_day and max_day to obtain transactions
    transactions = []
    for start_day in range(min_day, max_day - deltaT + 2):
        end_day = start_day + deltaT - 1
        window_df = mob_df[ (mob_df['d'] >= start_day) & (mob_df['d'] <= end_day) ]
        mob_df_grouped = window_df.groupby('uid')
    
        if start_day % 10 == 0:
            print(start_day)
        for uid, record in mob_df_grouped:
            transaction = frozenset(zip(record['grouped_x'], record['grouped_y']))
            transactions.append(transaction)

    filename = f"transactions_scaling_{scaling_factor}_deltaT_{deltaT}_type_{details}.txt"

    with open(filename, "w") as f:
        for transaction in transactions:
            transaction_str = "; ".join(map(str, transaction))  # Convert frozenset to a string
            f.write(transaction_str + "\n")
    
    print(f"A total of {len(transactions)} transactions saved to {filename}")
    
    return transactions, mob_df

In [4]:
for deltaT in deltaT_vals:
    transactions_per_deltaT, df = generate_set_of_transactions_by_deltaT(dataset_path, scaling_factor, 
                                                                         deltaT, start_day, end_day)
    df.to_csv("test.csv")

Delta T = 1
60 74
60
70
A total of 365247 transactions saved to transactions_scaling_10_deltaT_1_type_emergency.txt
Delta T = 3
60 74
60
70
A total of 323786 transactions saved to transactions_scaling_10_deltaT_3_type_emergency.txt
Delta T = 7
60 74
60
A total of 224901 transactions saved to transactions_scaling_10_deltaT_7_type_emergency.txt


In [5]:
def read_transactions_from_file(filename):
    transactions = []
    
    with open(filename, "r") as f:
        for line in f:
            # Strip newline and split by commas to get pairs as strings
            pairs = line.strip().split("; ")
            
            # Convert each pair from string to a tuple of integers
            transaction = frozenset(
                tuple(map(int, pair.strip("()").split(","))) for pair in pairs
            )
            transactions.append(transaction)
    
    print(f"Loaded {len(transactions)} transactions from {filename}")
    return transactions

# Step 2. Computing Frequent Itemsets from Transactions Using FPGrowth

In [6]:
# Computes and returns all frequent itemsets given min_support and threshold. min_support is the 
# minimum fraction of total transactions in which the itemset appears; Set min_support to a suitable 
# threshold (for instance, 0.05 is 5%, 0.01 is 1%, 0.005 is 0.5%, 0.001 is 0.1%). min_itemset_size is the
# minimum number of items in any itemset. By setting it to 2, we get frequent itemsets which are of size 
# at least 2 (two or more locations are visited in the itemset).
def get_frequent_itemsets(transactions, min_sup, min_itemset_size):
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)  
    frequent_itemsets = fpgrowth(df, min_support=min_sup, use_colnames=True)
    print("Total frequent itemsets: ", len(frequent_itemsets))
    frequent_itemsets_filtered = frequent_itemsets[
        frequent_itemsets['itemsets'].apply(lambda x: len(x) >= min_itemset_size)
    ]
    print("Filtered frequent itemsets: ", len(frequent_itemsets_filtered)) 
    print("minimum support, no. of frequent itemsets: ", min_sup, len(frequent_itemsets_filtered))
    return frequent_itemsets_filtered

In [7]:
def get_filtered_frequent_itemsets(frequent_itemsets, min_sup):
    frequent_itemsets_filtered = frequent_itemsets[
        frequent_itemsets['support'] >= min_sup
    ]
    print("minimum support, no. of frequent itemsets: ", min_sup, len(frequent_itemsets_filtered))
    return frequent_itemsets_filtered

In [8]:
def write_all_freq_itemsets_by_minsup_and_deltaT(transactions, deltaT, min_sups):
    print("Minimum support values:", min_sups)
    
    # Initialize a dictionary to store all itemsets by min_sup for reference or further processing, if needed.
    freq_itemsets_by_minsup = {}

    for i, min_sup in enumerate(min_sups):
        print(f"Processing min_sup={min_sup}")
        
        # Generate or filter frequent itemsets based on min_sup value
        if i == 0:
            freq_itemsets = get_frequent_itemsets(transactions, min_sup, 2)
        else:
            freq_itemsets = get_filtered_frequent_itemsets(freq_itemsets_by_minsup[min_sups[0]], min_sup)
        
        # Store the itemsets in dictionary for each min_sup value
        freq_itemsets_by_minsup[min_sup] = freq_itemsets

        # Convert frequent itemsets to a DataFrame
        freq_itemsets_df = pd.DataFrame({
            'itemsets': freq_itemsets['itemsets'],
            'support': freq_itemsets['support']
        })

        filename = f"freq_itemsets_deltaT_{deltaT}_minsup_{min_sup}_type_{details}.csv"
        print(f"Writing itemsets to {filename}")

        freq_itemsets_df.to_csv(filename, index=False)

    print(f"All frequent itemsets have been written to CSV files for {deltaT} and {min_sups}.")

In [9]:
for i in range(len(deltaT_vals)):
    deltaT = deltaT_vals[i]
    filename = f"transactions_scaling_{scaling_factor}_deltaT_{deltaT}_type_{details}.txt"
    transactions = read_transactions_from_file(filename)
    print(f"Number of transactions: {len(transactions)}")
    write_all_freq_itemsets_by_minsup_and_deltaT(transactions, deltaT, min_sups)

Loaded 365247 transactions from transactions_scaling_10_deltaT_1_type_emergency.txt
Number of transactions: 365247
Minimum support values: [0.005, 0.01, 0.015]
Processing min_sup=0.005
Total frequent itemsets:  570
Filtered frequent itemsets:  374
minimum support, no. of frequent itemsets:  0.005 374
Writing itemsets to freq_itemsets_deltaT_1_minsup_0.005_type_emergency.csv
Processing min_sup=0.01
minimum support, no. of frequent itemsets:  0.01 90
Writing itemsets to freq_itemsets_deltaT_1_minsup_0.01_type_emergency.csv
Processing min_sup=0.015
minimum support, no. of frequent itemsets:  0.015 26
Writing itemsets to freq_itemsets_deltaT_1_minsup_0.015_type_emergency.csv
All frequent itemsets have been written to CSV files for 1 and [0.005, 0.01, 0.015].
Loaded 323786 transactions from transactions_scaling_10_deltaT_3_type_emergency.txt
Number of transactions: 323786
Minimum support values: [0.005, 0.01, 0.015]
Processing min_sup=0.005
Total frequent itemsets:  4093
Filtered frequent i

In [6]:
def read_freq_itemsets(deltaT, min_sup, file_path, details):
    # Define the filename based on deltaT and min_sup values
    filename = f"{file_path}freq_itemsets_deltaT_{deltaT}_minsup_{min_sup}_type_{details}.csv"
    
    try:
        # Read the CSV file
        freq_itemsets_df = pd.read_csv(filename)
        print(f"Successfully loaded {filename}")
        
        # Convert 'itemsets' column entries from frozenset string format to normal set
        def parse_to_set(item):
            # Removeåtem.strip("frozenset({})")
            
            # Now, split the string by '), (' to separate each tuple, then re-parse each tuple
            items = item.split("), (")
            
            # Convert each string tuple into an actual tuple and collect them into a set
            item_set = set()
            for i in items:
                # Convert each string (e.g., '13, 9') into a tuple (13, 9)
                item_set.add(tuple(map(int, i.split(", "))))
            
            return item_set

        # Apply the parsing function to the 'itemsets' column
        freq_itemsets_df['itemsets'] = freq_itemsets_df['itemsets'].apply(parse_to_set)

        return freq_itemsets_df
    except FileNotFoundError:
        print(f"Error: The file {filename} does not exist.")
        return None

In [7]:
# Test
file_path = "outputs/DS2/"
details = "regular"
freq_itemsets = read_freq_itemsets(1, 0.005, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_1_minsup_0.005_type_regular.csv


ValueError: invalid literal for int() with base 10: 'frozenset({(14'

In [8]:
freq_itemsets = read_freq_itemsets(1, 0.01, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_1_minsup_0.01_type_regular.csv


ValueError: invalid literal for int() with base 10: 'frozenset({(14'

In [45]:
freq_itemsets = read_freq_itemsets(1, 0.015, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_1_minsup_0.015_type_regular.csv


Unnamed: 0,itemsets,support
18,"{(14, 8), (14, 9)}",0.052682
4,"{(13, 9), (14, 9)}",0.038883
3,"{(13, 9), (14, 8)}",0.032934
7,"{(13, 8), (14, 8)}",0.032328
8,"{(13, 8), (13, 9)}",0.029775
34,"{(13, 9), (12, 9)}",0.02894
11,"{(14, 9), (15, 9)}",0.027285
12,"{(14, 8), (15, 8)}",0.026211
31,"{(14, 9), (14, 10)}",0.025792
16,"{(13, 8), (13, 7)}",0.023849


In [44]:
freq_itemsets = read_freq_itemsets(3, 0.005, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_3_minsup_0.005_type_regular.csv


Unnamed: 0,itemsets,support
1326,"{(14, 8), (14, 9)}",0.118816
391,"{(13, 9), (14, 9)}",0.092910
390,"{(13, 9), (14, 8)}",0.087058
1327,"{(13, 8), (14, 8)}",0.078054
393,"{(13, 9), (12, 9)}",0.071078
...,...,...
3980,"{(17, 13), (18, 12), (19, 12)}",0.005001
2711,"{(13, 8), (14, 8), (12, 10), (12, 9), (13, 9)}",0.005001
361,"{(14, 8), (14, 9), (15, 8), (16, 6)}",0.005000
4111,"{(14, 8), (13, 10), (15, 7)}",0.005000


In [23]:
freq_itemsets = read_freq_itemsets(3, 0.01, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_3_minsup_0.01_type_regular.csv


Unnamed: 0,itemsets,support
0,"{(16, 7), (14, 8)}",0.040493
1,"{(16, 7), (15, 8)}",0.038817
2,"{(16, 7), (14, 9)}",0.028972
3,"{(16, 7), (15, 9)}",0.022612
4,"{(16, 7), (13, 8)}",0.019065
...,...,...
1284,"{(9, 11), (8, 10)}",0.010946
1285,"{(9, 10), (8, 10)}",0.017697
1286,"{(4, 1), (5, 1)}",0.010448
1287,"{(20, 11), (19, 11)}",0.013730


In [46]:
freq_itemsets = read_freq_itemsets(3, 0.015, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_3_minsup_0.015_type_regular.csv


Unnamed: 0,itemsets,support
170,"{(14, 8), (14, 9)}",0.118816
50,"{(13, 9), (14, 9)}",0.092910
49,"{(13, 9), (14, 8)}",0.087058
171,"{(13, 8), (14, 8)}",0.078054
52,"{(13, 9), (12, 9)}",0.071078
...,...,...
374,"{(16, 7), (16, 8), (16, 9)}",0.015044
528,"{(8, 14), (9, 14)}",0.015043
416,"{(17, 7), (18, 7)}",0.015042
476,"{(17, 6), (18, 5)}",0.015024


In [25]:
freq_itemsets = read_freq_itemsets(7, 0.005, file_path, details)
freq_itemsets

Successfully loaded outputs/DS2/freq_itemsets_deltaT_7_minsup_0.005_type_regular.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.197765
1,"{(13, 9), (14, 9)}",0.161396
2,"{(13, 9), (14, 8)}",0.158354
3,"{(13, 9), (14, 9), (14, 8)}",0.117987
4,"{(13, 8), (14, 8)}",0.137791
...,...,...
72176,"{(5, 9), (6, 9)}",0.006529
72177,"{(5, 9), (9, 9)}",0.005002
72178,"{(18, 16), (19, 16)}",0.005677
72179,"{(18, 15), (19, 16)}",0.005368


In [47]:
freq_itemsets = read_freq_itemsets(7, 0.01, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_7_minsup_0.01_type_regular.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.197765
1,"{(13, 9), (14, 9)}",0.161396
2,"{(13, 9), (14, 8)}",0.158354
4,"{(13, 8), (14, 8)}",0.137791
1505,"{(13, 9), (12, 9)}",0.127111
...,...,...
8607,"{(12, 10), (12, 11), (13, 11), (14, 11)}",0.010002
1504,"{(8, 8), (7, 8)}",0.010001
4753,"{(13, 9), (12, 7), (14, 9), (15, 9)}",0.010001
3275,"{(8, 9), (10, 8), (9, 9)}",0.010001


In [48]:
freq_itemsets = read_freq_itemsets(7, 0.015, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_7_minsup_0.015_type_regular.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.197765
1,"{(13, 9), (14, 9)}",0.161396
2,"{(13, 9), (14, 8)}",0.158354
4,"{(13, 8), (14, 8)}",0.137791
637,"{(13, 9), (12, 9)}",0.127111
...,...,...
4607,"{(19, 8), (18, 6), (18, 8)}",0.015009
1666,"{(14, 8), (15, 8), (13, 7), (14, 7), (15, 9)}",0.015008
2441,"{(14, 7), (15, 7), (13, 6), (15, 8)}",0.015006
3326,"{(14, 11), (13, 12), (15, 9)}",0.015002


In [49]:
# Test
file_path = "outputs/DS2/"
details = "emergency"
freq_itemsets = read_freq_itemsets(1, 0.005, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_1_minsup_0.005_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.043940
1,"{(13, 9), (14, 9)}",0.035091
102,"{(13, 8), (14, 8)}",0.028720
2,"{(13, 9), (14, 8)}",0.028348
101,"{(13, 8), (13, 9)}",0.027253
...,...,...
170,"{(12, 10), (11, 9)}",0.005019
369,"{(4, 1), (5, 1)}",0.005019
145,"{(16, 9), (14, 9), (15, 9)}",0.005016
57,"{(13, 8), (13, 9), (12, 8)}",0.005016


In [50]:
freq_itemsets = read_freq_itemsets(1, 0.01, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_1_minsup_0.01_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.043940
1,"{(13, 9), (14, 9)}",0.035091
33,"{(13, 8), (14, 8)}",0.028720
2,"{(13, 9), (14, 8)}",0.028348
32,"{(13, 8), (13, 9)}",0.027253
...,...,...
65,"{(11, 8), (11, 9)}",0.010431
70,"{(13, 9), (15, 9)}",0.010366
66,"{(13, 11), (14, 11)}",0.010267
25,"{(14, 7), (14, 9)}",0.010081


In [51]:
freq_itemsets = read_freq_itemsets(1, 0.015, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_1_minsup_0.015_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.04394
1,"{(13, 9), (14, 9)}",0.035091
11,"{(13, 8), (14, 8)}",0.02872
2,"{(13, 9), (14, 8)}",0.028348
10,"{(13, 8), (13, 9)}",0.027253
15,"{(13, 9), (12, 9)}",0.025632
21,"{(14, 9), (15, 9)}",0.024222
24,"{(14, 9), (14, 10)}",0.02368
5,"{(14, 8), (15, 8)}",0.023598
13,"{(13, 8), (13, 7)}",0.021988


In [52]:
freq_itemsets = read_freq_itemsets(3, 0.005, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_3_minsup_0.005_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.099532
1,"{(13, 9), (14, 9)}",0.083067
2,"{(13, 9), (14, 8)}",0.074938
1336,"{(13, 8), (14, 8)}",0.069478
1335,"{(13, 8), (13, 9)}",0.065006
...,...,...
2370,"{(13, 8), (12, 11), (13, 9)}",0.005003
3379,"{(12, 7), (12, 8), (15, 8)}",0.005000
628,"{(11, 12), (10, 10), (11, 11)}",0.005000
434,"{(10, 11), (11, 11), (10, 14)}",0.005000


In [53]:
freq_itemsets = read_freq_itemsets(3, 0.01, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_3_minsup_0.01_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.099532
1,"{(13, 9), (14, 9)}",0.083067
2,"{(13, 9), (14, 8)}",0.074938
391,"{(13, 8), (14, 8)}",0.069478
390,"{(13, 8), (13, 9)}",0.065006
...,...,...
382,"{(14, 6), (14, 7), (14, 5)}",0.010016
898,"{(17, 7), (18, 8)}",0.010016
409,"{(18, 6), (18, 3)}",0.010000
86,"{(13, 8), (15, 7), (15, 8)}",0.010000


In [54]:
freq_itemsets = read_freq_itemsets(3, 0.015, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_3_minsup_0.015_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.099532
1,"{(13, 9), (14, 9)}",0.083067
2,"{(13, 9), (14, 8)}",0.074938
191,"{(13, 8), (14, 8)}",0.069478
190,"{(13, 8), (13, 9)}",0.065006
...,...,...
107,"{(9, 11), (11, 11)}",0.015047
183,"{(14, 6), (15, 7)}",0.015044
117,"{(11, 12), (10, 13)}",0.015041
167,"{(9, 14), (9, 15)}",0.015028


In [37]:
freq_itemsets = read_freq_itemsets(7, 0.005, file_path, details)
freq_itemsets

Successfully loaded outputs/DS2/freq_itemsets_deltaT_7_minsup_0.005_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.167505
1,"{(13, 9), (14, 9)}",0.145348
2,"{(13, 9), (14, 8)}",0.137687
3,"{(13, 9), (14, 9), (14, 8)}",0.099817
4,"{(13, 8), (14, 8)}",0.123134
...,...,...
47940,"{(15, 2), (15, 3)}",0.006985
47941,"{(16, 3), (15, 2)}",0.006518
47942,"{(16, 2), (15, 2)}",0.006283
47943,"{(14, 3), (15, 2)}",0.005047


In [55]:
freq_itemsets = read_freq_itemsets(7, 0.01, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_7_minsup_0.01_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.167505
1,"{(13, 9), (14, 9)}",0.145348
2,"{(13, 9), (14, 8)}",0.137687
4,"{(13, 8), (14, 8)}",0.123134
5,"{(13, 8), (13, 9)}",0.115393
...,...,...
3320,"{(12, 10), (16, 10), (15, 10)}",0.010004
7718,"{(13, 14), (14, 9)}",0.010004
1463,"{(11, 13), (10, 12), (9, 14)}",0.010004
1739,"{(10, 10), (9, 14)}",0.010004


In [56]:
freq_itemsets = read_freq_itemsets(7, 0.015, file_path, details)
freq_itemsets.sort_values(by = 'support', ascending = False)

Successfully loaded outputs/DS2/freq_itemsets_deltaT_7_minsup_0.015_type_emergency.csv


Unnamed: 0,itemsets,support
0,"{(14, 8), (14, 9)}",0.167505
1,"{(13, 9), (14, 9)}",0.145348
2,"{(13, 9), (14, 8)}",0.137687
4,"{(13, 8), (14, 8)}",0.123134
5,"{(13, 8), (13, 9)}",0.115393
...,...,...
3340,"{(8, 14), (9, 13)}",0.015007
3468,"{(16, 7), (17, 6), (17, 7), (17, 8)}",0.015007
3045,"{(18, 12), (19, 12)}",0.015002
931,"{(9, 12), (9, 13), (10, 12)}",0.015002
