In [None]:
#import all necessary packages
import pandas as pd
import hypernetx as hnx
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, fpmax
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
from collections import defaultdict
from matplotlib.ticker import MaxNLocator
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Step 1. Generating Transactions from Mobility Dataset

In [None]:
# Path to the mobility dataframe with movements of 100,000 individuals over 75-day business as usual period
dataset_path = 'YJMob100k/yjmob100k-dataset1.csv' 
# The original dataset has 200x200 grid (area of a grid cell = 1/2 km x 1/2 km = 0.25 km^2)
# Scaling_factor 10 (200/10 x 200/10) aggregates it to a 20x20 grid (area of a grid cell = 5 km x 5 km = 25 km^2)
scaling_factor = 10
# List of minimum support thresholds, given as fractions of all transactions.
# The values below are 0.5%, 1% and 1.5%.
# NOTE(review): this comment used to read "[1%, 1.5%, 2%]", which does not match the values - confirm the intended thresholds.
min_sups = [0.005, 0.01, 0.015] # i.e. 0.5%, 1% and 1.5% of the transactions
deltaT_vals = [1, 3, 7] # Sliding window lengths, in days (compared against the 'd' column of the dataset)

In [None]:
def generate_set_of_transactions_by_deltaT(dataset_path, scaling_factor, deltaT):
    """Build sliding-window location transactions and save them to a text file.

    Every ``(x, y)`` cell is mapped onto a coarser grid by merging
    ``scaling_factor`` consecutive cells along each axis (coordinates are
    1-based before and after the mapping). A window of ``deltaT`` consecutive
    days is then slid one day at a time from the smallest to the largest value
    of the ``d`` column. For each window, every ``uid`` with at least one
    record in it contributes one transaction: the set of distinct coarse cells
    it visited during that window.

    Parameters
    ----------
    dataset_path : str
        CSV file containing at least the columns ``uid``, ``d``, ``x``, ``y``.
    scaling_factor : int
        Number of original grid cells merged into one coarse cell per axis.
    deltaT : int
        Window length in days; must be at least 1.

    Returns
    -------
    list of frozenset
        One frozenset of ``(grouped_x, grouped_y)`` tuples per (window, uid),
        ordered by window start day and, within a window, by ascending uid
        (the default ordering of ``DataFrame.groupby``).

    Raises
    ------
    ValueError
        If ``deltaT`` is smaller than 1.

    Side effects
    ------------
    Writes ``transactions_scaling_{scaling_factor}_deltaT_{deltaT}.txt`` to the
    current working directory (one transaction per line, cells joined by
    ``"; "``) and prints progress information.
    """
    # A non-positive window would yield empty/ill-defined windows and silently
    # write a useless file, so reject it before doing the expensive CSV read.
    if deltaT < 1:
        raise ValueError(f"deltaT must be a positive number of days, got {deltaT}")

    # Only these four columns are used below; not loading the others saves memory.
    mob_df = pd.read_csv(dataset_path, usecols=['uid', 'd', 'x', 'y'])
    print("Delta T = "+str(deltaT))

    # Spatial aggregation: cells 1..scaling_factor -> 1, the next scaling_factor cells -> 2, ...
    mob_df['grouped_x'] = (mob_df['x'] - 1) // scaling_factor + 1
    mob_df['grouped_y'] = (mob_df['y'] - 1) // scaling_factor + 1

    # First and last day present in the dataset
    min_day = mob_df['d'].min()
    max_day = mob_df['d'].max()
    print(min_day, max_day)

    # Slide a window of deltaT days over [min_day, max_day]; the last window
    # starts at max_day - deltaT + 1 so that it still ends on max_day.
    transactions = []
    for start_day in range(min_day, max_day - deltaT + 2):
        end_day = start_day + deltaT - 1
        window_df = mob_df[(mob_df['d'] >= start_day) & (mob_df['d'] <= end_day)]

        if start_day % 10 == 0:
            print(start_day)
        for _, record in window_df.groupby('uid'):
            # tolist() yields plain Python numbers, so str() of each tuple below
            # is rendered as "(x, y)".
            cells = zip(record['grouped_x'].tolist(), record['grouped_y'].tolist())
            transactions.append(frozenset(cells))

    filename = f"transactions_scaling_{scaling_factor}_deltaT_{deltaT}.txt"

    with open(filename, "w") as f:
        for transaction in transactions:
            # One line per transaction: "(x1, y1); (x2, y2); ..."
            f.write("; ".join(map(str, transaction)) + "\n")

    print(f"A total of {len(transactions)} transactions saved to {filename}")

    return transactions

In [None]:
# Build and save the transaction file for every sliding-window length in deltaT_vals.
# NOTE(review): transactions_per_deltaT is rebound on every pass, so after the loop it
# only holds the transactions of the last deltaT; the per-deltaT results are the text
# files written by generate_set_of_transactions_by_deltaT.
for deltaT in deltaT_vals:
    transactions_per_deltaT = generate_set_of_transactions_by_deltaT(dataset_path, scaling_factor, deltaT)

In [None]:
def read_transactions_from_file(filename):
    """Load transactions from a text file holding one transaction per line.

    Each non-blank line is expected to contain one or more integer tuples
    rendered as ``(x, y)`` and separated by ``"; "``, for example
    ``(3, 7); (4, 7)``. Blank lines are skipped instead of aborting the load.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of frozenset
        One frozenset of integer tuples per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a non-blank line contains a component that is not an integer.
    """
    transactions = []

    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse (e.g. a trailing empty line); int('') would raise.
                continue

            # Items are separated by "; "; the two numbers inside an item by ",".
            transaction = frozenset(
                tuple(int(coord) for coord in pair.strip("()").split(","))
                for pair in line.split("; ")
            )
            transactions.append(transaction)

    print(f"Loaded {len(transactions)} transactions from {filename}")
    return transactions

# Step 2. Computing Frequent Itemsets from Transactions Using FPGrowth

In [None]:
def get_frequent_itemsets(transactions, min_sup, min_itemset_size):
    """Mine frequent itemsets with FP-Growth and keep those of a minimum size.

    Parameters
    ----------
    transactions : iterable of iterables
        The transactions to mine; each transaction is a collection of items.
    min_sup : float
        Minimum support, i.e. the smallest fraction of all transactions in
        which an itemset must appear (for instance 0.05 is 5%, 0.01 is 1%,
        0.005 is 0.5%, 0.001 is 0.1%).
    min_itemset_size : int
        Minimum number of items in a returned itemset. With 2, only itemsets
        in which two or more locations are visited are kept.

    Returns
    -------
    pandas.DataFrame
        The rows of the ``fpgrowth`` result (columns ``support`` and
        ``itemsets``) whose itemset holds at least ``min_itemset_size`` items.
    """
    # One-hot encode the transactions: one boolean column per distinct item.
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = fpgrowth(df, min_support=min_sup, use_colnames=True)
    print("Total frequent itemsets: ", len(frequent_itemsets))

    # Build the size mask with an explicit bool dtype. When nothing is frequent
    # the 'itemsets' column is empty, and an empty non-boolean key passed to
    # DataFrame[...] is not treated as a row mask by pandas: the result would
    # lose its 'support'/'itemsets' columns and break callers that read them.
    size_mask = (frequent_itemsets['itemsets'].map(len) >= min_itemset_size).astype(bool)
    frequent_itemsets_filtered = frequent_itemsets.loc[size_mask]

    print("Filtered frequent itemsets: ", len(frequent_itemsets_filtered)) 
    print("minimum support, no. of frequent itemsets: ", min_sup, len(frequent_itemsets_filtered))
    return frequent_itemsets_filtered

In [None]:
def get_filtered_frequent_itemsets(frequent_itemsets, min_sup):
    """Keep only the itemsets whose support is at least ``min_sup``.

    Parameters
    ----------
    frequent_itemsets : pandas.DataFrame
        Frame with a numeric ``support`` column.
    min_sup : float
        Support threshold; rows with ``support >= min_sup`` are kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``frequent_itemsets``, original index preserved.
    """
    reaches_threshold = frequent_itemsets['support'].ge(min_sup)
    kept = frequent_itemsets[reaches_threshold]
    print("minimum support, no. of frequent itemsets: ", min_sup, len(kept))
    return kept

In [None]:
def write_all_freq_itemsets_by_minsup_and_deltaT(transactions, deltaT, min_sups):
    """Mine frequent itemsets once and write one CSV per support threshold.

    FP-Growth is run a single time at the smallest threshold in ``min_sups``
    (with a minimum itemset size of 2); the result for every other threshold
    is obtained by filtering that base result on its ``support`` column. Using
    the smallest threshold as the base keeps the output correct whatever the
    order of ``min_sups``.

    Parameters
    ----------
    transactions : iterable of iterables
        Transactions passed to ``get_frequent_itemsets``.
    deltaT : int
        Sliding-window length the transactions were built with; only used in
        the output file names and log messages.
    min_sups : sequence of float
        Minimum support thresholds (fractions of all transactions).

    Returns
    -------
    None

    Side effects
    ------------
    For every threshold, writes
    ``freq_itemsets_deltaT_{deltaT}_minsup_{min_sup}.csv`` (columns
    ``itemsets`` and ``support``, no index) to the current working directory
    and prints progress information.
    """
    print("Minimum support values:", min_sups)

    # Every higher threshold selects a subset of what the smallest one selects.
    base_min_sup = min(min_sups) if len(min_sups) else None
    base_itemsets = None

    for min_sup in min_sups:
        print(f"Processing min_sup={min_sup}")

        if base_itemsets is None:
            # The expensive mining step happens once, on the first pass.
            base_itemsets = get_frequent_itemsets(transactions, base_min_sup, 2)

        if min_sup == base_min_sup:
            freq_itemsets = base_itemsets
        else:
            freq_itemsets = get_filtered_frequent_itemsets(base_itemsets, min_sup)

        # Fix the column order of the CSV: itemsets first, then support.
        freq_itemsets_df = freq_itemsets[['itemsets', 'support']]

        filename = f"freq_itemsets_deltaT_{deltaT}_minsup_{min_sup}.csv"
        print(f"Writing itemsets to {filename}")

        freq_itemsets_df.to_csv(filename, index=False)

    print(f"All frequent itemsets have been written to CSV files for {deltaT} and {min_sups}.")

In [None]:
# For each sliding-window length: reload the transactions saved for it, then
# mine and write the frequent itemsets for all configured support thresholds.
for i, deltaT in enumerate(deltaT_vals):
    filename = f"transactions_scaling_{scaling_factor}_deltaT_{deltaT}.txt"
    transactions = read_transactions_from_file(filename)
    print(f"Number of transactions: {len(transactions)}")
    write_all_freq_itemsets_by_minsup_and_deltaT(transactions, deltaT, min_sups)

In [None]:
def read_freq_itemsets(deltaT, min_sup, output_dir="outputs/DS1"):
    """Load a saved frequent-itemset CSV and parse its itemsets into sets.

    Parameters
    ----------
    deltaT : int
        Sliding-window length used in the file name.
    min_sup : float
        Minimum support used in the file name.
    output_dir : str, optional
        Directory containing the CSV. Defaults to ``"outputs/DS1"``; pass
        ``"."`` to read files from the current working directory.

    Returns
    -------
    pandas.DataFrame or None
        The CSV content with the ``itemsets`` column converted from strings
        such as ``"frozenset({(13, 9), (12, 9)})"`` to sets of integer tuples,
        or ``None`` (after printing an error) when the file does not exist.
    """
    filename = f"{output_dir}/freq_itemsets_deltaT_{deltaT}_minsup_{min_sup}.csv"

    def parse_to_set(item):
        # str.strip removes any of the given *characters* from both ends (not
        # the literal prefix/suffix): "frozenset({(13, 9), (12, 9)})" becomes
        # "13, 9), (12, 9" because digits are not among those characters.
        inner = item.strip("frozenset({})")
        if not inner:
            # "frozenset()" - an empty itemset has nothing to split or convert.
            return set()
        # Split into per-tuple strings such as "13, 9", then into integers.
        return {tuple(map(int, part.split(", "))) for part in inner.split("), (")}

    try:
        freq_itemsets_df = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"Error: The file {filename} does not exist.")
        return None

    print(f"Successfully loaded {filename}")
    freq_itemsets_df['itemsets'] = freq_itemsets_df['itemsets'].apply(parse_to_set)
    return freq_itemsets_df

In [None]:
# Smoke test: load the itemsets saved for deltaT = 1 and min_sup = 0.015 and display them.
# read_freq_itemsets prints an error and returns None when the CSV file is missing,
# in which case nothing is displayed here.
freq_itemsets = read_freq_itemsets(1, 0.015)
freq_itemsets