In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

## Preprocessing data

In [9]:
import os
PATH = os.getcwd()
files = ['POI_datacategories.csv','POIdata_cityB.csv','POIdata_cityC.csv','POIdata_cityD.csv', 'POIdata_cityA.csv']
filepath = []
for file in files:
    filepath.append(os.path.join(PATH, 'dataset', file))
filepath

['c:\\NTU\\2024Y4S1\\sc4020\\SC4020-Project-2\\dataset\\POI_datacategories.csv',
 'c:\\NTU\\2024Y4S1\\sc4020\\SC4020-Project-2\\dataset\\POIdata_cityB.csv',
 'c:\\NTU\\2024Y4S1\\sc4020\\SC4020-Project-2\\dataset\\POIdata_cityC.csv',
 'c:\\NTU\\2024Y4S1\\sc4020\\SC4020-Project-2\\dataset\\POIdata_cityD.csv',
 'c:\\NTU\\2024Y4S1\\sc4020\\SC4020-Project-2\\dataset\\POIdata_cityA.csv']

In [26]:
df_poicat = pd.read_csv(filepath[0],header=None)
df_citya = pd.read_csv(filepath[4])
df_cityb = pd.read_csv(filepath[1])
df_cityc = pd.read_csv(filepath[2])
df_cityd = pd.read_csv(filepath[3])

In [27]:
df_poicat = df_poicat.rename(columns={0:'cat'})
df_poicat.index = pd.RangeIndex(start=1, stop=(len(df_poicat) + 1), step=1)# resetting the index
df_poicat.head()

Unnamed: 0,cat
1,Food
2,Shopping
3,Entertainment
4,Japanese restaurant
5,Western restaurant


Statistics of the Dataset

In [None]:
print("Shape of POI categories dataframe:", df_poicat.shape)
print("Shape of City A dataframe:", df_citya.shape)
print("Shape of City B dataframe:", df_cityb.shape)
print("Shape of City C dataframe:", df_cityc.shape)
print("Shape of City D dataframe:", df_cityd.shape)

# print(df_citya.isnull().sum())

Shape of POI categories dataframe: (85, 1)
Shape of City A dataframe: (221248, 4)
Shape of City B dataframe: (56181, 4)
Shape of City C dataframe: (39064, 4)
Shape of City D dataframe: (72794, 4)


Add a coordinate column for each csv

In [33]:
df_citya['coord'] = (df_citya.apply(lambda row: (row['x'], row['y']), axis=1))
df_cityb['coord'] = (df_cityb.apply(lambda row: (row['x'], row['y']), axis=1))
df_cityc['coord'] = (df_cityb.apply(lambda row: (row['x'], row['y']), axis=1))
df_cityd['coord'] = (df_cityb.apply(lambda row: (row['x'], row['y']), axis=1))

In [34]:
basket_a = df_citya.groupby(['coord','category'])['POI_count'].sum().unstack().fillna(0).reset_index().set_index('coord')
basket_b = df_cityb.groupby(['coord','category'])['POI_count'].sum().unstack().fillna(0).reset_index().set_index('coord')
basket_c = df_cityc.groupby(['coord','category'])['POI_count'].sum().unstack().fillna(0).reset_index().set_index('coord')
basket_d = df_cityd.groupby(['coord','category'])['POI_count'].sum().unstack().fillna(0).reset_index().set_index('coord')
df_cityb.head()

Unnamed: 0,x,y,category,POI_count,coord
0,1,1,79,2,"(1, 1)"
1,1,1,84,1,"(1, 1)"
2,1,2,82,1,"(1, 2)"
3,1,3,54,1,"(1, 3)"
4,1,3,74,1,"(1, 3)"


In [35]:
def encode_unit(x):
    if x<=0:
        return 0
    if x>=1:
        return 1

In [36]:
basket_asets = basket_a.applymap(encode_unit)
basket_bsets = basket_b.applymap(encode_unit)
basket_csets = basket_c.applymap(encode_unit)
basket_dsets = basket_d.applymap(encode_unit)

## Algorithm Implementation

In [38]:
def apriori_algo(df, minsup, minconf):
    
    # Step 1: Generate frequent itemsets
    def get_frequent_itemsets(df, minsup):
        # Count support for individual items
        item_count = df.sum()
        freq_items = item_count[item_count >= minsup].index.tolist()
        
        # Create frequent itemsets for single items
        freq_itemsets = {frozenset([item]): item_count[item] for item in freq_items}
        
        # Generate larger itemsets
        k = 2
        while True:
            candidate_itemsets = set()
            for itemset1 in freq_itemsets.keys():
                for itemset2 in freq_itemsets.keys():
                    union_set = itemset1.union(itemset2)
                    if len(union_set) == k:
                        candidate_itemsets.add(union_set)
            if not candidate_itemsets:
                break
            
            # Count support for candidate itemsets
            itemset_count = {}
            for itemset in candidate_itemsets:
                itemset_count[itemset] = df[list(itemset)].all(axis=1).sum()
            
            # Keep only those that meet the minsup threshold
            freq_itemsets.update({itemset: count for itemset, count in itemset_count.items() if count >= minsup})
            k += 1
        
        return freq_itemsets
    
    # Step 2: Generate rules from frequent itemsets
    def generate_rules(freq_itemsets, minconf):
        rules = []
        for itemset, support in freq_itemsets.items():
            for k in range(1, len(itemset)):
                for antecedent in combinations(itemset, k):
                    antecedent = frozenset(antecedent)
                    consequent = itemset - antecedent
                    if len(consequent) > 0:
                        # Calculate confidence
                        confidence = support / freq_itemsets[antecedent]
                        if confidence >= minconf:
                            rules.append((antecedent, consequent, confidence, support))
        return rules

    # Get frequent itemsets
    freq_itemsets = get_frequent_itemsets(df, minsup)
    # Generate rules
    rules = generate_rules(freq_itemsets, minconf)
    
    return rules if rules else None

## Test

In [39]:
data = { 'Milk': [1, 0, 1, 1, 0], 
        'Bread': [1, 1, 0, 1, 1], 
        'Butter': [0, 1, 1, 1, 0], 
        'Cheese': [0, 1, 1, 0, 1], 
        'Eggs': [1, 0, 1, 0, 1] } 
df_dummy = pd.DataFrame(data)
df_dummy

Unnamed: 0,Milk,Bread,Butter,Cheese,Eggs
0,1,1,0,0,1
1,0,1,1,1,0
2,1,0,1,1,1
3,1,1,1,0,0
4,0,1,0,1,1


In [40]:
minsup = 2  # Minimum support threshold
minconf = 0.1 # Minimum confidence threshold
rules = apriori_algo(df_dummy, minsup, minconf)

# Print rules
if rules:
    for antecedent, consequent, confidence, support in rules:
        print(f"Rule: {set(antecedent)} -> {set(consequent)}, "
              f"Confidence: {confidence:.2f}, Support: {support}")
else:
    print("No results found")

Rule: {'Butter'} -> {'Cheese'}, Confidence: 0.67, Support: 2
Rule: {'Cheese'} -> {'Butter'}, Confidence: 0.67, Support: 2
Rule: {'Bread'} -> {'Cheese'}, Confidence: 0.50, Support: 2
Rule: {'Cheese'} -> {'Bread'}, Confidence: 0.67, Support: 2
Rule: {'Cheese'} -> {'Eggs'}, Confidence: 0.67, Support: 2
Rule: {'Eggs'} -> {'Cheese'}, Confidence: 0.67, Support: 2
Rule: {'Butter'} -> {'Bread'}, Confidence: 0.67, Support: 2
Rule: {'Bread'} -> {'Butter'}, Confidence: 0.50, Support: 2
Rule: {'Bread'} -> {'Milk'}, Confidence: 0.50, Support: 2
Rule: {'Milk'} -> {'Bread'}, Confidence: 0.67, Support: 2
Rule: {'Butter'} -> {'Milk'}, Confidence: 0.67, Support: 2
Rule: {'Milk'} -> {'Butter'}, Confidence: 0.67, Support: 2
Rule: {'Bread'} -> {'Eggs'}, Confidence: 0.50, Support: 2
Rule: {'Eggs'} -> {'Bread'}, Confidence: 0.67, Support: 2
Rule: {'Milk'} -> {'Eggs'}, Confidence: 0.67, Support: 2
Rule: {'Eggs'} -> {'Milk'}, Confidence: 0.67, Support: 2


In [41]:
def map_items_to_categories(rules, df_poicat):
    mapped_df = pd.DataFrame(columns=['Antecedent', 'Consequent', 'Confidence', 'Support'])
    for antecedent, consequent, confidence, support in rules:
        # Mapping antecedents
        antecedent_categories = [df_poicat.loc[item, 'cat'] for item in antecedent]
        consequent_categories = [df_poicat.loc[item, 'cat'] for item in consequent]
        
        # Create a formatted rule string
        mapped_df = mapped_df.append({
            'Antecedent': ', '.join(antecedent_categories),
            'Consequent': ', '.join(consequent_categories),
            'Confidence': confidence,
            'Support': support
        }, ignore_index=True)
    return mapped_df

In [42]:
pd.options.display.max_rows = None

In [43]:
basket_bsets.sum().describe()

count      84.000000
mean      668.821429
std       763.763844
min         1.000000
25%       118.500000
50%       402.000000
75%       965.750000
max      3422.000000
dtype: float64

In [31]:
# City B
basket_bsets.sum().quantile(0.60)

567.5999999999999

In [32]:
minsup = 570
minconf = 0.90
rules = apriori_algo(basket_bsets,minsup=minsup, minconf=minconf)
if rules:
    mapped_rules = map_items_to_categories(rules, df_poicat)
    display(mapped_rules)
else:
    print("No results found")

Unnamed: 0,Antecedent,Consequent,Confidence,Support
0,"Hospital, Japanese restaurant",Hair Salon,0.913462,570
1,"Transit Station, Convenience Store",Bank,0.913043,588
2,"Heavy Industry, Hospital, Laundry",Hair Salon,0.930308,574
3,"Transit Station, Hospital, Laundry",Hair Salon,0.904335,605
4,"Hospital, Hair Salon, Laundry",Transit Station,0.904335,605
5,"Transit Station, Real Estate, Hospital",Hair Salon,0.90755,589
6,"Real Estate, Hospital, Hair Salon",Transit Station,0.900612,589
7,"Heavy Industry, Hospital, Building Material",Transit Station,0.904984,581
8,"Bank, Real Estate, Laundry",Hair Salon,0.914692,579
9,"Japanese restaurant, Hair Salon, Building Mate...",Transit Station,0.904687,579


In [33]:
basket_csets.sum().describe()

count      85.000000
mean      459.576471
std       439.294222
min         1.000000
25%       111.000000
50%       310.000000
75%       684.000000
max      1858.000000
dtype: float64

In [42]:
# City C
basket_csets.sum().quantile(0.80)

822.2

In [43]:
minsup = 822
minconf = 0.90
rules = apriori_algo(basket_csets,minsup=minsup, minconf=minconf)
if rules:
    mapped_rules = map_items_to_categories(rules, df_poicat)
    display(mapped_rules)
else:
    print("No results found")

Unnamed: 0,Antecedent,Consequent,Confidence,Support
0,Bank,Transit Station,0.913177,894
1,Hospital,Transit Station,0.92449,906
2,Hair Salon,Transit Station,0.906222,1005
3,Fishing,Transit Station,0.907127,840
4,"Real Estate, Hair Salon",Transit Station,0.936422,869
5,"Real Estate, Building Material",Transit Station,0.910058,941
6,"Hair Salon, Building Material",Transit Station,0.941043,830
7,"Accountant Office, Park",Transit Station,0.916405,877
8,"Real Estate, Home Appliances",Transit Station,0.90592,857
9,"Real Estate, Park",Transit Station,0.902104,986


In [44]:
basket_dsets.sum().describe()

count      84.000000
mean      866.595238
std       969.939386
min         1.000000
25%       160.500000
50%       515.500000
75%      1232.750000
max      3966.000000
dtype: float64

In [49]:
# City D
basket_dsets.sum().quantile(0.55)

625.6

In [50]:
minsup = 625
minconf = 0.90
rules = apriori_algo(basket_dsets,minsup=minsup, minconf=minconf)
if rules:
    mapped_rules = map_items_to_categories(rules, df_poicat)
    display(mapped_rules)
else:
    print("No results found")

Unnamed: 0,Antecedent,Consequent,Confidence,Support
0,"Hospital, Fishing",Hair Salon,0.902718,631
1,"Bank, Real Estate, Laundry",Hair Salon,0.903872,677
2,"Grocery Store, Hospital, Laundry",Hair Salon,0.909871,636
3,"Transit Station, Grocery Store, Hospital",Hair Salon,0.905233,640
4,"Bank, Hospital, Laundry",Hair Salon,0.905388,689
5,"Hospital, Bank, Real Estate",Hair Salon,0.918486,631
6,"Grocery Store, Accountant Office, Laundry",Hair Salon,0.90922,641
7,"Hospital, Real Estate, Laundry",Hair Salon,0.910804,725
8,"Hospital, Accountant Office, Laundry",Hair Salon,0.913098,725
9,"Bank, Hospital, Accountant Office",Hair Salon,0.909871,636


In [52]:
# City A
basket_asets.sum().describe()

count       85.000000
mean      2602.917647
std       2647.478415
min          1.000000
25%        559.000000
50%       1584.000000
75%       3803.000000
max      11263.000000
dtype: float64

In [57]:
basket_asets.sum().quantile(0.63)

2756.7600000000007

In [58]:
minsup = 2756
minconf = 0.90
rules = apriori_algo(basket_asets,minsup=minsup, minconf=minconf)
if rules:
    mapped_rules = map_items_to_categories(rules, df_poicat)
    display(mapped_rules)
else:
    print("No results found")

Unnamed: 0,Antecedent,Consequent,Confidence,Support
0,"Hospital, Japanese restaurant",Hair Salon,0.910564,2810
1,"Chiropractic, Laundry",Hair Salon,0.902312,2771
2,"Hospital, Fishing",Hair Salon,0.901459,2781
3,"Hospital, Laundry , Building Material",Real Estate,0.910091,2814
4,"Heavy Industry, Hospital, Laundry",Hair Salon,0.903074,2879
5,"Transit Station, Real Estate, Hospital",Hair Salon,0.904139,2971
6,"Bank, Real Estate, Laundry",Hair Salon,0.912987,2791
7,"Real Estate, Laundry , Fishing",Hair Salon,0.915755,2761
8,"Hair Salon, Laundry , Fishing",Real Estate,0.90703,2761
9,"Japanese restaurant, Hair Salon, Building Mate...",Real Estate,0.900609,2809
