# Loading Dataset

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('https://raw.githubusercontent.com/okzapradhana/simple-apriori-algorithm/master/dataset_apriori.csv')

In [3]:
dataset

Unnamed: 0,tid,items
0,1,"Handphone,Laptop"
1,2,"Handphone,Charger,Laptop"
2,3,"Powerbank,Laptop,Charger,Handphone"
3,4,"Tablet,Laptop,Handphone"
4,5,"Handphone,Charger,Tablet"
5,6,"Tablet,Powerbank"
6,7,"Handphone,Laptop,Tablet,Charger"
7,8,"Charger,Handphone"
8,9,"Handphone,Powerbank"
9,10,"Laptop,Charger,Powerbank"


# Preprocessing of Dataset

In [4]:
#splitting columns of dataset into two different dataframe
df_items = dataset['items']
df_tid = dataset['tid']

In [5]:
df_items

0                      Handphone,Laptop
1              Handphone,Charger,Laptop
2    Powerbank,Laptop,Charger,Handphone
3               Tablet,Laptop,Handphone
4              Handphone,Charger,Tablet
5                      Tablet,Powerbank
6       Handphone,Laptop,Tablet,Charger
7                     Charger,Handphone
8                   Handphone,Powerbank
9              Laptop,Charger,Powerbank
Name: items, dtype: object

In [6]:
df_tid

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: tid, dtype: int64

In [7]:
#assigning unique indices to all the unique items
dictionaries = {'Handphone': 1, 'Laptop': 2, 'Charger': 3, 'Powerbank': 4, 'Tablet': 5 }

In [8]:
comma_splitted_df = df_items.apply(lambda x: x.split(','))

In [9]:
comma_splitted_df

0                        [Handphone, Laptop]
1               [Handphone, Charger, Laptop]
2    [Powerbank, Laptop, Charger, Handphone]
3                [Tablet, Laptop, Handphone]
4               [Handphone, Charger, Tablet]
5                        [Tablet, Powerbank]
6       [Handphone, Laptop, Tablet, Charger]
7                       [Charger, Handphone]
8                     [Handphone, Powerbank]
9               [Laptop, Charger, Powerbank]
Name: items, dtype: object

In [10]:
#converting splitted data into integral data
numbered_col = []
for i in range(len(comma_splitted_df)):
    list_numbered = list(map(lambda x: dictionaries[x], comma_splitted_df[i]))
    sort_numbered = sorted(list_numbered)
    numbered_col.append(sort_numbered)

numbered_col

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 5],
 [1, 3, 5],
 [4, 5],
 [1, 2, 3, 5],
 [1, 3],
 [1, 4],
 [2, 3, 4]]

# Creating Dataframe from Transformed Data

In [11]:
dict_data = {'items': numbered_col}
df = pd.DataFrame.from_dict(dict_data)

In [12]:
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [13]:
pd.concat([df_tid, df], axis=1)

Unnamed: 0,tid,items
0,1,"[1, 2]"
1,2,"[1, 2, 3]"
2,3,"[1, 2, 3, 4]"
3,4,"[1, 2, 5]"
4,5,"[1, 3, 5]"
5,6,"[4, 5]"
6,7,"[1, 2, 3, 5]"
7,8,"[1, 3]"
8,9,"[1, 4]"
9,10,"[2, 3, 4]"


In [14]:
items = []
for i in range(len(df)):
    for j in range(len(df['items'][i])):
        items.append(df['items'][i][j])
items

[1,
 2,
 1,
 2,
 3,
 1,
 2,
 3,
 4,
 1,
 2,
 5,
 1,
 3,
 5,
 4,
 5,
 1,
 2,
 3,
 5,
 1,
 3,
 1,
 4,
 2,
 3,
 4]

# Creating First Candidate (C1)

In [15]:
#Get unique elements from list/array
unique_item = set(items)
unique_item

{1, 2, 3, 4, 5}

In [16]:
#Convert it into list
list_unique_item = list(unique_item)
list_unique_item

[1, 2, 3, 4, 5]

In [17]:
#counting frequency of every unique items
count_unique = []
for value in (list_unique_item):
    count_unique.append((value, items.count(value)))
count_unique

[(1, 8), (2, 6), (3, 6), (4, 4), (5, 4)]

In [18]:
candidate1_df = pd.DataFrame(count_unique, columns=["itemset", "sup"])

candidate1_df

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4


# Creating first Frequent Itemset (L1)

In [19]:
#filtering items having minimum support count 2
def filter_sup(candidate):
    minimum_sup = 2
    filtering = candidate['sup'] > minimum_sup
    freq = candidate[filtering]
    return freq

In [20]:
freq_itemset1 = filter_sup(candidate1_df)
freq_itemset1

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4


# Create the Second Candidate (C2)

SELF JOIN

In [21]:
import numpy
def self_join(prev_freq_itemset):
    self_join_candidate = []
    for i in range(len(prev_freq_itemset['itemset'])):
        for j in range((i+1), len(prev_freq_itemset['itemset'])):
            itemset_i = prev_freq_itemset['itemset'][i]
            itemset_j = prev_freq_itemset['itemset'][j]
            if(type(itemset_i) == numpy.int64 and type(itemset_j) == numpy.int64):
                itemset_i = {itemset_i}
                itemset_j = {itemset_j}
            union_candidate = itemset_i.union(itemset_j)

            if union_candidate not in self_join_candidate:
                self_join_candidate.append(union_candidate)
    return self_join_candidate

In [22]:
candidate2_list = self_join(freq_itemset1)
candidate2_list

[{1, 2},
 {1, 3},
 {1, 4},
 {1, 5},
 {2, 3},
 {2, 4},
 {2, 5},
 {3, 4},
 {3, 5},
 {4, 5}]

In [23]:
count_candidate2 = []

#Set the Initial value of Second Count Candidate (C2)
for i in range(len(candidate2_list)):
    count_candidate2.append((candidate2_list[i], 0))

count_candidate2

[({1, 2}, 0),
 ({1, 3}, 0),
 ({1, 4}, 0),
 ({1, 5}, 0),
 ({2, 3}, 0),
 ({2, 4}, 0),
 ({2, 5}, 0),
 ({3, 4}, 0),
 ({3, 5}, 0),
 ({4, 5}, 0)]

In [24]:
initial_df_candidate = pd.DataFrame(count_candidate2, columns=['itemset', 'sup'])
initial_df_candidate

Unnamed: 0,itemset,sup
0,"{1, 2}",0
1,"{1, 3}",0
2,"{1, 4}",0
3,"{1, 5}",0
4,"{2, 3}",0
5,"{2, 4}",0
6,"{2, 5}",0
7,"{3, 4}",0
8,"{3, 5}",0
9,"{4, 5}",0


In [25]:
#adding it with 1 whenever we find every candidate is a subset from Database D


def count_support(database_dataframe, prev_candidate_list):
    #initial_df_candidate['sup'] = 0 #set All value into 0 only for initial value for consistency value when running this cell everytime.
    count_prev_candidate = []

    #Set the Initial value of Previous Candidate
    for i in range(len(prev_candidate_list)):
        count_prev_candidate.append((prev_candidate_list[i], 0))
    
    df_candidate = pd.DataFrame(count_prev_candidate, columns=['itemset', 'sup'])
    print('Database D dataframe\n', database_dataframe)
    print('(Initial) Dataframe from Candidate with All zeros sup\n', df_candidate)
    
    for i in range(len(database_dataframe)):
        for j in range(len(count_prev_candidate)):
            #using issubset() function to check whether every itemset is a subset of Database or not
            if (df_candidate['itemset'][j]).issubset(set(database_dataframe['items'][i])): 
                df_candidate.loc[j, 'sup'] += 1
            
    return df_candidate

In [26]:
count_candidate2_df = count_support(df, candidate2_list)

Database D dataframe
           items
0        [1, 2]
1     [1, 2, 3]
2  [1, 2, 3, 4]
3     [1, 2, 5]
4     [1, 3, 5]
5        [4, 5]
6  [1, 2, 3, 5]
7        [1, 3]
8        [1, 4]
9     [2, 3, 4]
(Initial) Dataframe from Candidate with All zeros sup
   itemset  sup
0  {1, 2}    0
1  {1, 3}    0
2  {1, 4}    0
3  {1, 5}    0
4  {2, 3}    0
5  {2, 4}    0
6  {2, 5}    0
7  {3, 4}    0
8  {3, 5}    0
9  {4, 5}    0


In [27]:
count_candidate2_df

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
2,"{1, 4}",2
3,"{1, 5}",3
4,"{2, 3}",4
5,"{2, 4}",2
6,"{2, 5}",2
7,"{3, 4}",2
8,"{3, 5}",2
9,"{4, 5}",1


# Creating Second Frequent Itemset (L2)

In [28]:
#Filter the itemset based on minimum support (occurences of items)
freq_itemset2 = filter_sup(count_candidate2_df)
freq_itemset2

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
3,"{1, 5}",3
4,"{2, 3}",4


In [29]:
freq_itemset2_reset = freq_itemset2.reset_index(drop=True)

In [30]:
freq_itemset2_reset

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
2,"{1, 5}",3
3,"{2, 3}",4


# Creating the Third Candidate (C3) - Using the Candidate Forming Technique 

SELF JOIN

In [31]:
print(freq_itemset2_reset)
self_join_result = self_join(freq_itemset2_reset)
print('self join result')
print(self_join_result)

  itemset  sup
0  {1, 2}    5
1  {1, 3}    5
2  {1, 5}    3
3  {2, 3}    4
self join result
[{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}]


PRUNING

In [32]:
def get_subset(candidate):
    temp = []
    final = []
    for i in range(len(candidate)):
        for j in range(len(candidate)):
            if i != j:
                temp.append(candidate[j])
        temp_set = set(temp)
        final.append(temp_set)
        temp.clear()
    print('Subset from {} : {}'.format(candidate, final))
    return final

def pruning(candidate_set, prev_freq_itemset):
    print('Candidate set', candidate_set)
    temp = []
    
    for idx, value in enumerate(candidate_set):
        list_candidate = list(value)
        temp_candidate = (get_subset(list_candidate))
        
        for temp_item in temp_candidate:
            print('Temp item', temp_item)
            check = temp_item == prev_freq_itemset['itemset']
            print('\nCheck candidate from Previous Frequent Itemset\n', check)
            
            if any(check) == False:
                print(any(check))
                print('Val', value)
            else:
                print('\nAll of {} subset contained in \n{}'.format(candidate_set, prev_freq_itemset))
                if value not in temp:
                    temp.append(value)
                
    return temp

In [33]:
freq_itemset2_reset

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
2,"{1, 5}",3
3,"{2, 3}",4


In [34]:
subset = [{2, 3}, {1, 3}, {1, 2}]

In [35]:
self_join_result

[{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}]

In [36]:
for i in range(len(self_join_result)):
    get_subset(list(self_join_result[i]))

Subset from [1, 2, 3] : [{2, 3}, {1, 3}, {1, 2}]
Subset from [1, 2, 5] : [{2, 5}, {1, 5}, {1, 2}]
Subset from [1, 3, 5] : [{3, 5}, {1, 5}, {1, 3}]
Subset from [1, 2, 3, 5] : [{2, 3, 5}, {1, 3, 5}, {1, 2, 5}, {1, 2, 3}]


In [37]:
freq_itemset2_reset

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
2,"{1, 5}",3
3,"{2, 3}",4


In [38]:
for item in subset:
    print(item)
    check = item == freq_itemset2_reset['itemset']
    print('Check', any(check))

{2, 3}
Check True
{1, 3}
Check True
{1, 2}
Check True


In [39]:
self_join_result

[{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}]

In [40]:
candidate3_list = pruning(self_join_result, freq_itemset2_reset)

Candidate set [{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}]
Subset from [1, 2, 3] : [{2, 3}, {1, 3}, {1, 2}]
Temp item {2, 3}

Check candidate from Previous Frequent Itemset
 0    False
1    False
2    False
3     True
Name: itemset, dtype: bool

All of [{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}] subset contained in 
  itemset  sup
0  {1, 2}    5
1  {1, 3}    5
2  {1, 5}    3
3  {2, 3}    4
Temp item {1, 3}

Check candidate from Previous Frequent Itemset
 0    False
1     True
2    False
3    False
Name: itemset, dtype: bool

All of [{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}] subset contained in 
  itemset  sup
0  {1, 2}    5
1  {1, 3}    5
2  {1, 5}    3
3  {2, 3}    4
Temp item {1, 2}

Check candidate from Previous Frequent Itemset
 0     True
1    False
2    False
3    False
Name: itemset, dtype: bool

All of [{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {1, 2, 3, 5}] subset contained in 
  itemset  sup
0  {1, 2}    5
1  {1, 3}    5
2  {1, 5}    3
3  {2, 3}    4
Subset from [1, 2, 5] 

In [41]:
candidate3_list

[{1, 2, 3}, {1, 2, 5}, {1, 3, 5}]

# Creating the Third Frequent Itemset (L3)

In [42]:
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [43]:
#Checking the newest candidate value
candidate3_list

[{1, 2, 3}, {1, 2, 5}, {1, 3, 5}]

In [44]:
count_candidate3_df = count_support(df, candidate3_list)

Database D dataframe
           items
0        [1, 2]
1     [1, 2, 3]
2  [1, 2, 3, 4]
3     [1, 2, 5]
4     [1, 3, 5]
5        [4, 5]
6  [1, 2, 3, 5]
7        [1, 3]
8        [1, 4]
9     [2, 3, 4]
(Initial) Dataframe from Candidate with All zeros sup
      itemset  sup
0  {1, 2, 3}    0
1  {1, 2, 5}    0
2  {1, 3, 5}    0


In [45]:
count_candidate3_df

Unnamed: 0,itemset,sup
0,"{1, 2, 3}",3
1,"{1, 2, 5}",2
2,"{1, 3, 5}",2


In [46]:
freq_itemset3 = filter_sup(count_candidate3_df)
freq_itemset3

Unnamed: 0,itemset,sup
0,"{1, 2, 3}",3


All Frequent Itemset

In [47]:
freq_itemset1

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4


In [48]:
freq_itemset2

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
3,"{1, 5}",3
4,"{2, 3}",4


In [49]:
freq_itemset3

Unnamed: 0,itemset,sup
0,"{1, 2, 3}",3


In [50]:
frequent_itemset = pd.concat([freq_itemset1, freq_itemset2, freq_itemset3], axis=0)
frequent_itemset

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4
0,"{1, 2}",5
1,"{1, 3}",5
3,"{1, 5}",3
4,"{2, 3}",4
0,"{1, 2, 3}",3


In [51]:
#Reset the index
frequent_itemset_final = frequent_itemset.reset_index(drop=True)

Final Output of Freq. Itemset (L1-L3)

In [52]:
frequent_itemset_final

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4
5,"{1, 2}",5
6,"{1, 3}",5
7,"{1, 5}",3
8,"{2, 3}",4
9,"{1, 2, 3}",3
