<a href="https://colab.research.google.com/github/sp2743/datamining-samplequestion/blob/main/Frequent_itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Task 1: Implement the Apriori algorithm (mlxtend library) to find closed frequent patterns in a transaction dataset.

    1.1: Load a dataset of transactions. [For Demo: consider Table 6.1 dataset from the book]
    1.2: Apply the Apriori algorithm to find frequent itemsets with a minimum support threshold.  [10]
    1.3: Identify closed frequent patterns (an itemset is closed if no superset has the same support). [30]    

In [10]:
import pandas as pd
import numpy as np

In [11]:
D={'T100': ['I1','I2','I5'],
   'T200': ['I2','I4'],
   'T300': ['I2','I3'],
   'T400': ['I1','I2','I4'],
   'T500': ['I1','I3'],
   'T600': ['I2','I3'],
   'T700': ['I1','I3'],
   'T800': ['I1','I2','I3','I5'],
   'T900': ['I1','I2','I3']
   }

df = pd.DataFrame({'TransactionID': list(D.keys()), 'Items': list(D.values())})

print(df)

  TransactionID             Items
0          T100      [I1, I2, I5]
1          T200          [I2, I4]
2          T300          [I2, I3]
3          T400      [I1, I2, I4]
4          T500          [I1, I3]
5          T600          [I2, I3]
6          T700          [I1, I3]
7          T800  [I1, I2, I3, I5]
8          T900      [I1, I2, I3]


In [15]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

te = TransactionEncoder()
te_ary = te.fit(df['Items']).transform(df['Items'])
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
df_encoded

Unnamed: 0,I1,I2,I3,I4,I5
0,True,True,False,False,True
1,False,True,False,True,False
2,False,True,True,False,False
3,True,True,False,True,False
4,True,False,True,False,False
5,False,True,True,False,False
6,True,False,True,False,False
7,True,True,True,False,True
8,True,True,True,False,False


In [18]:
frequent_itemsets=apriori(df_encoded,min_support=0.3, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(I1)
1,0.777778,(I2)
2,0.666667,(I3)
3,0.444444,"(I2, I1)"
4,0.444444,"(I3, I1)"
5,0.444444,"(I2, I3)"


In [19]:
res=[]
for i in frequent_itemsets.itemsets:
    sup=frequent_itemsets[frequent_itemsets.itemsets==i].support.values[0]
    for j in frequent_itemsets.itemsets:
        if i.issubset(j) and i!=j:
            if sup==frequent_itemsets[frequent_itemsets.itemsets==j].support.values[0]:
                res.append(i)
                break
closed_itemsets=frequent_itemsets[~frequent_itemsets.itemsets.isin(res)]
closed_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(I1)
1,0.777778,(I2)
2,0.666667,(I3)
3,0.444444,"(I2, I1)"
4,0.444444,"(I3, I1)"
5,0.444444,"(I2, I3)"


### Task 2: Implement the FP-Growth algorithm (mlxtend library) to find maximal frequent patterns in a dataset.

    2.1: Load a dataset of transactions. [For Demo: consider Table 6.1 dataset from the book]
    2.2: Apply the FP-Growth algorithm to find frequent itemsets. [10]
    2.3: Identify maximal frequent patterns (an itemset is maximal if no superset is frequent). [30]

In [20]:
print(df)

  TransactionID             Items
0          T100      [I1, I2, I5]
1          T200          [I2, I4]
2          T300          [I2, I3]
3          T400      [I1, I2, I4]
4          T500          [I1, I3]
5          T600          [I2, I3]
6          T700          [I1, I3]
7          T800  [I1, I2, I3, I5]
8          T900      [I1, I2, I3]


In [21]:
from mlxtend.frequent_patterns import fpgrowth
frequent_itemsets=fpgrowth(df_encoded,min_support=0.3, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.777778,(I2)
1,0.666667,(I1)
2,0.666667,(I3)
3,0.444444,"(I2, I1)"
4,0.444444,"(I3, I1)"
5,0.444444,"(I2, I3)"


In [23]:
itemsets_dict = {frozenset(itemset): support for itemset, support in zip(frequent_itemsets['itemsets'], frequent_itemsets['support'])}
maximal_itemsets = []
for itemset in itemsets_dict.keys():
    is_maximal = True
    for superset in itemsets_dict.keys():
        if itemset < superset:
            is_maximal = False
            break
    if is_maximal:
        maximal_itemsets.append(itemset)

print("Maximal Frequent Itemsets:")
for itemset in maximal_itemsets:
    print(itemset, "-", itemsets_dict[itemset])


Maximal Frequent Itemsets:
frozenset({'I2', 'I1'}) - 0.4444444444444444
frozenset({'I3', 'I1'}) - 0.4444444444444444
frozenset({'I2', 'I3'}) - 0.4444444444444444


### Task3: Write a Program to convert dataset from horizontal format to vertical format and apply naive algorithm (intersection) to find frequent patterns.

    3.1: Load a dataset of transactions. [For Demo: consider Table 6.1 dataset from the book]
    3.2: Convert horizontal to vertical format. [10]
    3.3: Apply simple intersection based algorithm to find frequent patterns. [10]

In [24]:
print(df)

  TransactionID             Items
0          T100      [I1, I2, I5]
1          T200          [I2, I4]
2          T300          [I2, I3]
3          T400      [I1, I2, I4]
4          T500          [I1, I3]
5          T600          [I2, I3]
6          T700          [I1, I3]
7          T800  [I1, I2, I3, I5]
8          T900      [I1, I2, I3]


In [30]:
vertical_format={}
for i in D:
    for j in D[i]:
        if j not in vertical_format:
            vertical_format[j] = []
        vertical_format[j].append(i)
vertical_format

{'I1': ['T100', 'T400', 'T500', 'T700', 'T800', 'T900'],
 'I2': ['T100', 'T200', 'T300', 'T400', 'T600', 'T800', 'T900'],
 'I5': ['T100', 'T800'],
 'I4': ['T200', 'T400'],
 'I3': ['T300', 'T500', 'T600', 'T700', 'T800', 'T900']}

In [31]:
from itertools import combinations
frequent_item = {}
min_support = 3
for item1, item2 in combinations(vertical_format.keys(), 2):
    common_transactions = set(vertical_format[item1]).intersection(set(vertical_format[item2]))
    if len(common_transactions) >= min_support:
        frequent_item[(item1, item2)] = len(common_transactions)

# Print the frequent itemsets
print("Frequent 2-itemsets:", frequent_item)



Frequent 2-itemsets: {('I1', 'I2'): 4, ('I1', 'I3'): 4, ('I2', 'I3'): 4}
