In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

In [6]:


# Example corpus: four short documents used to demonstrate bag-of-words counts.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the corpus into a document-term count matrix
count_matrix = vectorizer.fit_transform(corpus)

# Convert the counts with the imported `csr_matrix` constructor.
# The result is stored under its own name: rebinding `csr_matrix` itself would
# shadow the imported class, so re-running this cell (or any later use of the
# constructor) would fail with "object is not callable".
count_csr = csr_matrix(count_matrix)

# Print the dense view (one row per document, one column per vocabulary term)
print(count_csr.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [8]:
class Apriori:
    def __init__(self, transactions, min_support, min_confidence):
        self.transactions = transactions
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.support_data = {}

    def generate_C1(self):
        C1 = []
        for transaction in self.transactions:
            for item in transaction:
                if [item] not in C1:
                    C1.append([item])
        C1.sort()
        return list(map(frozenset, C1))

    def generate_Lk_from_Ck(self, Ck):
        item_count = {}
        for transaction in self.transactions:
            for itemset in Ck:
                if itemset.issubset(transaction):
                    item_count[itemset] = item_count.get(itemset, 0) + 1

        Lk = []
        transaction_count = len(self.transactions)
        for itemset, count in item_count.items():
            support = count / transaction_count
            if support >= self.min_support:
                Lk.append(itemset)
                self.support_data[itemset] = support

        return Lk

    def generate_Ck(self, Lk_1, k):
        Ck = []
        len_Lk_1 = len(Lk_1)
        for i in range(len_Lk_1):
            for j in range(i + 1, len_Lk_1):
                itemset1 = list(Lk_1[i])
                itemset2 = list(Lk_1[j])
                itemset1.sort()
                itemset2.sort()

                # Check if the first k-2 items are the same
                if itemset1[:k-2] == itemset2[:k-2]:
                    # Create a new candidate itemset by merging itemset1 and itemset2
                    new_itemset = frozenset(itemset1 + [itemset2[-1]])
                    Ck.append(new_itemset)
        return Ck

    def apriori(self):
        C1 = self.generate_C1()
        L1 = self.generate_Lk_from_Ck(C1)
        k = 2
        frequent_itemsets = [L1]
        while len(frequent_itemsets[k-2]) > 0:
            Ck = self.generate_Ck(frequent_itemsets[k-2], k)
            Lk = self.generate_Lk_from_Ck(Ck)
            frequent_itemsets.append(Lk)
            k += 1

        return frequent_itemsets


# Sample data: each inner list is one transaction of item ids.
lst_trans = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]



In [9]:
# Sample data: each inner list is one transaction of item ids.
lst_trans = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

# Mine the itemsets, then collect them level by level.
model = Apriori(
    transactions=lst_trans,
    min_support=0.3,
    min_confidence=0.7,
)
frequent_itemsets = model.apriori()

# Report every level; numbering starts at 1 so the label matches the itemset size.
for level, itemsets in enumerate(frequent_itemsets, start=1):
    print("L{}: {}".format(level, itemsets))

L1: [frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})]
L2: [frozenset({1, 3}), frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5})]
L3: [frozenset({2, 3, 5})]
L4: []
