In [None]:
# @title 1. ACLOSE algorithm sample transactions - closed frequent patterns
from collections import defaultdict
from itertools import combinations

import math  # needed only for the threshold below; imported here so this block is self-contained

# Sample transactional dataset: transaction id -> list of purchased items
transactions = {
    "T100": ["I1", "I2", "I5"],
    "T200": ["I2", "I4"],
    "T300": ["I2", "I3"],
    "T400": ["I1", "I2", "I4"],
    "T500": ["I1", "I3"],
    "T600": ["I2", "I3"],
    "T700": ["I1", "I3"],
    "T800": ["I1", "I2", "I3", "I5"],
    "T900": ["I1", "I2", "I3"]
}

# Minimum support expressed as a fraction of all transactions.
min_support = 0.1

# Absolute support threshold used by the helper functions in this cell.
# The fraction is rounded UP: int(0.1 * 9) would truncate 0.9 to 0 and make the
# threshold meaningless. The tiny epsilon keeps floating-point noise from
# pushing an exact product over the next integer, and the clamp to 1 guarantees
# that an itemset must occur at least once to count as frequent.
min_count = max(1, math.ceil(min_support * len(transactions) - 1e-9))

# Generate candidate itemsets of size 1
def get_candidate_1_itemsets(transactions):
    """Return the frequent single-item itemsets.

    Args:
        transactions: mapping whose values are iterables of items.

    Returns:
        A set of one-element frozensets, one per item whose number of
        occurrences is at least the module-level ``min_count``.
    """
    occurrences = defaultdict(int)
    for items in transactions.values():
        for single in items:
            occurrences[frozenset([single])] += 1

    frequent_singletons = set()
    for singleton, seen in occurrences.items():
        if seen >= min_count:
            frequent_singletons.add(singleton)
    return frequent_singletons

# Generate candidate itemsets of size k from frequent (k-1)-itemsets
def generate_candidates(prev_frequent_itemsets, k):
    """Build size-``k`` candidates by joining pairs of frequent (k-1)-itemsets.

    Args:
        prev_frequent_itemsets: collection of frozensets (the previous level).
        k: size of the candidates to produce.

    Returns:
        A set of frozensets of size ``k`` whose every (k-1)-subset is a member
        of ``prev_frequent_itemsets``.
    """
    ordered = list(prev_frequent_itemsets)
    candidates = set()
    for left, right in combinations(ordered, 2):
        merged = left | right
        if len(merged) != k:
            continue
        # Keep the union only when all of its (k-1)-subsets were frequent.
        if all(frozenset(part) in prev_frequent_itemsets
               for part in combinations(merged, k - 1)):
            candidates.add(merged)
    return candidates

# Calculate support count for each candidate itemset
def calculate_support(transactions, candidates):
    """Count how many transactions contain each candidate itemset.

    Args:
        transactions: mapping whose values are iterables of items.
        candidates: iterable of frozensets to count.

    Returns:
        A dict ``{itemset: count}`` limited to candidates that were found in
        at least one transaction and whose count is at least the module-level
        ``min_count``.
    """
    tallies = defaultdict(int)
    for items in transactions.values():
        basket = frozenset(items)
        for candidate in candidates:
            if candidate.issubset(basket):
                tallies[candidate] += 1

    frequent = {}
    for itemset, seen in tallies.items():
        if seen >= min_count:
            frequent[itemset] = seen
    return frequent

# A-Close style level-wise search for closed frequent itemsets
def alcose_algorithm(transactions, min_support):
    """Return the closed frequent itemsets of ``transactions``.

    An itemset is *closed* when no proper superset has the same support count.
    Frequent itemsets are mined level by level (size 1, 2, ...). Each itemset
    of size k is then compared with the frequent itemsets of size k + 1.
    Checking only the next level is sufficient: support never increases when
    items are added, so if any larger superset had equal support, an
    intermediate (k + 1)-superset would have it too.

    The previous implementation compared itemsets of the same size with each
    other, so the subset test was never true and every frequent itemset was
    returned as "closed".

    Args:
        transactions: mapping whose values are iterables of items.
        min_support: kept for interface compatibility. The threshold actually
            applied is the module-level ``min_count`` used by
            ``get_candidate_1_itemsets`` and ``calculate_support``.

    Returns:
        A list of frozensets, ordered by itemset size.
    """
    # Support counts of the frequent 1-itemsets.
    current_level = calculate_support(transactions, get_candidate_1_itemsets(transactions))

    closed_itemsets = []
    k = 1
    while current_level:
        # Frequent itemsets one size larger, with their support counts.
        candidates = generate_candidates(set(current_level), k + 1)
        next_level = calculate_support(transactions, candidates)

        for itemset, count in current_level.items():
            has_equal_superset = any(
                itemset < superset and count == superset_count
                for superset, superset_count in next_level.items()
            )
            if not has_equal_superset:
                closed_itemsets.append(itemset)

        current_level = next_level
        k += 1

    return closed_itemsets

# Run ALCose algorithm
frequent_closed_itemsets = alcose_algorithm(transactions, min_support)

# Report: a header line, then each returned itemset on its own line as a plain set
print("Frequent Closed Itemsets with min_support =", min_support)
for closed_itemset in frequent_closed_itemsets:
    print(set(closed_itemset))


Frequent Closed Itemsets with min_support = 0.1
{'I4'}
{'I2'}
{'I1'}
{'I3'}
{'I5'}
{'I3', 'I1'}
{'I2', 'I4'}
{'I1', 'I2'}
{'I1', 'I4'}
{'I3', 'I5'}
{'I2', 'I5'}
{'I1', 'I5'}
{'I3', 'I2'}
{'I1', 'I2', 'I4'}
{'I3', 'I2', 'I1'}
{'I1', 'I2', 'I5'}
{'I3', 'I1', 'I5'}
{'I3', 'I2', 'I5'}
{'I3', 'I1', 'I2', 'I5'}


In [None]:
# @title 1. ACLOSE algorithm on mushroom dataset - closed frequent patterns
from collections import defaultdict
from itertools import combinations

# Load the mushroom dataset
def load_data(filename):
    """Read a transaction file into a list of frozensets of integer items.

    Every non-blank line is one transaction made of whitespace-separated
    integers. Blank or whitespace-only lines are skipped; previously they were
    stored as empty transactions, which inflated the transaction count that the
    support threshold is derived from.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one frozenset of ints per non-blank line, in file order.

    Raises:
        ValueError: if a token on a line is not an integer.
    """
    dataset = []
    with open(filename, 'r') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                continue  # blank line: not a transaction
            dataset.append(frozenset(map(int, tokens)))
    return dataset

# Calculate support for each itemset
def calculate_support(dataset, itemset):
    """Return the number of transactions in ``dataset`` that contain ``itemset``.

    Args:
        dataset: iterable of transactions (each an iterable of items).
        itemset: set/frozenset whose items must all be present.
    """
    matches = 0
    for transaction in dataset:
        if itemset.issubset(transaction):
            matches += 1
    return matches

# Find all frequent itemsets
def find_frequent_itemsets(dataset, min_support):
    """Mine all frequent itemsets level by level (Apriori style).

    Args:
        dataset: list of transactions, each a set/frozenset of items.
        min_support: minimum support as a fraction of the number of
            transactions.

    Returns:
        A list ``L`` of dicts where ``L[i]`` maps every frequent itemset of
        size ``i + 1`` (a frozenset) to its support count. The last element is
        always an empty dict, because the search stops at the first empty level.
    """
    import math  # function-scope import keeps this definition self-contained

    num_transactions = len(dataset)
    # Round the fractional threshold UP: truncating (e.g. 4874.4 -> 4874) would
    # admit itemsets that fall short of the requested fraction. The epsilon
    # absorbs floating-point noise; the clamp to 1 keeps itemsets that occur in
    # no transaction from ever being treated as frequent.
    min_support_count = max(1, math.ceil(num_transactions * min_support - 1e-9))

    items = {item for transaction in dataset for item in transaction}
    L1 = {}
    for item in items:
        single = frozenset([item])
        support = calculate_support(dataset, single)
        if support >= min_support_count:
            L1[single] = support
    L = [L1]  # List to store all levels of frequent itemsets

    k = 2
    while L[k - 2]:
        previous = L[k - 2]              # dict: O(1) membership for the prune step
        prev_itemsets = list(previous)
        Ck = {}
        seen = set()                     # the same union arises from several pairs
        for i in range(len(prev_itemsets)):
            for j in range(i + 1, len(prev_itemsets)):
                candidate = prev_itemsets[i] | prev_itemsets[j]
                if len(candidate) != k or candidate in seen:
                    continue
                seen.add(candidate)
                # Apriori prune: every (k-1)-subset must itself be frequent.
                if all((candidate - {item}) in previous for item in candidate):
                    support = calculate_support(dataset, candidate)
                    if support >= min_support_count:
                        Ck[candidate] = support
        L.append(Ck)
        k += 1

    return L

# Extract closed itemsets
def extract_closed_itemsets(L):
    """Keep only the closed itemsets from a list of per-level support dicts.

    Args:
        L: list of dicts mapping itemset (frozenset) -> support count.

    Returns:
        A dict ``{itemset: support}`` containing every itemset for which no
        proper superset anywhere in ``L`` has the same support count.
    """
    def has_equal_support_superset(itemset, support):
        # True as soon as any entry of any level is a strict superset with equal support.
        return any(
            itemset < other and support == other_support
            for level in L
            for other, other_support in level.items()
        )

    return {
        itemset: support
        for level in L
        for itemset, support in level.items()
        if not has_equal_support_superset(itemset, support)
    }

# Main function to run A-Close algorithm
def aclose_algorithm(filename, min_support):
    """Load a transaction file and return its closed frequent itemsets.

    Chains ``load_data`` -> ``find_frequent_itemsets`` ->
    ``extract_closed_itemsets``.

    Args:
        filename: path passed to ``load_data``.
        min_support: support fraction passed to ``find_frequent_itemsets``.

    Returns:
        Whatever ``extract_closed_itemsets`` returns for the mined levels.
    """
    levels = find_frequent_itemsets(load_data(filename), min_support)
    return extract_closed_itemsets(levels)

# Set parameters
# Input file (Google Drive mount path) and support fraction for this run
filename = '/content/drive/MyDrive/mushroom_dataset/mushroom.dat'
min_support = 0.6

# Mine the closed patterns, then print each one with its support count
closed_patterns = aclose_algorithm(filename, min_support)
print("Closed Frequent Patterns and their Supports:")
for pattern in closed_patterns:
    support = closed_patterns[pattern]
    print(f"Pattern: {set(pattern)}, Support: {support}")


Closed Frequent Patterns and their Supports:
Pattern: {85}, Support: 8124
Pattern: {34, 85}, Support: 7914
Pattern: {36, 85}, Support: 6812
Pattern: {85, 39}, Support: 5612
Pattern: {59, 85}, Support: 5176
Pattern: {85, 63}, Support: 4936
Pattern: {85, 86}, Support: 7924
Pattern: {90, 85}, Support: 7488
Pattern: {34, 85, 86}, Support: 7906
Pattern: {34, 90, 85}, Support: 7296
Pattern: {36, 85, 86}, Support: 6620
Pattern: {90, 36, 85}, Support: 6464
Pattern: {85, 86, 39}, Support: 5420
Pattern: {90, 85, 39}, Support: 4976
Pattern: {34, 36, 85, 86}, Support: 6602
Pattern: {34, 85, 86, 39}, Support: 5402
Pattern: {34, 59, 85, 86}, Support: 4984
Pattern: {34, 90, 85, 86}, Support: 7288
Pattern: {34, 36, 85, 86, 90}, Support: 6272


In [None]:
# @title 2. Pincer Search on sample transaction - maximal frequent patterns
from collections import defaultdict
from itertools import combinations

import math  # needed only for the threshold below; imported here so this block is self-contained

# Sample transactional dataset: transaction id -> list of purchased items
transactions = {
    "T100": ["I1", "I2", "I5"],
    "T200": ["I2", "I4"],
    "T300": ["I2", "I3"],
    "T400": ["I1", "I2", "I4"],
    "T500": ["I1", "I3"],
    "T600": ["I2", "I3"],
    "T700": ["I1", "I3"],
    "T800": ["I1", "I2", "I3", "I5"],
    "T900": ["I1", "I2", "I3"]
}

# Minimum support expressed as a fraction of all transactions.
min_support = 0.1

# Absolute support threshold handed to pincer_search.
# The fraction is rounded UP: int(0.1 * 9) would truncate 0.9 to 0, and with a
# threshold of 0 even itemsets that occur in no transaction pass the
# `support >= min_count` test. The epsilon absorbs floating-point noise and the
# clamp to 1 guarantees an itemset must occur at least once to be frequent.
min_count = max(1, math.ceil(min_support * len(transactions) - 1e-9))

# Calculate support count for an itemset
def calculate_support(transactions, itemset):
    """Return how many transactions contain every item of ``itemset``.

    Args:
        transactions: mapping whose values are iterables of items.
        itemset: set/frozenset to look for.
    """
    return sum(1 for basket in transactions.values() if itemset.issubset(basket))

# Generate candidate itemsets of size k
def generate_candidates(frequent_itemsets, k):
    """Join pairs of frequent itemsets into size-``k`` candidates.

    Args:
        frequent_itemsets: collection of frozensets from the previous level.
        k: size of the candidates to produce.

    Returns:
        A set of frozensets of size ``k`` for which every (k-1)-subset is a
        member of ``frequent_itemsets``.
    """
    pool = list(frequent_itemsets)
    candidates = set()
    for idx, first in enumerate(pool):
        for second in pool[idx + 1:]:
            union = first | second
            if len(union) == k and all(
                frozenset(smaller) in frequent_itemsets
                for smaller in combinations(union, k - 1)
            ):
                candidates.add(union)
    return candidates

# Pincer-Search Algorithm
def pincer_search(transactions, min_count):
    """Return the maximal frequent itemsets of ``transactions``.

    An itemset is frequent when ``calculate_support`` reports at least
    ``min_count`` transactions containing it, and *maximal* when none of its
    proper supersets is frequent.

    The search is bottom-up and level-wise: the frequent itemsets of size k
    are extended to size k + 1, and a size-k itemset is reported as maximal
    when no frequent size-(k + 1) itemset contains it. Checking only the next
    level is sufficient, because every subset of a frequent itemset is
    frequent. The top-down candidate set (MFCS) of the full Pincer-Search
    algorithm is not maintained here.

    The previous implementation filtered on "contains an infrequent subset",
    which can never hold for a frequent itemset. It therefore returned every
    frequent itemset of size >= 2 and never reported maximal single items.

    Args:
        transactions: mapping whose values are iterables of items.
        min_count: minimum absolute support count.

    Returns:
        A list of frozensets, ordered by itemset size.
    """
    # Frequent 1-itemsets.
    singletons = {frozenset([item])
                  for items in transactions.values() for item in items}
    current_level = {itemset for itemset in singletons
                     if calculate_support(transactions, itemset) >= min_count}

    max_frequent_patterns = []
    k = 1
    while current_level:
        k += 1
        # Frequent itemsets one size larger.
        next_level = {
            candidate
            for candidate in generate_candidates(current_level, k)
            if calculate_support(transactions, candidate) >= min_count
        }

        # Maximal at this size = not contained in any frequent larger itemset.
        max_frequent_patterns.extend(
            itemset for itemset in current_level
            if not any(itemset < bigger for bigger in next_level)
        )

        current_level = next_level

    return max_frequent_patterns

# Run Pincer-Search algorithm
maximal_frequent_patterns = pincer_search(transactions, min_count)

# Report: a header line, then each returned pattern on its own line as a plain set
print("Maximal Frequent Patterns with min_support =", min_support)
for maximal_pattern in maximal_frequent_patterns:
    print(set(maximal_pattern))



Maximal Frequent Patterns with min_support = 0.1
{'I5', 'I4'}
{'I3', 'I1'}
{'I3', 'I4'}
{'I2', 'I4'}
{'I2', 'I1'}
{'I1', 'I4'}
{'I3', 'I5'}
{'I2', 'I5'}
{'I1', 'I5'}
{'I3', 'I2'}
{'I3', 'I1', 'I4'}
{'I2', 'I1', 'I4'}
{'I1', 'I5', 'I4'}
{'I3', 'I2', 'I4'}
{'I3', 'I2', 'I1'}
{'I2', 'I1', 'I5'}
{'I3', 'I1', 'I5'}
{'I3', 'I5', 'I4'}
{'I2', 'I5', 'I4'}
{'I3', 'I2', 'I5'}
{'I3', 'I2', 'I5', 'I4'}
{'I3', 'I1', 'I2', 'I5'}
{'I1', 'I2', 'I5', 'I4'}
{'I3', 'I2', 'I1', 'I4'}
{'I3', 'I1', 'I5', 'I4'}
{'I2', 'I1', 'I4', 'I3', 'I5'}


In [None]:
# @title 2. Pincer Search on mushroom dataset - maximal frequent patterns
from collections import defaultdict
from itertools import combinations

# Load the mushroom dataset
def load_data(filename):
    """Read a transaction file into a list of frozensets of integer items.

    Every non-blank line is one transaction made of whitespace-separated
    integers. Blank or whitespace-only lines are skipped; previously they were
    stored as empty transactions, which inflated the transaction count that the
    support threshold is derived from.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one frozenset of ints per non-blank line, in file order.

    Raises:
        ValueError: if a token on a line is not an integer.
    """
    dataset = []
    with open(filename, 'r') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                continue  # blank line: not a transaction
            dataset.append(frozenset(map(int, tokens)))
    return dataset

# Calculate support for each itemset
def calculate_support(dataset, itemset):
    """Count the transactions of ``dataset`` that include all items of ``itemset``.

    Args:
        dataset: iterable of transactions (each an iterable of items).
        itemset: set/frozenset to look for.
    """
    containing = [transaction for transaction in dataset if itemset.issubset(transaction)]
    return len(containing)

# Generate candidate itemsets by extending frequent itemsets
def generate_candidates(frequent_itemsets, k):
    """Return every union of two frequent itemsets that has exactly ``k`` items.

    No subset-based pruning is applied; callers filter by support afterwards.

    Args:
        frequent_itemsets: iterable of frozensets.
        k: required size of the produced candidates.

    Returns:
        A set of frozensets of size ``k``.
    """
    pool = list(frequent_itemsets)
    return {
        first | second
        for first, second in combinations(pool, 2)
        if len(first | second) == k
    }

# Perform the Pincer-Search algorithm to find maximal patterns
def pincer_search_maximal(dataset, min_support_fraction):
    """Return the maximal frequent itemsets of ``dataset`` with their supports.

    An itemset is maximal when it is frequent and no proper superset is
    frequent. The search is bottom-up and level-wise: after the frequent
    itemsets of size k are known, every frequent itemset of size k - 1 that is
    contained in one of them is dropped from the result.

    Changes from the previous version:
      * frequent 1-itemsets now start out in the result, so an item that is
        frequent on its own but in no frequent pair is reported;
      * the fractional threshold is rounded up instead of truncated;
      * supports are remembered when first computed instead of being
        recounted at the end.

    Args:
        dataset: list of transactions, each a set/frozenset of items.
        min_support_fraction: minimum support as a fraction of the number of
            transactions.

    Returns:
        A dict mapping each maximal frequent itemset (frozenset) to its
        support count.
    """
    import math  # function-scope import keeps this definition self-contained

    num_transactions = len(dataset)
    # Round up so an itemset just below the requested fraction is rejected;
    # the epsilon absorbs floating-point noise, the clamp keeps the threshold >= 1.
    min_support_count = max(1, math.ceil(num_transactions * min_support_fraction - 1e-9))

    # Initial frequent 1-itemsets
    item_counts = defaultdict(int)
    for transaction in dataset:
        for item in transaction:
            item_counts[frozenset([item])] += 1

    # itemset -> support for the frequent itemsets of the current size
    current_level = {itemset: count for itemset, count in item_counts.items()
                     if count >= min_support_count}

    # Every frequent itemset is provisionally maximal until a frequent superset appears.
    maximal_itemsets_with_support = dict(current_level)

    k = 2
    while current_level:
        next_level = {}
        for candidate in generate_candidates(set(current_level), k):
            support = calculate_support(dataset, candidate)
            if support >= min_support_count:
                next_level[candidate] = support

        # Only itemsets of the previous size can be covered by the new level;
        # smaller ones were already checked against their own next level.
        for itemset in current_level:
            if any(itemset < bigger for bigger in next_level):
                del maximal_itemsets_with_support[itemset]

        maximal_itemsets_with_support.update(next_level)
        current_level = next_level
        k += 1

    return maximal_itemsets_with_support

# Set parameters
# Input file (Google Drive mount path) and support fraction for this run
filename = '/content/drive/MyDrive/mushroom_dataset/mushroom.dat'
min_support_fraction = 0.6

# Load the transactions, mine the maximal patterns, then print each with its support
dataset = load_data(filename)
maximal_patterns = pincer_search_maximal(dataset, min_support_fraction)
print("Maximal Frequent Patterns and their Supports:")
for pattern in maximal_patterns:
    support = maximal_patterns[pattern]
    print(f"Pattern: {set(pattern)}, Support: {support}")


Maximal Frequent Patterns and their Supports:
Pattern: {34, 59, 85, 86}, Support: 4984
Pattern: {85, 63}, Support: 4936
Pattern: {34, 85, 86, 39}, Support: 5402
Pattern: {90, 85, 39}, Support: 4976
Pattern: {34, 36, 85, 86, 90}, Support: 6272


In [None]:
# @title 3. Decision tree model - class example
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# Sample dataset based on the provided table
# Column-oriented table with 14 rows; every list below has one entry per row.
# - 'age' uses three bucket labels: '<=30', '31…40' and '>40'. The middle label
#   is written with a single ellipsis character (…), not three dots, and
#   decision_tree() compares against exactly that spelling.
# - 'income' is part of the table but is not read by decision_tree().
# - 'buys_computer' is the label the predictions are compared against.
data = {
    'age': ['<=30', '<=30', '31…40', '>40', '>40', '>40', '31…40', '<=30', '<=30', '>40', '<=30', '31…40', '31…40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

# Convert data into a DataFrame (one column per key above)
df = pd.DataFrame(data)

# Function to predict based on the decision tree logic
def decision_tree(row):
    """Predict 'yes'/'no' for one record using a fixed, hand-written tree.

    The root splits on ``row['age']``:
      * '31…40' -> 'yes'
      * '<=30'  -> 'yes' if ``row['student']`` is 'yes', otherwise 'no'
      * '>40'   -> 'no' if ``row['credit_rating']`` is 'excellent', otherwise 'yes'

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing 'age' and,
            depending on the branch taken, 'student' or 'credit_rating'.

    Returns:
        'yes' or 'no', or None when the age value matches none of the buckets.
    """
    age = row['age']

    if age == '31…40':
        return 'yes'

    if age == '<=30':
        # Young customers: the student flag decides.
        return 'yes' if row['student'] == 'yes' else 'no'

    if age == '>40':
        # Older customers: only an 'excellent' credit rating predicts 'no'.
        return 'no' if row['credit_rating'] == 'excellent' else 'yes'

    return None

# Apply the decision tree function to each row
# Predict every row with the hand-written tree
df['prediction'] = df.apply(decision_tree, axis=1)

# Show the columns the tree reads next to the actual and predicted labels
print(df[['age', 'student', 'credit_rating', 'buys_computer', 'prediction']])

# Accuracy: share of rows whose prediction equals the label, as a percentage
accuracy = df['buys_computer'].eq(df['prediction']).mean() * 100
print(f"\nAccuracy: {accuracy:.2f}%")

# Confusion matrix with 'yes' as the first row and column
conf_matrix = confusion_matrix(df['buys_computer'], df['prediction'], labels=['yes', 'no'])
conf_df = pd.DataFrame(
    conf_matrix,
    index=['Actual Yes', 'Actual No'],
    columns=['Predicted Yes', 'Predicted No'],
)

print("\nConfusion Matrix:")
print(conf_df)

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(df['buys_computer'], df['prediction'], target_names=['No', 'Yes']))


      age student credit_rating buys_computer prediction
0    <=30      no          fair            no         no
1    <=30      no     excellent            no         no
2   31…40      no          fair           yes        yes
3     >40      no          fair           yes        yes
4     >40     yes          fair           yes        yes
5     >40     yes     excellent            no         no
6   31…40     yes     excellent           yes        yes
7    <=30      no          fair            no         no
8    <=30     yes          fair           yes        yes
9     >40     yes          fair           yes        yes
10   <=30     yes     excellent           yes        yes
11  31…40      no     excellent           yes        yes
12  31…40     yes          fair           yes        yes
13    >40      no     excellent            no         no

Accuracy: 100.00%

Confusion Matrix:
            Predicted Yes  Predicted No
Actual Yes              9             0
Actual No               0  

In [None]:
# @title 3. Decision tree model - breast cancer cell classification
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Breast Cancer dataset
# Load the Breast Cancer dataset into a feature frame and a target series
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Hold out 30% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a decision tree on the training split (fit() returns the estimator itself)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split
y_pred = dt_model.predict(X_test)

# Overall accuracy, printed as a percentage
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Per-class metrics and the raw confusion matrix
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 94.15204678362574
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.95      0.92        63
           1       0.97      0.94      0.95       108

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.94       171
weighted avg       0.94      0.94      0.94       171

Confusion Matrix:
 [[ 60   3]
 [  7 101]]
