In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from prettytable import PrettyTable
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax

In [None]:
# Load the raw purchase records; later cells use the Member_number, Date and
# itemDescription columns of this frame.
groceries = pd.read_csv("../input/groceries-dataset/Groceries_dataset.csv")

In [None]:
# Preview the first rows of the raw data.
groceries.head()

## Brute force

In [None]:
# One row per (Member_number, Date) pair with the purchased item descriptions
# collected into a list. Display only: the result is not stored.
groceries.groupby(['Member_number', 'Date'], as_index=False).agg({'itemDescription': list})

In [None]:
# Shape of the array of distinct item descriptions, i.e. how many different items occur.
groceries['itemDescription'].unique().shape

In [None]:
# One basket (list of item descriptions) per (Member_number, Date) group,
# in the order the groupby yields its groups.
all_transactions = [
    basket['itemDescription'].tolist()
    for _visit_key, basket in groceries.groupby(['Member_number', 'Date'])
]

In [None]:
# Number of baskets, one per (Member_number, Date) group.
len(all_transactions)

In [None]:
# Peek at the first ten baskets.
all_transactions[0:10]

In [None]:
# Instantiate the encoder and learn the item vocabulary from the baskets.
trans_encoder = TransactionEncoder()
trans_encoder.fit(all_transactions)

# Encode the baskets as a transaction x item table; the column labels are the
# item names the encoder learned (trans_encoder.columns_).
trans_encoder_matrix = pd.DataFrame(
    trans_encoder.transform(all_transactions),
    columns=trans_encoder.columns_,
)

In [None]:
# Preview the encoded transaction x item table.
trans_encoder_matrix.head()

In [None]:
# Frequent itemsets via Apriori with a very low support threshold (0.0001),
# capped at itemsets of length 2 (max_len=2).
rule_items = apriori(trans_encoder_matrix, min_support=0.0001, use_colnames=True, max_len=2)

# Derive association rules from those itemsets, using lift with a threshold of 1.
rules = association_rules(rule_items, metric="lift", min_threshold=1)

# Show the rules from highest to lowest lift (the sorted frame is not stored).
rules.sort_values('lift', ascending=False)

In [None]:
# Inspect the frequent itemsets found by the Apriori call above.
rule_items

In [None]:
def perform_rule_calculation(transact_items_matrix, rule_type="fpgrowth", min_support=0.001):
    """
    desc: mine frequent itemsets with FP-Growth or Apriori and time the mining call
    @params:
        - transact_items_matrix: the transaction X Items matrix
        - rule_type:
                    - "fpgrowth" runs FP-Growth (default="fpgrowth")
                    - any other value runs Apriori

        - min_support: minimum support threshold value (default = 0.001)

    @returns:
        - the matrix containing 3 columns:
            - support: support values for each combination of items
            - itemsets: the combination of items
            - number_of_items: the number of items in each combination of items

        - the execution time (in seconds) of the mining call alone; adding the
          number_of_items column is not included in the measurement

    """
    # Pick the miner once so the timing code is written a single time.
    if rule_type == "fpgrowth":
        miner, done_message = fpgrowth, "Computed Fp Growth!"
    else:
        miner, done_message = apriori, "Computed Apriori!"

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    rule_items = miner(transact_items_matrix,
                       min_support=min_support,
                       use_colnames=True)
    total_execution = time.perf_counter() - start_time
    print(done_message)

    rule_items['number_of_items'] = rule_items['itemsets'].apply(len)

    return rule_items, total_execution

def compute_association_rule(rule_matrix, metric="lift", min_thresh=1):
    """
    @desc: derive association rules from a frequent-itemset matrix
    @params:
        - rule_matrix: frequent-itemset matrix produced by one of the mining algorithms
        - metric: the metric forwarded to association_rules (default is lift)
        - min_thresh: the threshold forwarded as min_threshold (default is 1)

    @returns:
        - rules: whatever association_rules returns for the given metric & threshold
    """
    return association_rules(rule_matrix, metric=metric, min_threshold=min_thresh)

def plot_metrics_relationship(rule_matrix, col1, col2):
    """
    desc: scatter two rule metrics against each other and overlay a straight-line fit
    @params:
        - rule_matrix: the matrix containing the result of a rule (apriori or Fp Growth)
        - col1: first column (x axis)
        - col2: second column (y axis)

    Rows where either value is not finite (NaN or +/-inf) are left out of both
    the fit and the plot, because np.polyfit cannot fit through them.
    """
    x_values = np.asarray(rule_matrix[col1], dtype=float)
    y_values = np.asarray(rule_matrix[col2], dtype=float)

    # Keep only the pairs where both coordinates are usable numbers.
    finite_pairs = np.isfinite(x_values) & np.isfinite(y_values)
    x_values = x_values[finite_pairs]
    y_values = y_values[finite_pairs]

    # Degree-1 least-squares fit, wrapped as a callable polynomial.
    trend_line = np.poly1d(np.polyfit(x_values, y_values, 1))

    # Yellow dots for the observations, then the fitted line at the same x positions.
    plt.plot(x_values, y_values, 'yo', x_values, trend_line(x_values))
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.title('{} vs {}'.format(col1, col2))
    
def compare_time_exec(algo1=list, alg2=list):
    """
    @desc: bar chart comparing the execution time of two algorithms
    @params:
        - algo1: sequence describing the first algorithm, where
            index 0 is the label shown on the x axis and index 1 is its execution time
        - alg2: sequence describing the second algorithm, same layout as algo1
    """
    # The second parameter is spelled `alg2` in the signature; the body used to
    # read an undefined `algo2`, which raised NameError on every call.
    algo2 = alg2

    algo_names = (algo1[0], algo2[0])
    execution_times = [algo1[1], algo2[1]]
    y = np.arange(len(algo_names))

    plt.bar(y, execution_times, color=['orange', 'blue'])
    plt.xticks(y, algo_names)
    plt.xlabel('Algorithms')
    plt.ylabel('Time')
    plt.title("Execution Time (seconds) Comparison")
    plt.show()

## FP-Growth Algorithm

In [None]:
# Mine frequent itemsets with the function defaults (FP-Growth, min_support=0.001)
# and report how long the mining call took.
fpgrowth_matrix, fp_growth_exec_time = perform_rule_calculation(trans_encoder_matrix)
print(f"Fp Growth execution took: {fp_growth_exec_time} seconds")

In [None]:
# Preview the FP-Growth itemsets (support, itemsets, number_of_items).
fpgrowth_matrix.head()

In [None]:
# Rules from the FP-Growth itemsets using the helper's defaults (metric="lift", min_thresh=1).
fp_growth_rule_lift = compute_association_rule(fpgrowth_matrix)

In [None]:
# Preview the lift-based FP-Growth rules.
fp_growth_rule_lift.head()

In [None]:
# Lift (x) against confidence (y) for the FP-Growth rules, with a linear fit.
plot_metrics_relationship(fp_growth_rule_lift, col1='lift', col2='confidence')

In [None]:
# Same FP-Growth itemsets, but rules are selected on confidence with a 0.2 threshold.
fp_growth_rule = compute_association_rule(fpgrowth_matrix, metric="confidence", min_thresh=0.2)
fp_growth_rule.head()

## Apriori

In [None]:
# Same mining step as for FP-Growth, but any rule_type other than "fpgrowth"
# makes the helper run Apriori; the default min_support (0.001) is used.
apriori_matrix, apriori_exec_time = perform_rule_calculation(trans_encoder_matrix, rule_type="apriori")
print(f"Apriori Execution took: {apriori_exec_time} seconds")

In [None]:
# Preview the Apriori itemsets (support, itemsets, number_of_items).
apriori_matrix.head()

In [None]:
# Rules from the Apriori itemsets using the helper's defaults (metric="lift", min_thresh=1).
apriori_rule_lift = compute_association_rule(apriori_matrix)
apriori_rule_lift

In [None]:
# Sanity check: confidence should equal support / antecedent support.
# Both sides are floats, so compare with a tolerance (np.isclose) rather than
# `==`, which can report False for values that differ only by rounding error.
# The result is wrapped in a Series so it keeps the rule index, as `==` did.
pd.Series(
    np.isclose(
        apriori_rule_lift['confidence'],
        apriori_rule_lift['support'] / apriori_rule_lift['antecedent support'],
    ),
    index=apriori_rule_lift.index,
)

In [None]:
# Lift (x) against confidence (y) for the Apriori rules, with a linear fit.
plot_metrics_relationship(apriori_rule_lift, col1='lift', col2='confidence')

In [None]:
# Lift (x) against conviction (y) for the Apriori rules, with a linear fit.
plot_metrics_relationship(apriori_rule_lift, col1='lift', col2='conviction')

In [None]:
# Same Apriori itemsets, but rules are selected on confidence with a 0.2 threshold.
# NOTE(review): `apripri_rule` looks like a typo for `apriori_rule`; the name is kept as written.
apripri_rule = compute_association_rule(apriori_matrix, metric="confidence", min_thresh=0.2)
apripri_rule.head()

In [None]:
# FP-Growth rules ordered by ascending lift (display only; the sorted frame is not stored).
fp_growth_rule_lift.sort_values('lift')

In [None]:
# Apriori rules ordered by ascending lift, for comparison with the FP-Growth rules.
apriori_rule_lift.sort_values('lift')

In [None]:
# Antecedent column of the Apriori rules (display only).
apriori_rule_lift['antecedents']

## ECLAT

In [None]:
!pip install pyECLAT

In [None]:
# Reshape the purchases into the wide frame handed to ECLAT below: one row per
# (Member_number, Date) group, one column per item position within the basket.
# Each basket is joined into a "; "-separated string and split back into columns.
operations = (
    groceries
    .groupby(['Member_number', 'Date'], as_index=False)
    .agg({'itemDescription': lambda items: "; ".join(items)})
    ['itemDescription']
    .str.split('; ', expand=True)
)
operations

In [None]:
from pyECLAT import ECLAT

# Build the ECLAT miner from the wide basket frame, with verbose output enabled.
eclat_instance = ECLAT(data=operations, verbose=True)

In [None]:
#eclat_instance

In [None]:
# Run ECLAT with the same 0.001 support threshold used for the other algorithms.
# min_combination=1 and max_combination=1 presumably restrict the search to
# combinations of exactly one item.
# NOTE(review): association rules need itemsets of two or more items, so the rule
# cell further down can presumably derive nothing from these supports — confirm
# whether max_combination should be raised.
get_ECLAT_indexes, get_ECLAT_supports = eclat_instance.fit(min_support=0.001,
                                                           min_combination=1,
                                                           max_combination=1,
                                                           separator=', ',
                                                           verbose=True)

In [None]:
# Show the ECLAT support keys, each wrapped in parentheses (display only).
[f"({i})" for i in list(get_ECLAT_supports.keys())]

In [None]:
# Put the ECLAT supports into the layout association_rules consumes: a `support`
# column and an `itemsets` column holding collections of item names.
# Each itemset is a frozenset; a plain string such as "(whole milk)" would be
# iterated character by character and treated as a set of single letters.
# The keys are split on ', ' because that is the separator passed to
# eclat_instance.fit above — presumably it joins the items of a combination;
# verify against pyECLAT if item names can themselves contain ', '.
sup = pd.DataFrame({
    'support': list(get_ECLAT_supports.values()),
    'itemsets': [frozenset(key.split(', ')) for key in get_ECLAT_supports],
})

# NOTE(review): with support_only=True mlxtend presumably evaluates rules on
# support alone, so metric="lift" / min_threshold=1 may not act as a lift filter
# here — confirm against the association_rules documentation.
rules = association_rules(sup, metric="lift", min_threshold=1, support_only=True)

rules

In [None]:
# Inspect the support table built from the ECLAT output.
sup

In [None]:
# Display the ECLAT object itself.
eclat_instance

## CBA

In [None]:
## ARC

In [None]:
!pip install pyarc

In [None]:
from pyarc import CBA
from pyarc.data_structures import TransactionDB
from pyarc.qcba.data_structures import QuantitativeDataFrame
import pandas as pd
from pyarc.qcba import QCBA

In [None]:
from pyarc.qcba.data_structures import (
    IntervalReader,
    Interval,
    QuantitativeDataFrame,
    QuantitativeCAR
)

In [None]:
# Configure a custom IntervalReader: bracket markers, infinity symbol and the
# "_to_" separator between interval members.
# NOTE(review): the meaning of each two-element tuple is defined by pyarc's
# IntervalReader and presumably mirrors how intervals are encoded in the data —
# confirm against the pyarc documentation.
interval_reader = IntervalReader()

interval_reader.closed_bracket = "", "NULL"
interval_reader.open_bracket = "NULL", ""
interval_reader.infinity_symbol = "inf", "inf"
interval_reader.members_separator = "_to_"

# Must run after the settings above are assigned, since it is called once they are all in place.
interval_reader.compile_reader()

# Install the configured reader as the class-level attribute of QuantitativeCAR.
QuantitativeCAR.interval_reader = interval_reader

In [None]:
# Read the listings file once and reuse it for both the features and the target
# (it used to be loaded from disk a second time just to fetch one column).
estate_raw = pd.read_csv("../input/kyiv-real-estate/class_flat.csv")

# Keep only the non-numeric columns, minus the free-text / location fields.
estate = (
    estate_raw
    .select_dtypes(exclude=['int', 'float'])
    .drop(columns=['description', 'city', 'address'])
)

# Re-attach the target column from the raw frame (rows align on the shared index).
estate['bad_proposal'] = estate_raw['bad_proposal']
estate.head()

In [None]:
# Convert the prepared frame into a pyarc TransactionDB and show its first transaction.
txns_train = TransactionDB.from_DataFrame(estate)
txns_train[0]

In [None]:
# Keep a reference to the classifier: the constructor result used to be
# discarded, so the `cba` used below (and in later cells) was never defined.
cba = CBA(support=0.01, confidence=0.01, algorithm="m2")
cba.fit(txns_train)

# Accuracy on the same transactions the model was fitted on.
cba.rule_model_accuracy(txns_train)

In [None]:
# Rules held by the fitted classifier.
cba.clf.rules

In [None]:
# Distinct values returned by predict_probability on the training transactions.
set(cba.predict_probability(txns_train))

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the training transactions, scoring each row with 1 - predict_probability.
# NOTE(review): this assumes predict_probability expresses confidence in the
# negative class, so that 1 - p ranks bad proposals higher — confirm against pyarc.
roc_auc_score(estate['bad_proposal'], 1-np.array(cba.predict_probability(txns_train)))

In [None]:
# Accuracy measured on txns_train, the same transactions passed to cba.fit.
print("CBA accuracy:", cba.rule_model_accuracy(txns_train))

## cspade

In [None]:
!pip install Cython pycspade

In [None]:
from pycspade.helpers import spade, print_result

In [None]:
# Map every distinct item description to an integer code (its position in the
# array returned by unique()).
positions_dict = {
    item_name: item_code
    for item_code, item_name in enumerate(groceries['itemDescription'].unique())
}

In [None]:
# Re-express every basket as a list of integer item codes.
all_transactions_num = [
    [positions_dict[item_name] for item_name in basket]
    for basket in all_transactions
]

In [None]:
# Write one basket per line, item codes separated by single spaces.
# Each code is converted on its own: str() on the whole basket would produce
# "[1, 2, 3]", and joining that string inserts a space between every character.
with open("trans.txt", "w") as f:
    f.write("\n".join(
        " ".join(str(item_code) for item_code in basket)
        for basket in all_transactions_num
    ))

In [None]:
# Every basket becomes one row whose first two fields are both its position in
# all_transactions_num, followed by the basket's item codes.
# NOTE(review): pycspade rows are presumably [sequence_id, event_id, items]; with
# a single event per sequence no multi-step patterns can emerge — confirm whether
# the sequence id should be the member and the event id the purchase date.
data = [
    [row_id, row_id, basket]
    for row_id, basket in enumerate(all_transactions_num)
]

result = spade(data=data, support=0.001)
print_result(result)