In [2]:
import pandas as pd
import numpy as np
import time

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Part 1

In [3]:
DATASETS = ['Iris', 'Breast Cancer', 'Wine', 'Heart Failure Clinical Records',
            'Zoo', 'Adult', 'Car Evaluation', 'Bank Marketing']

IRIS_ATTRIBUTES = []
with open('datasets/Iris/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            IRIS_ATTRIBUTES.append(i.strip())

IRIS_CONFIG = {
    'min_support': 0.1,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

BREAST_CANCER_ATTRIBUTES = []
with open('datasets/Breast Cancer/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            BREAST_CANCER_ATTRIBUTES.append(i.strip())

BREAST_CANCER_CONFIG = {
    'min_support': 0.1,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

WINE_ATTRIBUTES = []
with open('datasets/Wine/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            WINE_ATTRIBUTES.append(i.strip())

WINE_CONFIG = {
    'min_support': 0.08,
    'metric': 'confidence',
    'min_threshold': 0.7,
}

HEART_FAILURE_ATTRIBUTES = []
with open('datasets/Heart Failure Clinical Records/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            HEART_FAILURE_ATTRIBUTES.append(i.strip())

HEART_FAILURE_CONFIG = {
    'min_support': 0.05,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

ZOO_ATTRIBUTES = []
with open('datasets/Zoo/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            ZOO_ATTRIBUTES.append(i.strip())

ZOO_CONFIG = {
    'min_support': 0.01,
    'metric': 'confidence',
    'min_threshold': 0.8,
}

ADULT_ATTRIBUTES = []
with open('datasets/Adult/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            ADULT_ATTRIBUTES.append(i.strip())

ADULT_CONFIG = {
    'min_support': 0.01,
    'metric': 'confidence',
    'min_threshold': 0.3,
}

CAR_ATTRIBUTES = []
with open('datasets/Car Evaluation/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            CAR_ATTRIBUTES.append(i.strip())

CAR_CONFIG = {
    'min_support': 0.01,
    'metric': 'confidence',
    'min_threshold': 0.3,
}

BANK_ATTRIBUTES = []
with open('datasets/Bank Marketing/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            BANK_ATTRIBUTES.append(i.strip())

BANK_CONFIG = {
    'min_support': 0.05,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

DATASET_ATTRIBUTES = [IRIS_ATTRIBUTES, BREAST_CANCER_ATTRIBUTES, WINE_ATTRIBUTES,
                      HEART_FAILURE_ATTRIBUTES, ZOO_ATTRIBUTES, ADULT_ATTRIBUTES, CAR_ATTRIBUTES, BANK_ATTRIBUTES]
CONFIGS = [IRIS_CONFIG, BREAST_CANCER_CONFIG, WINE_CONFIG,
           HEART_FAILURE_CONFIG, ZOO_CONFIG, ADULT_CONFIG, CAR_CONFIG, BANK_CONFIG]

SELECT_DATASET_INDEX = 0

In [4]:
# Read Dataset
raw_data = pd.read_csv('datasets\\' + DATASETS[SELECT_DATASET_INDEX] + '\\data.csv', index_col=False)
raw_data = pd.DataFrame(raw_data)
raw_data.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Split attributes and class
dataset = raw_data[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX]]

In [6]:
# Apply discretization on continuous values
def find_bin_thresholds(original_dataset, splits=10):
  bin_thresh = {}

  for col in original_dataset.columns:
    if original_dataset[col].dtype in ['int64', 'float64']:
      bin_thresh_unfiltered_list = []
      all_row_values = np.array(original_dataset[col].tolist())
      for i in range(splits - 1):
        bin_thresh_unfiltered_list.append(
            np.percentile(all_row_values, (i + 1) * 10))
      bin_thresh[col] = np.unique(bin_thresh_unfiltered_list)

  return bin_thresh

def apply_discretization(original_dataset, bin_thresh=None):
  discretized_dataset = original_dataset.copy()

  if bin_thresh == None:
    bin_thresh = find_bin_thresholds(discretized_dataset) 

  # Put continuous values into bins
  for colname, references in bin_thresh.items():
    col_in_numpy_arr = discretized_dataset[colname].to_numpy()
    col_with_bin_no = np.digitize(col_in_numpy_arr, bins=references)
    col_with_bin_no = np.array(['{}'.format(val)
                              for val in col_with_bin_no])
    discretized_dataset[colname] = col_with_bin_no

  # Add column name to value for later use
  for colname, _ in discretized_dataset.items():
    discretized_dataset[colname] = discretized_dataset[colname].apply(
        lambda x: "{}@@@{}".format(colname, x))

  return bin_thresh, discretized_dataset

bin_thresh, dataset = apply_discretization(dataset)

print(bin_thresh)

dataset.head()

{'sepal length in cm': array([4.8 , 5.  , 5.27, 5.6 , 5.8 , 6.1 , 6.3 , 6.52, 6.9 ]), 'petal length in cm': array([1.4 , 1.5 , 1.7 , 3.9 , 4.35, 4.64, 5.  , 5.32, 5.8 ]), 'petal width in cm': array([0.2 , 0.4 , 1.16, 1.3 , 1.5 , 1.8 , 1.9 , 2.2 ])}


Unnamed: 0,sepal length in cm,petal length in cm,petal width in cm,class
0,sepal length in cm@@@2,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa
1,sepal length in cm@@@1,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa
2,sepal length in cm@@@0,petal length in cm@@@0,petal width in cm@@@1,class@@@Iris-setosa
3,sepal length in cm@@@0,petal length in cm@@@2,petal width in cm@@@1,class@@@Iris-setosa
4,sepal length in cm@@@2,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa


In [7]:
# Get frequent itemsets
start = time.process_time()
dataset_list = dataset.values.tolist()

trans_enc = TransactionEncoder()
trans_enc_ary = trans_enc.fit(dataset_list).transform(dataset_list)
encoded_dataset = pd.DataFrame(trans_enc_ary, columns=trans_enc.columns_)

freq_items = apriori(
    encoded_dataset, min_support=CONFIGS[SELECT_DATASET_INDEX]['min_support'], use_colnames=True)
freq_items
    

Unnamed: 0,support,itemsets
0,0.333333,(class@@@Iris-setosa)
1,0.333333,(class@@@Iris-versicolor)
2,0.333333,(class@@@Iris-virginica)
3,0.14,(petal length in cm@@@2)
4,0.113333,(petal length in cm@@@4)
5,0.1,(petal length in cm@@@5)
6,0.106667,(petal length in cm@@@7)
7,0.106667,(petal length in cm@@@9)
8,0.233333,(petal width in cm@@@1)
9,0.126667,(petal width in cm@@@2)


In [8]:
# Get association rules
rules = association_rules(
    freq_items, metric=CONFIGS[SELECT_DATASET_INDEX]['metric'], min_threshold=CONFIGS[SELECT_DATASET_INDEX]['min_threshold'])
rules = rules.sort_values(
    ['support', 'confidence'], ascending=[False, False])
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(petal width in cm@@@1),(class@@@Iris-setosa),0.233333,0.333333,0.233333,1.0,3.0,0.155556,inf
1,(class@@@Iris-setosa),(petal width in cm@@@1),0.333333,0.233333,0.233333,0.7,3.0,0.155556,2.555556
0,(petal length in cm@@@2),(class@@@Iris-setosa),0.14,0.333333,0.14,1.0,3.0,0.093333,inf
5,(petal width in cm@@@4),(class@@@Iris-versicolor),0.14,0.333333,0.133333,0.952381,2.857143,0.086667,14.0
3,(sepal length in cm@@@2),(class@@@Iris-setosa),0.153333,0.333333,0.126667,0.826087,2.478261,0.075556,3.833333
4,(petal length in cm@@@4),(class@@@Iris-versicolor),0.113333,0.333333,0.113333,1.0,3.0,0.075556,inf
7,(petal width in cm@@@7),(class@@@Iris-virginica),0.113333,0.333333,0.113333,1.0,3.0,0.075556,inf
8,(petal width in cm@@@8),(class@@@Iris-virginica),0.113333,0.333333,0.113333,1.0,3.0,0.075556,inf
6,(petal length in cm@@@9),(class@@@Iris-virginica),0.106667,0.333333,0.106667,1.0,3.0,0.071111,inf
10,(sepal length in cm@@@9),(class@@@Iris-virginica),0.113333,0.333333,0.1,0.882353,2.647059,0.062222,5.666667


In [9]:
# Filter rules to ensure consequent is the pred class
def remove_unnecessary_rules(rule):
    if rule.__len__() == 1:
        return list(rule)[0].split("@@@")[0] == DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]
    else:
        return False

filtered_rules = rules.copy()
filtered_rules = filtered_rules[filtered_rules['consequents'].apply(
    remove_unnecessary_rules) == True]

# Check if every class is covered
unique_classes_in_rules = np.unique(
    filtered_rules['consequents'].apply(lambda x: list(x)[0].split("@@@")[1]))
unique_classes_in_raw_data = np.unique(
    raw_data[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]])

print(unique_classes_in_rules, unique_classes_in_raw_data)

filtered_rules

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica'] ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(petal width in cm@@@1),(class@@@Iris-setosa),0.233333,0.333333,0.233333,1.0,3.0,0.155556,inf
0,(petal length in cm@@@2),(class@@@Iris-setosa),0.14,0.333333,0.14,1.0,3.0,0.093333,inf
5,(petal width in cm@@@4),(class@@@Iris-versicolor),0.14,0.333333,0.133333,0.952381,2.857143,0.086667,14.0
3,(sepal length in cm@@@2),(class@@@Iris-setosa),0.153333,0.333333,0.126667,0.826087,2.478261,0.075556,3.833333
4,(petal length in cm@@@4),(class@@@Iris-versicolor),0.113333,0.333333,0.113333,1.0,3.0,0.075556,inf
7,(petal width in cm@@@7),(class@@@Iris-virginica),0.113333,0.333333,0.113333,1.0,3.0,0.075556,inf
8,(petal width in cm@@@8),(class@@@Iris-virginica),0.113333,0.333333,0.113333,1.0,3.0,0.075556,inf
6,(petal length in cm@@@9),(class@@@Iris-virginica),0.106667,0.333333,0.106667,1.0,3.0,0.071111,inf
10,(sepal length in cm@@@9),(class@@@Iris-virginica),0.113333,0.333333,0.1,0.882353,2.647059,0.062222,5.666667
9,(sepal length in cm@@@7),(class@@@Iris-virginica),0.14,0.333333,0.1,0.714286,2.142857,0.053333,2.333333


In [10]:
# Create classifiers from rules
rule_dict = []

index = 0
for _, rule in filtered_rules.iterrows():
    antecedents = list(rule['antecedents'])
    antecedents_dict = {}
    for antecedent in antecedents:
      colname, colvalue = antecedent.split("@@@")
      antecedents_dict[colname] = colvalue

    colname, colvalue = list(rule['consequents'])[0].split("@@@")
    consequents_dict = { colname: colvalue }
    
    rule_dict.append(
        {'index': index, 'antecedents': antecedents_dict, 'consequents': consequents_dict})
    index += 1

display(pd.DataFrame(rule_dict))


Unnamed: 0,index,antecedents,consequents
0,0,{'petal width in cm': '1'},{'class': 'Iris-setosa'}
1,1,{'petal length in cm': '2'},{'class': 'Iris-setosa'}
2,2,{'petal width in cm': '4'},{'class': 'Iris-versicolor'}
3,3,{'sepal length in cm': '2'},{'class': 'Iris-setosa'}
4,4,{'petal length in cm': '4'},{'class': 'Iris-versicolor'}
5,5,{'petal width in cm': '7'},{'class': 'Iris-virginica'}
6,6,{'petal width in cm': '8'},{'class': 'Iris-virginica'}
7,7,{'petal length in cm': '9'},{'class': 'Iris-virginica'}
8,8,{'sepal length in cm': '9'},{'class': 'Iris-virginica'}
9,9,{'sepal length in cm': '7'},{'class': 'Iris-virginica'}


In [11]:
training_set = raw_data[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX]].copy()
_, training_set = apply_discretization(training_set, bin_thresh)
training_set

Unnamed: 0,sepal length in cm,petal length in cm,petal width in cm,class
0,sepal length in cm@@@2,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa
1,sepal length in cm@@@1,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa
2,sepal length in cm@@@0,petal length in cm@@@0,petal width in cm@@@1,class@@@Iris-setosa
3,sepal length in cm@@@0,petal length in cm@@@2,petal width in cm@@@1,class@@@Iris-setosa
4,sepal length in cm@@@2,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa
...,...,...,...,...
145,sepal length in cm@@@8,petal length in cm@@@7,petal width in cm@@@8,class@@@Iris-virginica
146,sepal length in cm@@@7,petal length in cm@@@7,petal width in cm@@@7,class@@@Iris-virginica
147,sepal length in cm@@@7,petal length in cm@@@7,petal width in cm@@@7,class@@@Iris-virginica
148,sepal length in cm@@@6,petal length in cm@@@8,petal width in cm@@@8,class@@@Iris-virginica


In [12]:
def run_cba_cb(dataset, rule_dict, class_name):
    dataset_copy = dataset.copy()
    classifier = []
    default_class = None
    debug_print = False

    for rule in rule_dict:
        temp = []
        marked = False
        matching_count_for_rule = 0
        no_of_errors_for_rule = 0

        if debug_print:
            print(
                "Checking rule {} -> {}".format(rule['antecedents'], rule['consequents']))

        for row in dataset_copy.iterrows():
            # Check antecedents
            matching_antecedents = True
            for antecedent_name, antecedent_value in rule['antecedents'].items():
                row_antecedent_name, row_antecedent_value = row[1][antecedent_name].split(
                    "@@@")
                if antecedent_value.strip() != row_antecedent_value.strip():
                    matching_antecedents = False

            # Check consequents
            matching_consequents = True
            for consequents_name, consequents_value in rule['consequents'].items():
                row_consequents_name, row_consequents_value = row[1][consequents_name].split(
                    "@@@")
                if consequents_value.strip() != row_consequents_value.strip():
                    matching_consequents = False

            # Store d.id in temp and mark r if it correctly classifies d
            if matching_antecedents and matching_consequents:
                temp.append(row[0])
                marked = True

            # Log matching and error counts for r
            if matching_antecedents:
                matching_count_for_rule += 1
                if not matching_consequents:
                    no_of_errors_for_rule += 1

            # Record guess in actual dataset
            dataset.at[row[0],
                       'guess'] = consequents_value
        if debug_print:
            print('No of matching rows: ', matching_count_for_rule)
            print('No of errors: ', no_of_errors_for_rule)
            print('Marked: ', marked)

        if marked:
            # Delete all the cases with the ids in temp from D
            for id in temp:
                dataset_copy = dataset_copy.drop(id)

            # Selecting a default class for the current C
            highest_count = 0
            highest_count_class = None
            remaining_item_count = 0
            for col, value in dataset_copy[class_name].value_counts().items():
                if value > highest_count:
                    highest_count = value
                    highest_count_class = col
                remaining_item_count += value

            default_class = (highest_count_class, highest_count)

            total_rule_errors = no_of_errors_for_rule
            for classifier_rule in classifier:
                total_rule_errors += classifier_rule['rule_errors']

            # Compute the total number of errors of C
            current_error_count = total_rule_errors + \
                (remaining_item_count - highest_count)

            if debug_print:
                print('Default class: ', default_class)
                print('Remaining rows: ', remaining_item_count)
                print('Total rule errors in C: ', total_rule_errors)
                print('Total no of errors of C: ', current_error_count)

            # Insert r at the end of C
            rule['index'] = len(classifier)
            rule['rule_errors'] = no_of_errors_for_rule
            rule['total_errors'] = no_of_errors_for_rule + \
                (remaining_item_count - highest_count)
            classifier.append(rule)

    # Discard rules in C that do not improve the accuracy of the classifier
    lowest_total_errors_index = min(
        classifier, key=lambda x: x['total_errors'])['index']
    classifier = classifier[:lowest_total_errors_index + 1]

    # Append default class
    default_class_rule = {}
    default_class_rule['index'] = len(classifier)
    default_class_rule['consequents'] = {
        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]: default_class[0].split("@@@")[1]}
    classifier.append(default_class_rule)

    dataset.fillna(default_class[0].split("@@@")[1])

    return classifier


classifier = run_cba_cb(training_set, rule_dict,
                        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1])
end = time.process_time()
print('Time elapsed:', round(end - start, 2), '(s)')
display(pd.DataFrame(classifier))
display(training_set)


Time elapsed: 0.16 (s)


Unnamed: 0,index,antecedents,consequents,rule_errors,total_errors
0,0,{'petal width in cm': '1'},{'class': 'Iris-setosa'},0.0,65.0
1,1,{'petal length in cm': '2'},{'class': 'Iris-setosa'},0.0,56.0
2,2,{'petal width in cm': '4'},{'class': 'Iris-versicolor'},1.0,37.0
3,3,{'sepal length in cm': '2'},{'class': 'Iris-setosa'},3.0,37.0
4,4,{'petal length in cm': '4'},{'class': 'Iris-versicolor'},0.0,27.0
5,5,{'petal width in cm': '7'},{'class': 'Iris-virginica'},0.0,27.0
6,6,{'petal width in cm': '8'},{'class': 'Iris-virginica'},0.0,20.0
7,7,{'petal length in cm': '9'},{'class': 'Iris-virginica'},0.0,16.0
8,8,{'sepal length in cm': '7'},{'class': 'Iris-virginica'},4.0,15.0
9,9,,{'class': 'Iris-versicolor'},,


Unnamed: 0,sepal length in cm,petal length in cm,petal width in cm,class,guess
0,sepal length in cm@@@2,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa,Iris-setosa
1,sepal length in cm@@@1,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa,Iris-setosa
2,sepal length in cm@@@0,petal length in cm@@@0,petal width in cm@@@1,class@@@Iris-setosa,Iris-setosa
3,sepal length in cm@@@0,petal length in cm@@@2,petal width in cm@@@1,class@@@Iris-setosa,Iris-setosa
4,sepal length in cm@@@2,petal length in cm@@@1,petal width in cm@@@1,class@@@Iris-setosa,Iris-setosa
...,...,...,...,...,...
145,sepal length in cm@@@8,petal length in cm@@@7,petal width in cm@@@8,class@@@Iris-virginica,Iris-virginica
146,sepal length in cm@@@7,petal length in cm@@@7,petal width in cm@@@7,class@@@Iris-virginica,Iris-virginica
147,sepal length in cm@@@7,petal length in cm@@@7,petal width in cm@@@7,class@@@Iris-virginica,Iris-virginica
148,sepal length in cm@@@6,petal length in cm@@@8,petal width in cm@@@8,class@@@Iris-virginica,Iris-virginica


# Part 2

In [13]:
# Check classifier accuracy
correct_guesses = len(training_set[training_set['guess'] ==
                                   training_set[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]].apply(lambda x: x.split("@@@")[1])])
accuracy = correct_guesses / len(training_set)

print('Accuracy:', round(accuracy, 2))


Accuracy: 0.82



|Dataset|Number of Instances|Number of Instances  after cleaning|Categorical Attributes|Numerical Attributes|Predictions|
|-------|-------------------|-----------------------------------|----------------------|--------------------|-----------|
Iris|150|150|class|sepal length in cm, sepal width in cm, petal length in cm, petal width in cm|Predicts the class of the Iris
Breast Cancer|286|277|class, age, menopause, tumor-size,inv-nodes, node-caps, breast, breast-quad, irradiat|deg-malig|Predicts for relapse of Breast Cancer
Wine|178|178|type|alcohol, malic_acid, ash, alcalinity, magnesium, total_phenols, flavanoids, nonflavanoid+phenols, proanthocyanins, color_inten, hue, od280-od315, proline|Predicts the wine type based on its chemical contents
Heart Failure Clinical Records|299|299| diabetes, high_blood_pressure, sex, smoking, DEATH_EVENT|age, anaemia, creatinine_phosphokinase, ejection_fraction, platelets, serum_creatinine, serum_sodium, time|Predicts the the chance of death due to heart failure based on health parameters
Zoo|100|99|animal_name, hair, feathers, eggs, milk, airborne, aquatic, predator, toothed, backbone, breathes, venomous, fins, tail, legs, domestic, catsize,type|-|Predicts the animal type based on its characteristics.
Adults|32560|30161|workclass, education, education_num, marital_status, occupation, relationship, race, sex, native_country, incomeage, fnlwgt,capital_gain, capital_loss, hours_per_week|Predicts the income of an adult based on their different characteristics, background, finances and education
Car Evaluation|1727|1727|buying, maintenance, doors, person, lug_boot, safety, class|-|Predicts the class of car based its different characteristics and needs. 
Bank Marketing|45211|44923|job, marital, education, default, housing, month, loan, contact, poutcome|age, balance, day, duration, campaign, pdays, previous|Predicts the chance of client subscribing to a term deposit based on their details.



|Dataset|Min Support|Min Confidence|No. of Rules|Runtime (s)|Accuracy Score|
|-------|-----------|--------------|------------|-----------|--------------|
Iris|0.1|0.5|10|0.19|0.82
Breast Cancer|0.1|0.5|6|0.30|0.86
Wine|0.08|0.7|6|0.20|0.75
Heart Failure Clinical Records|0.05|0.5|7|1.06|0.85
Zoo|0.05|0.5|7|1.53|0.91
Adults|0.05|0.5|3|158.11|0.99
Car Evaluation|0.01|0.3|11|4.00|0.95
Bank Marketing|0.05|0.5|9|466.95|0.88



# Part 3

|Dataset|Decision Tree Accuracy/Fscore|Random Forest Accuracy/Fscore|K-Nearest-Neighbour Accuracy/Fscore|Support Vector Machines Accuracy/Fscore|Neural Network Accuracy/Fscore|
|-------|-----------------------------|-----------------------------|-----------------------------------|---------------------------------------|------------------------------|
Iris|0.95/0.95|0.92/0.92|1.0/1.0|0.95/0.95|1.0/1.0
Breast Cancer|0.77/0.77|0.74/0.75|0.83/0.82|0.83/0.81|0.83/0.82
Wine|0.87/0.87|0.87/0.87|0.84/0.85|0.73/0.74|0.89/0.89
Heart Failure Clinical Records|0.71/0.45|0.77/0.48|0.71/0.15|0.69/0.59|0.76/0.71
Zoo|0.76/0.74|0.72/0.69|0.72/0.67|0.72/0.63|0.72/0.74
Adults|0.84 / 0.82|0.83/0.82|0.84 / 0.83|0.79/0.75|0.79/0.75
Car Evaluation|0.81/0.79|0.81/0.79|0.82/0.80|0.81/0.79|0.81/0.79
Bank Marketing|0.86/0.87|0.90/0.90|0.89/0.87|0.89/0.84|0.88/0.86



# Part 4

### Bootstrap Algorithm

In [14]:
def bootstraps(data, n_elements):
    #initialize output dictionary & unique value count
    dc   = {}
    unip = 0

    #get sample size
    b_size = data.shape[0]
    #get list of row indexes
    idx = [i for i in range(b_size)]
    #loop through the required number of bootstraps
    for b in range(n_elements):
        #obtain boostrap samples with replacement
        sidx   = np.random.choice(idx,replace=True,size=b_size)
        b_samp = []
        
        for x in sidx:
            b_samp.append(x)

        #b_samp = data[sidx,:]
        #compute number of unique values contained in the bootstrap sample
        unip  += len(set(sidx))
        #obtain out-of-bag samples for the current b
        oidx   = list(set(idx) - set(sidx))
        o_samp = []

        if oidx:
            for y in oidx:
                o_samp.append(y)
                
        #store results
        dc['boot_'+str(b)] = {'boot':b_samp,'test':o_samp}
    #return the bootstrap results
    return(dc)

 <font size = '5'> 1. Extraction of sub-datasets from the train dataset which will be used for ensembling later </font>
 <br> &emsp; a. Selection of the sub-datasets will be through numpy.random.choice on the index.
 <br> &emsp; b. Stacking of selection is allowed as all sub-datasets are independent.


In [15]:
NELEMENT = 2
sampleData = bootstraps(dataset, NELEMENT)

In [16]:
def convertData(sampleData, b):
    ttData = {}
    trainingIndex = sampleData['boot_'+str(b)]['boot']
    testingIndex = sampleData['boot_'+str(b)]['test']

    trainingDf = pd.DataFrame()
    testingDf = pd.DataFrame()

    for x in trainingIndex:
        trainingDf = trainingDf.append(dataset.iloc[x])

    for x in testingIndex:
        testingDf = testingDf.append(dataset.iloc[x])

    trainingDf = trainingDf.reset_index()
    trainingDf = trainingDf.drop(columns='index')

    testingDf = testingDf.reset_index()
    testingDf = testingDf.drop(columns='index')

    ttData['boot_'+str(b)] = {'training':trainingDf,'testing':testingDf}

    return(ttData)

In [17]:
oData = []
for x in range(NELEMENT):
    oData.append(convertData(sampleData, x))


  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])


<font size = '5'> 2. Run the CBA-CB classifier model on all the sub-datasets </font>


In [18]:
trainingDf = {}
testingDf = {}
for x in range(NELEMENT):
    trainingDf[x] = run_cba_cb(oData[x]['boot_'+str(x)]['training'], rule_dict,
                        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1])
    display(trainingDf[x])

    testingDf[x] = run_cba_cb(oData[x]['boot_'+str(x)]['testing'], rule_dict,
                        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1])
    display(trainingDf[x])


[{'index': 0,
  'antecedents': {'petal width in cm': '1'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 63},
 {'index': 1,
  'antecedents': {'petal length in cm': '2'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 49},
 {'index': 2,
  'antecedents': {'petal width in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 0,
  'total_errors': 38},
 {'index': 3,
  'antecedents': {'sepal length in cm': '2'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 3,
  'total_errors': 40},
 {'index': 4,
  'antecedents': {'petal length in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 0,
  'total_errors': 34},
 {'index': 5,
  'antecedents': {'petal width in cm': '7'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_errors': 0,
  'total_errors': 34},
 {'index': 6,
  'antecedents': {'petal width in cm': '8'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_err

[{'index': 0,
  'antecedents': {'petal width in cm': '1'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 23},
 {'index': 1,
  'antecedents': {'petal length in cm': '2'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 22},
 {'index': 2,
  'antecedents': {'petal width in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 1,
  'total_errors': 15},
 {'index': 3,
  'antecedents': {'sepal length in cm': '2'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 2,
  'total_errors': 15},
 {'index': 4,
  'antecedents': {'petal length in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 0,
  'total_errors': 9},
 {'index': 5,
  'antecedents': {'petal width in cm': '7'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_errors': 0,
  'total_errors': 9},
 {'index': 6,
  'antecedents': {'petal width in cm': '8'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_error

[{'index': 0,
  'antecedents': {'petal width in cm': '1'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 52},
 {'index': 1,
  'antecedents': {'petal length in cm': '2'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 46},
 {'index': 2,
  'antecedents': {'petal width in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 0,
  'total_errors': 29},
 {'index': 3,
  'antecedents': {'petal length in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 0,
  'total_errors': 23},
 {'index': 4,
  'antecedents': {'petal width in cm': '7'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_errors': 0,
  'total_errors': 23},
 {'index': 5,
  'antecedents': {'petal width in cm': '8'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_errors': 0,
  'total_errors': 19},
 {'index': 6,
  'antecedents': {'petal length in cm': '9'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_

[{'index': 0,
  'antecedents': {'petal width in cm': '1'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 24},
 {'index': 1,
  'antecedents': {'petal length in cm': '2'},
  'consequents': {'class': 'Iris-setosa'},
  'rule_errors': 0,
  'total_errors': 19},
 {'index': 2,
  'antecedents': {'petal width in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 1,
  'total_errors': 20},
 {'index': 4,
  'antecedents': {'petal length in cm': '4'},
  'consequents': {'class': 'Iris-versicolor'},
  'rule_errors': 0,
  'total_errors': 13},
 {'index': 5,
  'antecedents': {'petal width in cm': '7'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_errors': 0,
  'total_errors': 11},
 {'index': 6,
  'antecedents': {'petal width in cm': '8'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_errors': 0,
  'total_errors': 7},
 {'index': 6,
  'antecedents': {'petal length in cm': '9'},
  'consequents': {'class': 'Iris-virginica'},
  'rule_e

<font size= '5'> 3. All prediction model results will be ensemble to output the final model accuracy. </font>

In [19]:
oAccuracy = 0
for x in range(NELEMENT):

    trainingDf = oData[x]['boot_'+str(x)]['training']
    correct_guesses = len(trainingDf[trainingDf['guess'] ==
                                    trainingDf[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]].apply(lambda x: x.split("@@@")[1])])

    testingDf = oData[x]['boot_'+str(x)]['testing']
    correct_guesses2 = len(testingDf[testingDf['guess'] ==
                                    testingDf[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]].apply(lambda x: x.split("@@@")[1])])

    accuracy = correct_guesses / len(trainingDf)
    accuracy2 = correct_guesses2 / len(testingDf)
    oAccuracy += (accuracy + accuracy2)

print('Accuracy:', round(oAccuracy/(NELEMENT*2), 2))

Accuracy: 0.81


### Comparision of Result before & after Bagging Ensemble Application 


| |CBA-CB|CBA-CB|CBA-CB with Bagging Ensemble |CBA-CB with Bagging Ensemble | | |
|-------|---|--|-------------|---------------|----------------|----------|
|Dataset|Runtime (s)|Accuracy Score|Runtime (s)|Accuracy Score|N_elements|Setting|Improvement|
Iris|0.94|0.75|1.12|0.82|2| Lower
Breast Cancer|0.30|0.86|0.94|0.86|3| No Changes
Wine|1.26|0.75|0.97|0.76|2| Higher
Heart Failure Clinical Records|1.06|0.85|0.64|0.86|5| Higher
Zoo|1.53|0.95|0.57|0.95|10| No Changes
Adults|158.11|0.99|207|0.99|10| No Changes
Car Evaluation|4.00|0.95|4.95|0.94|3| Lower
Bank Marketing|466.95|0.88|800.55|0.89|5| Higher

