In [1]:
import pandas as pd
import numpy as np
import time
import sklearn
import copy

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
DATASETS = ['Iris', 'Breast Cancer', 'Wine', 'Heart Failure Clinical Records',
            'Zoo', 'Adult', 'Car Evaluation', 'Bank Marketing']

IRIS_ATTRIBUTES = []
with open('datasets/Iris/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            IRIS_ATTRIBUTES.append(i.strip())

IRIS_CONFIG = {
    'min_support': 0.1,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

BREAST_CANCER_ATTRIBUTES = []
with open('datasets/Breast Cancer/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            BREAST_CANCER_ATTRIBUTES.append(i.strip())

BREAST_CANCER_CONFIG = {
    'min_support': 0.1,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

WINE_ATTRIBUTES = []
with open('datasets/Wine/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            WINE_ATTRIBUTES.append(i.strip())

WINE_CONFIG = {
    'min_support': 0.08,
    'metric': 'confidence',
    'min_threshold': 0.7,
}

HEART_FAILURE_ATTRIBUTES = []
with open('datasets/Heart Failure Clinical Records/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            HEART_FAILURE_ATTRIBUTES.append(i.strip())

HEART_FAILURE_CONFIG = {
    'min_support': 0.05,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

ZOO_ATTRIBUTES = []
with open('datasets/Zoo/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            ZOO_ATTRIBUTES.append(i.strip())

ZOO_CONFIG = {
    'min_support': 0.05,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

ADULT_ATTRIBUTES = []
with open('datasets/Adult/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            ADULT_ATTRIBUTES.append(i.strip())

ADULT_CONFIG = {
    'min_support': 0.05,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

CAR_ATTRIBUTES = []
with open('datasets/Car Evaluation/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            CAR_ATTRIBUTES.append(i.strip())

CAR_CONFIG = {
    'min_support': 0.05,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

BANK_ATTRIBUTES = []
with open('datasets/Bank Marketing/cleaned_attributes.txt') as f:
    for line in f:
        for i in line[1:-1].replace('\'', "").split(','):
            BANK_ATTRIBUTES.append(i.strip())

BANK_CONFIG = {
    'min_support': 0.05,
    'metric': 'confidence',
    'min_threshold': 0.5,
}

DATASET_ATTRIBUTES = [IRIS_ATTRIBUTES, BREAST_CANCER_ATTRIBUTES, WINE_ATTRIBUTES,
                      HEART_FAILURE_ATTRIBUTES, ZOO_ATTRIBUTES, ADULT_ATTRIBUTES, CAR_ATTRIBUTES, BANK_ATTRIBUTES]
CONFIGS = [IRIS_CONFIG, BREAST_CANCER_CONFIG, WINE_CONFIG,
           HEART_FAILURE_CONFIG, ZOO_CONFIG, ADULT_CONFIG, CAR_CONFIG, BANK_CONFIG]

SELECT_DATASET_INDEX = 2

NELEMENT = 2

In [3]:
# Read Dataset
raw_data = pd.read_csv('datasets\\' + DATASETS[SELECT_DATASET_INDEX] + '\\data.csv', index_col=False)
raw_data = pd.DataFrame(raw_data)
raw_data.head()

Unnamed: 0,type,alcohol,malic_acid,ash,alcalinity,magnesium,total_phenols,flavanoids,nonflavanoid+phenols,proanthocyanins,color_inten,hue,od280-od315,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
# Split attributes and class
dataset = raw_data[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX]]

In [5]:
# Apply discretization on continuous values
def find_bin_thresholds(original_dataset, splits=10):
  bin_thresh = {}

  for col in original_dataset.columns:
    if original_dataset[col].dtype in ['int64', 'float64']:
      bin_thresh_unfiltered_list = []
      all_row_values = np.array(original_dataset[col].tolist())
      for i in range(splits - 1):
        bin_thresh_unfiltered_list.append(
            np.percentile(all_row_values, (i + 1) * 10))
      bin_thresh[col] = np.unique(bin_thresh_unfiltered_list)

  return bin_thresh

def apply_discretization(original_dataset, bin_thresh=None):
  discretized_dataset = original_dataset.copy()

  if bin_thresh == None:
    bin_thresh = find_bin_thresholds(discretized_dataset) 

  # Put continuous values into bins
  for colname, references in bin_thresh.items():
    col_in_numpy_arr = discretized_dataset[colname].to_numpy()
    col_with_bin_no = np.digitize(col_in_numpy_arr, bins=references)
    col_with_bin_no = np.array(['{}'.format(val)
                              for val in col_with_bin_no])
    discretized_dataset[colname] = col_with_bin_no

  # Add column name to value for later use
  for colname, _ in discretized_dataset.items():
    discretized_dataset[colname] = discretized_dataset[colname].apply(
        lambda x: "{}@@@{}".format(colname, x))

  return bin_thresh, discretized_dataset

bin_thresh, dataset = apply_discretization(dataset)

print(bin_thresh)

dataset.head()

{'alcalinity': array([16.  , 16.8 , 18.  , 18.58, 19.5 , 20.  , 21.  , 22.  , 24.  ]), 'malic_acid': array([1.247, 1.51 , 1.651, 1.73 , 1.865, 2.134, 2.679, 3.406, 3.983]), 'nonflavanoid+phenols': array([0.217, 0.26 , 0.28 , 0.3  , 0.34 , 0.39 , 0.43 , 0.48 , 0.53 ]), 'color_inten': array([2.549, 2.916, 3.4  , 4.08 , 4.69 , 5.284, 5.745, 6.99 , 8.53 ]), 'type': array([1., 2., 3.])}


Unnamed: 0,alcalinity,malic_acid,nonflavanoid+phenols,color_inten,type
0,alcalinity@@@0,malic_acid@@@3,nonflavanoid+phenols@@@3,color_inten@@@6,type@@@1
1,alcalinity@@@0,malic_acid@@@4,nonflavanoid+phenols@@@2,color_inten@@@4,type@@@1
2,alcalinity@@@4,malic_acid@@@6,nonflavanoid+phenols@@@4,color_inten@@@6,type@@@1
3,alcalinity@@@2,malic_acid@@@5,nonflavanoid+phenols@@@1,color_inten@@@8,type@@@1
4,alcalinity@@@7,malic_acid@@@6,nonflavanoid+phenols@@@6,color_inten@@@4,type@@@1


In [6]:
# Get frequent itemsets
start = time.process_time()
dataset_list = dataset.values.tolist()

trans_enc = TransactionEncoder()
trans_enc_ary = trans_enc.fit(dataset_list).transform(dataset_list)
encoded_dataset = pd.DataFrame(trans_enc_ary, columns=trans_enc.columns_)

freq_items = apriori(
    encoded_dataset, min_support=CONFIGS[SELECT_DATASET_INDEX]['min_support'], use_colnames=True)
freq_items
    

Unnamed: 0,support,itemsets
0,0.095506,(alcalinity@@@0)
1,0.101124,(alcalinity@@@1)
2,0.101124,(alcalinity@@@2)
3,0.101124,(alcalinity@@@3)
4,0.095506,(alcalinity@@@4)
5,0.123596,(alcalinity@@@6)
6,0.11236,(alcalinity@@@7)
7,0.11236,(alcalinity@@@8)
8,0.11236,(alcalinity@@@9)
9,0.101124,(color_inten@@@0)


In [7]:
# Get association rules
rules = association_rules(
    freq_items, metric=CONFIGS[SELECT_DATASET_INDEX]['metric'], min_threshold=CONFIGS[SELECT_DATASET_INDEX]['min_threshold'])
rules = rules.sort_values(
    ['support', 'confidence'], ascending=[False, False])
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(color_inten@@@0),(type@@@2),0.101124,0.398876,0.101124,1.0,2.507042,0.060788,inf
1,(color_inten@@@1),(type@@@2),0.101124,0.398876,0.101124,1.0,2.507042,0.060788,inf
4,(malic_acid@@@0),(type@@@2),0.101124,0.398876,0.095506,0.944444,2.367762,0.05517,10.820225
2,(color_inten@@@2),(type@@@2),0.089888,0.398876,0.089888,1.0,2.507042,0.054034,inf
3,(color_inten@@@9),(type@@@3),0.101124,0.269663,0.089888,0.888889,3.296296,0.062618,6.573034
5,(malic_acid@@@4),(type@@@1),0.117978,0.331461,0.089888,0.761905,2.298628,0.050783,2.807865


In [8]:
# Filter rules to ensure consequent is the pred class
def remove_unnecessary_rules(rule):
    if rule.__len__() == 1:
        return list(rule)[0].split("@@@")[0] == DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]
    else:
        return False

filtered_rules = rules.copy()
filtered_rules = filtered_rules[filtered_rules['consequents'].apply(
    remove_unnecessary_rules) == True]

# Check if every class is covered
unique_classes_in_rules = np.unique(
    filtered_rules['consequents'].apply(lambda x: list(x)[0].split("@@@")[1]))
unique_classes_in_raw_data = np.unique(
    raw_data[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]])

print(unique_classes_in_rules, unique_classes_in_raw_data)

filtered_rules

['1' '2' '3'] [1 2 3]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(color_inten@@@0),(type@@@2),0.101124,0.398876,0.101124,1.0,2.507042,0.060788,inf
1,(color_inten@@@1),(type@@@2),0.101124,0.398876,0.101124,1.0,2.507042,0.060788,inf
4,(malic_acid@@@0),(type@@@2),0.101124,0.398876,0.095506,0.944444,2.367762,0.05517,10.820225
2,(color_inten@@@2),(type@@@2),0.089888,0.398876,0.089888,1.0,2.507042,0.054034,inf
3,(color_inten@@@9),(type@@@3),0.101124,0.269663,0.089888,0.888889,3.296296,0.062618,6.573034
5,(malic_acid@@@4),(type@@@1),0.117978,0.331461,0.089888,0.761905,2.298628,0.050783,2.807865


In [9]:
# Create classifiers from rules
rule_dict = []

index = 0
for _, rule in filtered_rules.iterrows():
    antecedents = list(rule['antecedents'])
    antecedents_dict = {}
    for antecedent in antecedents:
      colname, colvalue = antecedent.split("@@@")
      antecedents_dict[colname] = colvalue

    colname, colvalue = list(rule['consequents'])[0].split("@@@")
    consequents_dict = { colname: colvalue }
    
    rule_dict.append(
        {'index': index, 'antecedents': antecedents_dict, 'consequents': consequents_dict})
    index += 1

display(pd.DataFrame(rule_dict))


Unnamed: 0,index,antecedents,consequents
0,0,{'color_inten': '0'},{'type': '2'}
1,1,{'color_inten': '1'},{'type': '2'}
2,2,{'malic_acid': '0'},{'type': '2'}
3,3,{'color_inten': '2'},{'type': '2'}
4,4,{'color_inten': '9'},{'type': '3'}
5,5,{'malic_acid': '4'},{'type': '1'}


In [10]:
training_set = raw_data[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX]].copy()
_, training_set = apply_discretization(training_set, bin_thresh)
training_set

Unnamed: 0,alcalinity,malic_acid,nonflavanoid+phenols,color_inten,type
0,alcalinity@@@0,malic_acid@@@3,nonflavanoid+phenols@@@3,color_inten@@@6,type@@@1
1,alcalinity@@@0,malic_acid@@@4,nonflavanoid+phenols@@@2,color_inten@@@4,type@@@1
2,alcalinity@@@4,malic_acid@@@6,nonflavanoid+phenols@@@4,color_inten@@@6,type@@@1
3,alcalinity@@@2,malic_acid@@@5,nonflavanoid+phenols@@@1,color_inten@@@8,type@@@1
4,alcalinity@@@7,malic_acid@@@6,nonflavanoid+phenols@@@6,color_inten@@@4,type@@@1
...,...,...,...,...,...
173,alcalinity@@@6,malic_acid@@@9,nonflavanoid+phenols@@@8,color_inten@@@8,type@@@3
174,alcalinity@@@8,malic_acid@@@8,nonflavanoid+phenols@@@7,color_inten@@@8,type@@@3
175,alcalinity@@@6,malic_acid@@@9,nonflavanoid+phenols@@@7,color_inten@@@9,type@@@3
176,alcalinity@@@6,malic_acid@@@6,nonflavanoid+phenols@@@9,color_inten@@@9,type@@@3


In [11]:
def run_cba_cb(dataset, rule_dict, class_name):
    dataset_copy = dataset.copy()
    classifier = []
    default_class = None
    debug_print = False

    for rule in rule_dict:
        temp = []
        marked = False
        matching_count_for_rule = 0
        no_of_errors_for_rule = 0

        if debug_print:
            print(
                "Checking rule {} -> {}".format(rule['antecedents'], rule['consequents']))

        for row in dataset_copy.iterrows():
            # Check antecedents
            matching_antecedents = True
            for antecedent_name, antecedent_value in rule['antecedents'].items():
                row_antecedent_name, row_antecedent_value = row[1][antecedent_name].split(
                    "@@@")
                if antecedent_value != row_antecedent_value:
                    matching_antecedents = False

            # Check consequents
            matching_consequents = True
            for consequents_name, consequents_value in rule['consequents'].items():
                row_consequents_name, row_consequents_value = row[1][consequents_name].split(
                    "@@@")
                if consequents_value != row_consequents_value:
                    matching_consequents = False

            # Store d.id in temp and mark r if it correctly classifies d
            if matching_antecedents and matching_consequents:
                temp.append(row[0])
                marked = True

            # Log matching and error counts for r
            if matching_antecedents:
                matching_count_for_rule += 1
                if not matching_consequents:
                    no_of_errors_for_rule += 1

            # Record guess in actual dataset
            dataset.at[row[0],
                       'guess'] = consequents_value
        if debug_print:
            print('No of matching rows: ', matching_count_for_rule)
            print('No of errors: ', no_of_errors_for_rule)

        if marked:
            # Delete all the cases with the ids in temp from D
            for id in temp:
                dataset_copy = dataset_copy.drop(id)

            # Selecting a default class for the current C
            highest_count = 0
            highest_count_class = None
            remaining_item_count = 0
            for col, value in dataset_copy[class_name].value_counts().items():
                if value > highest_count:
                    highest_count = value
                    highest_count_class = col
                remaining_item_count += value

            default_class = (highest_count_class, highest_count)

            total_rule_errors = no_of_errors_for_rule
            for classifier_rule in classifier:
                total_rule_errors += classifier_rule['rule_errors']

            # Compute the total number of errors of C
            current_error_count = total_rule_errors + \
                (remaining_item_count - highest_count)

            if debug_print:
                print('Default class: ', default_class)
                print('Remaining rows: ', remaining_item_count)
                print('Total rule errors in C: ', total_rule_errors)
                print('Total no of errors of C: ', current_error_count)

            # Insert r at the end of C
            rule['index'] = len(classifier)
            rule['rule_errors'] = no_of_errors_for_rule
            rule['total_errors'] = no_of_errors_for_rule + \
                (remaining_item_count - highest_count)
            classifier.append(rule)

    # Discard rules in C that do not improve the accuracy of the classifier
    lowest_total_errors_index = min(
        classifier, key=lambda x: x['total_errors'])['index']
    classifier = classifier[:lowest_total_errors_index + 1]

    # Append default class
    default_class_rule = {}
    default_class_rule['index'] = len(classifier)
    default_class_rule['consequents'] = {
        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]: default_class[0].split("@@@")[1]}
    classifier.append(default_class_rule)

    dataset.fillna(default_class[0].split("@@@")[1])

    return classifier


### Bootstraps for pulling random sample 

In [12]:
def bootstraps(data, n_elements):
    #initialize output dictionary & unique value count
    dc   = {}
    unip = 0

    #get sample size
    b_size = data.shape[0]
    #get list of row indexes
    idx = [i for i in range(b_size)]
    #loop through the required number of bootstraps
    for b in range(n_elements):
        #obtain boostrap samples with replacement
        sidx   = np.random.choice(idx,replace=True,size=b_size)
        b_samp = []
        
        for x in sidx:
            b_samp.append(x)

        #compute number of unique values contained in the bootstrap sample
        unip  += len(set(sidx))
        #obtain out-of-bag samples for the current b
        oidx   = list(set(idx) - set(sidx))
        o_samp = []

        if oidx:
            for y in oidx:
                o_samp.append(y)
                
        #store results
        dc['boot_'+str(b)] = {'boot':b_samp,'test':o_samp}
    #return the bootstrap results
    return(dc)

### Apply bootstrap function on the dataset

In [13]:
sampleData = bootstraps(dataset, NELEMENT)
display(sampleData)

{'boot_0': {'boot': [87,
   105,
   37,
   79,
   2,
   99,
   89,
   125,
   75,
   92,
   117,
   14,
   155,
   35,
   102,
   10,
   105,
   22,
   89,
   49,
   24,
   27,
   12,
   174,
   34,
   69,
   58,
   130,
   86,
   17,
   59,
   90,
   114,
   144,
   138,
   24,
   21,
   177,
   49,
   29,
   77,
   156,
   86,
   150,
   75,
   145,
   150,
   137,
   26,
   90,
   49,
   82,
   142,
   170,
   154,
   124,
   68,
   141,
   27,
   129,
   88,
   140,
   156,
   104,
   6,
   124,
   22,
   145,
   176,
   10,
   29,
   87,
   167,
   105,
   54,
   151,
   20,
   168,
   68,
   95,
   55,
   117,
   36,
   27,
   87,
   147,
   165,
   176,
   31,
   13,
   116,
   87,
   22,
   13,
   46,
   85,
   99,
   36,
   112,
   106,
   131,
   158,
   154,
   26,
   66,
   161,
   120,
   78,
   151,
   32,
   131,
   110,
   138,
   94,
   71,
   113,
   110,
   39,
   15,
   13,
   89,
   122,
   93,
   87,
   7,
   8,
   74,
   8,
   0,
   168,
   75,
   14,
   59,
   3

### Retrieve the training and testing data from the ramdomized sampling data and reset their index

In [14]:
def convertData(sampleData, b):
    ttData = {}
    trainingIndex = sampleData['boot_'+str(b)]['boot']
    testingIndex = sampleData['boot_'+str(b)]['test']

    trainingDf = pd.DataFrame()
    testingDf = pd.DataFrame()

    for x in trainingIndex:
        trainingDf = trainingDf.append(dataset.iloc[x])

    for x in testingIndex:
        testingDf = testingDf.append(dataset.iloc[x])

    trainingDf = trainingDf.reset_index()
    trainingDf = trainingDf.drop(columns='index')
    display(trainingDf)

    testingDf = testingDf.reset_index()
    testingDf = testingDf.drop(columns='index')
    display(testingDf)

    ttData['boot_'+str(b)] = {'training':trainingDf,'testing':testingDf}

    return(ttData)

### To combine all the sampling data into one overall list

In [15]:
oData = []
for x in range(NELEMENT):
    oData.append(convertData(sampleData, x))

display(oData)

  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])


Unnamed: 0,alcalinity,malic_acid,nonflavanoid+phenols,color_inten,type
0,alcalinity@@@9,malic_acid@@@3,nonflavanoid+phenols@@@6,color_inten@@@1,type@@@2
1,alcalinity@@@8,malic_acid@@@6,nonflavanoid+phenols@@@9,color_inten@@@1,type@@@2
2,alcalinity@@@3,malic_acid@@@2,nonflavanoid+phenols@@@3,color_inten@@@4,type@@@1
3,alcalinity@@@8,malic_acid@@@8,nonflavanoid+phenols@@@7,color_inten@@@1,type@@@2
4,alcalinity@@@4,malic_acid@@@6,nonflavanoid+phenols@@@4,color_inten@@@6,type@@@1
...,...,...,...,...,...
173,alcalinity@@@8,malic_acid@@@7,nonflavanoid+phenols@@@6,color_inten@@@9,type@@@3
174,alcalinity@@@3,malic_acid@@@1,nonflavanoid+phenols@@@3,color_inten@@@0,type@@@2
175,alcalinity@@@3,malic_acid@@@1,nonflavanoid+phenols@@@4,color_inten@@@3,type@@@2
176,alcalinity@@@7,malic_acid@@@5,nonflavanoid+phenols@@@8,color_inten@@@1,type@@@2


Unnamed: 0,alcalinity,malic_acid,nonflavanoid+phenols,color_inten,type
0,alcalinity@@@0,malic_acid@@@4,nonflavanoid+phenols@@@2,color_inten@@@4,type@@@1
1,alcalinity@@@0,malic_acid@@@4,nonflavanoid+phenols@@@5,color_inten@@@7,type@@@1
2,alcalinity@@@7,malic_acid@@@8,nonflavanoid+phenols@@@0,color_inten@@@5,type@@@3
3,alcalinity@@@2,malic_acid@@@0,nonflavanoid+phenols@@@9,color_inten@@@6,type@@@3
4,alcalinity@@@3,malic_acid@@@6,nonflavanoid+phenols@@@9,color_inten@@@8,type@@@3
...,...,...,...,...,...
59,alcalinity@@@1,malic_acid@@@8,nonflavanoid+phenols@@@7,color_inten@@@3,type@@@2
60,alcalinity@@@4,malic_acid@@@8,nonflavanoid+phenols@@@5,color_inten@@@0,type@@@2
61,alcalinity@@@7,malic_acid@@@9,nonflavanoid+phenols@@@4,color_inten@@@1,type@@@2
62,alcalinity@@@7,malic_acid@@@2,nonflavanoid+phenols@@@6,color_inten@@@3,type@@@2


  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])
  trainingDf = trainingDf.append(dataset.iloc[x])


Unnamed: 0,alcalinity,malic_acid,nonflavanoid+phenols,color_inten,type
0,alcalinity@@@4,malic_acid@@@5,nonflavanoid+phenols@@@5,color_inten@@@4,type@@@1
1,alcalinity@@@6,malic_acid@@@7,nonflavanoid+phenols@@@8,color_inten@@@7,type@@@3
2,alcalinity@@@7,malic_acid@@@5,nonflavanoid+phenols@@@8,color_inten@@@1,type@@@2
3,alcalinity@@@7,malic_acid@@@9,nonflavanoid+phenols@@@9,color_inten@@@3,type@@@3
4,alcalinity@@@7,malic_acid@@@2,nonflavanoid+phenols@@@6,color_inten@@@3,type@@@2
...,...,...,...,...,...
173,alcalinity@@@0,malic_acid@@@7,nonflavanoid+phenols@@@0,color_inten@@@5,type@@@1
174,alcalinity@@@3,malic_acid@@@1,nonflavanoid+phenols@@@3,color_inten@@@0,type@@@2
175,alcalinity@@@6,malic_acid@@@6,nonflavanoid+phenols@@@7,color_inten@@@9,type@@@3
176,alcalinity@@@9,malic_acid@@@5,nonflavanoid+phenols@@@2,color_inten@@@8,type@@@3


Unnamed: 0,alcalinity,malic_acid,nonflavanoid+phenols,color_inten,type
0,alcalinity@@@4,malic_acid@@@6,nonflavanoid+phenols@@@4,color_inten@@@6,type@@@1
1,alcalinity@@@2,malic_acid@@@5,nonflavanoid+phenols@@@1,color_inten@@@8,type@@@1
2,alcalinity@@@7,malic_acid@@@6,nonflavanoid+phenols@@@6,color_inten@@@4,type@@@1
3,alcalinity@@@0,malic_acid@@@4,nonflavanoid+phenols@@@5,color_inten@@@7,type@@@1
4,alcalinity@@@6,malic_acid@@@7,nonflavanoid+phenols@@@1,color_inten@@@6,type@@@3
...,...,...,...,...,...
65,alcalinity@@@7,malic_acid@@@6,nonflavanoid+phenols@@@2,color_inten@@@0,type@@@2
66,alcalinity@@@1,malic_acid@@@8,nonflavanoid+phenols@@@7,color_inten@@@3,type@@@2
67,alcalinity@@@9,malic_acid@@@5,nonflavanoid+phenols@@@7,color_inten@@@7,type@@@2
68,alcalinity@@@7,malic_acid@@@9,nonflavanoid+phenols@@@4,color_inten@@@1,type@@@2


[{'boot_0': {'training':          alcalinity      malic_acid      nonflavanoid+phenols  \
   0    alcalinity@@@9  malic_acid@@@3  nonflavanoid+phenols@@@6   
   1    alcalinity@@@8  malic_acid@@@6  nonflavanoid+phenols@@@9   
   2    alcalinity@@@3  malic_acid@@@2  nonflavanoid+phenols@@@3   
   3    alcalinity@@@8  malic_acid@@@8  nonflavanoid+phenols@@@7   
   4    alcalinity@@@4  malic_acid@@@6  nonflavanoid+phenols@@@4   
   ..              ...             ...                       ...   
   173  alcalinity@@@8  malic_acid@@@7  nonflavanoid+phenols@@@6   
   174  alcalinity@@@3  malic_acid@@@1  nonflavanoid+phenols@@@3   
   175  alcalinity@@@3  malic_acid@@@1  nonflavanoid+phenols@@@4   
   176  alcalinity@@@7  malic_acid@@@5  nonflavanoid+phenols@@@8   
   177  alcalinity@@@7  malic_acid@@@2  nonflavanoid+phenols@@@8   
   
            color_inten      type  
   0    color_inten@@@1  type@@@2  
   1    color_inten@@@1  type@@@2  
   2    color_inten@@@4  type@@@1  
   3    color_

### Run the cba classifier for both training and testing datas

In [16]:
trainingDf = {}
testingDf = {}
for x in range(NELEMENT):
    trainingDf[x] = run_cba_cb(oData[x]['boot_'+str(x)]['training'], rule_dict,
                        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1])
    display(trainingDf[x])

    testingDf[x] = run_cba_cb(oData[x]['boot_'+str(x)]['testing'], rule_dict,
                        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1])
    display(trainingDf[x])
    

[{'index': 0,
  'antecedents': {'color_inten': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 95},
 {'index': 1,
  'antecedents': {'color_inten': '1'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 69},
 {'index': 2,
  'antecedents': {'malic_acid': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 59},
 {'index': 3,
  'antecedents': {'color_inten': '2'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 50},
 {'index': 4,
  'antecedents': {'color_inten': '9'},
  'consequents': {'type': '3'},
  'rule_errors': 4,
  'total_errors': 37},
 {'index': 5,
  'antecedents': {'malic_acid': '4'},
  'consequents': {'type': '1'},
  'rule_errors': 1,
  'total_errors': 34},
 {'index': 6, 'consequents': {'type': '1'}}]

[{'index': 0,
  'antecedents': {'color_inten': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 37},
 {'index': 1,
  'antecedents': {'color_inten': '1'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 33},
 {'index': 2,
  'antecedents': {'malic_acid': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 1,
  'total_errors': 29},
 {'index': 3,
  'antecedents': {'color_inten': '2'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 24},
 {'index': 4,
  'antecedents': {'color_inten': '9'},
  'consequents': {'type': '3'},
  'rule_errors': 0,
  'total_errors': 18},
 {'index': 5,
  'antecedents': {'malic_acid': '4'},
  'consequents': {'type': '1'},
  'rule_errors': 1,
  'total_errors': 19},
 {'index': 6, 'consequents': {'type': '1'}}]

[{'index': 0,
  'antecedents': {'color_inten': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 100},
 {'index': 1,
  'antecedents': {'color_inten': '1'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 94},
 {'index': 2,
  'antecedents': {'malic_acid': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 1,
  'total_errors': 74},
 {'index': 3,
  'antecedents': {'color_inten': '2'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 61},
 {'index': 4,
  'antecedents': {'color_inten': '9'},
  'consequents': {'type': '3'},
  'rule_errors': 1,
  'total_errors': 45},
 {'index': 5, 'consequents': {'type': '1'}}]

[{'index': 0,
  'antecedents': {'color_inten': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 35},
 {'index': 1,
  'antecedents': {'color_inten': '1'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 27},
 {'index': 2,
  'antecedents': {'malic_acid': '0'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 24},
 {'index': 3,
  'antecedents': {'color_inten': '2'},
  'consequents': {'type': '2'},
  'rule_errors': 0,
  'total_errors': 19},
 {'index': 4,
  'antecedents': {'color_inten': '9'},
  'consequents': {'type': '3'},
  'rule_errors': 1,
  'total_errors': 15},
 {'index': 5, 'consequents': {'type': '1'}}]

### To combine all the different model's accuracy data and aggregate them in accordance to number of NELEMENT.

In [17]:
oAccuracy = 0
for x in range(NELEMENT):

    trainingDf = oData[x]['boot_'+str(x)]['training']
    correct_guesses = len(trainingDf[trainingDf['guess'] ==
                                    trainingDf[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]].apply(lambda x: x.split("@@@")[1])])

    testingDf = oData[x]['boot_'+str(x)]['testing']
    correct_guesses2 = len(testingDf[testingDf['guess'] ==
                                    testingDf[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]].apply(lambda x: x.split("@@@")[1])])

    accuracy = correct_guesses / len(trainingDf)
    accuracy2 = correct_guesses2 / len(testingDf)
    oAccuracy += (accuracy + accuracy2)

print('Accuracy:', round(oAccuracy/(NELEMENT*2), 2))

Accuracy: 0.77


### Running the cba classifier on the original training set data

In [18]:
classifier = run_cba_cb(training_set, rule_dict,
                        DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1])
end = time.process_time()
print('Time elapsed:', round(end - start, 2), '(s)')
display(pd.DataFrame(classifier))
display(training_set)

Time elapsed: 0.94 (s)


Unnamed: 0,index,antecedents,consequents,rule_errors,total_errors
0,0,{'color_inten': '0'},{'type': '2'},0.0,101.0
1,1,{'color_inten': '1'},{'type': '2'},0.0,83.0
2,2,{'malic_acid': '0'},{'type': '2'},1.0,72.0
3,3,{'color_inten': '2'},{'type': '2'},0.0,60.0
4,4,{'color_inten': '9'},{'type': '3'},2.0,46.0
5,5,,{'type': '1'},,


Unnamed: 0,alcalinity,malic_acid,nonflavanoid+phenols,color_inten,type,guess
0,alcalinity@@@0,malic_acid@@@3,nonflavanoid+phenols@@@3,color_inten@@@6,type@@@1,1
1,alcalinity@@@0,malic_acid@@@4,nonflavanoid+phenols@@@2,color_inten@@@4,type@@@1,1
2,alcalinity@@@4,malic_acid@@@6,nonflavanoid+phenols@@@4,color_inten@@@6,type@@@1,1
3,alcalinity@@@2,malic_acid@@@5,nonflavanoid+phenols@@@1,color_inten@@@8,type@@@1,1
4,alcalinity@@@7,malic_acid@@@6,nonflavanoid+phenols@@@6,color_inten@@@4,type@@@1,1
...,...,...,...,...,...,...
173,alcalinity@@@6,malic_acid@@@9,nonflavanoid+phenols@@@8,color_inten@@@8,type@@@3,1
174,alcalinity@@@8,malic_acid@@@8,nonflavanoid+phenols@@@7,color_inten@@@8,type@@@3,1
175,alcalinity@@@6,malic_acid@@@9,nonflavanoid+phenols@@@7,color_inten@@@9,type@@@3,3
176,alcalinity@@@6,malic_acid@@@6,nonflavanoid+phenols@@@9,color_inten@@@9,type@@@3,3


### Churning out the original cba classifier accuracy

In [19]:
# Check classifier accuracy
correct_guesses = len(training_set[training_set['guess'] ==
                                   training_set[DATASET_ATTRIBUTES[SELECT_DATASET_INDEX][-1]].apply(lambda x: x.split("@@@")[1])])
accuracy = correct_guesses / len(training_set)

print('Accuracy:', round(accuracy, 2))

Accuracy: 0.75
