# COMP47590: Advanced Machine Learning
# Assignment 1: Multi-label Classification

Name(s): Raphael Hetherington

Student Number(s): 18200573

## Import Packages Etc

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score
# import other useful packages

## Task 0: Load the Yeast Dataset

In [66]:
# Load the yeast multi-label dataset (Att* feature columns followed by Class* label columns)
# from the notebook's working directory, then display it.
data = pd.read_csv("yeast.csv")
data


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.041850,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.007970,0.049113,-0.030580,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.007670,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,1,1,0


## Task 1: Implement the Binary Relevance Algorithm

In [131]:
class BinaryRelevance:
    '''
    Binary relevance multi-label classifier: one independent binary model per label.

    To use the class, pass in the list of label columns that you want to predict.
    train() fits a separate model for every label and predict() places the
    per-label predictions side by side in a single dataframe.
    '''

    def __init__(self, classifier, class_labels):
        '''
        classifier:   called with no arguments to build a fresh, unfitted model for
                      each label (e.g. pass the GaussianNB class itself, not an instance).
                      The model must offer fit(X, y) and predict(X).
        class_labels: list of label column names. Every other column of the
                      training dataframe is treated as a feature.
        '''
        self.classifier = classifier
        self.class_labels = class_labels  # should be a list
        self.models = {}  # label name -> fitted model
        self.labels_series = pd.Series(self.class_labels)

    def train(self, data_to_train):
        '''
        Fit one model per class label.

        data_to_train: dataframe holding the feature columns and every column
                       named in class_labels.

        Each label is learned in isolation: the other label columns are stripped
        away so that they are never used as features. The fitted models are kept
        in self.models, keyed by label name, and combined later by predict().
        '''
        # the features are all the columns that are not the class labels
        is_label = data_to_train.columns.isin(self.labels_series)
        features = data_to_train[data_to_train.columns[~is_label]]

        for class_label in self.class_labels:
            model = self.classifier()
            # the label column on its own is the binary target for this model
            model.fit(features, data_to_train[class_label].values)
            # register the model only once it has been fitted successfully
            self.models[class_label] = model

    def predict(self, features):
        '''
        Predict every class label for the given rows.

        features: dataframe WITHOUT the class label columns, with the same
                  feature columns that were used in train().

        Returns a new dataframe containing the feature columns followed by one
        predicted column per class label. The result always has a fresh
        0..n-1 index; the dataframe passed in is not modified.
        Raises KeyError if a label has no trained model yet.
        '''
        return_frame = features.reset_index(drop=True)
        # gather all the predictions first and concatenate once, rather than
        # rebuilding the whole frame for every label
        predicted = {
            class_label: np.ravel(self.models[class_label].predict(features))
            for class_label in self.class_labels
        }
        prediction_frame = pd.DataFrame(
            predicted, columns=list(self.class_labels), index=return_frame.index
        )
        return pd.concat([return_frame, prediction_frame], axis=1)
            
                


### Usage

In [132]:
# Split the dataframe into 70% train and 30% test, in file order (no shuffling).
# Plain positional slicing gives the same split as np.split(data, [split_at]) without
# routing the dataframe through numpy, which recent pandas versions warn about.
split_at = int(.7 * len(data))
train = data.iloc[:split_at]  # 1691 rows
test = data.iloc[split_at:]  # 726 rows
test


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
1691,0.098609,0.021142,0.114095,-0.019634,0.067450,-0.033799,-0.079402,0.008213,0.028437,0.033689,...,0,0,0,1,1,0,0,0,0,0
1692,-0.012358,0.128307,0.166101,0.021330,0.063760,-0.029857,-0.032447,0.090522,-0.024394,-0.036240,...,0,1,1,0,0,0,0,0,0,0
1693,0.004204,0.059006,-0.001850,0.047904,-0.042538,0.076276,-0.055510,0.081995,0.061146,0.130098,...,1,0,0,0,0,0,0,1,1,0
1694,-0.040390,0.386083,-0.111378,-0.196421,-0.104228,-0.204434,-0.033160,-0.054812,-0.072045,-0.113054,...,1,1,0,0,0,0,0,1,1,0
1695,0.014664,0.349167,0.080306,0.147173,0.090360,0.014918,-0.050594,0.122519,0.039955,0.037153,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,1,1,0


In [133]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB # our test classifier
# the 14 label columns are named Class1 ... Class14
class_labels = [f"Class{number}" for number in range(1, 15)]
# pass the GaussianNB class itself (not an instance): BinaryRelevance calls it once per label to build a fresh model
br_classifier = BinaryRelevance(GaussianNB, class_labels) 

In [134]:
# fit one model per class label on the training split
br_classifier.train(train)


In [135]:
# display the held-out test split (features and true labels)
test

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
1691,0.098609,0.021142,0.114095,-0.019634,0.067450,-0.033799,-0.079402,0.008213,0.028437,0.033689,...,0,0,0,1,1,0,0,0,0,0
1692,-0.012358,0.128307,0.166101,0.021330,0.063760,-0.029857,-0.032447,0.090522,-0.024394,-0.036240,...,0,1,1,0,0,0,0,0,0,0
1693,0.004204,0.059006,-0.001850,0.047904,-0.042538,0.076276,-0.055510,0.081995,0.061146,0.130098,...,1,0,0,0,0,0,0,1,1,0
1694,-0.040390,0.386083,-0.111378,-0.196421,-0.104228,-0.204434,-0.033160,-0.054812,-0.072045,-0.113054,...,1,1,0,0,0,0,0,1,1,0
1695,0.014664,0.349167,0.080306,0.147173,0.090360,0.014918,-0.050594,0.122519,0.039955,0.037153,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,1,1,0


## Task 2: Implement the Binary Relevance Algorithm with Under-Sampling
Our objective is to balance the class distribution for each label. We can do this by:  
1) Evaluating the class distribution for each class  
2) If the classes are imbalanced, we can remove a subset of the data to balance them. 

To enact undersampling, I'm going to create a subclass of the BinaryRelevance class, called BinaryRelevanceWithUnderSampling. This class will have a method - balance_classes_and_train. This method will:
- assess each label and the distribution of classes
- produce a balanced dataframe **if required**
- train a model
- revert back to the original "full" dataframe and repeat so as to keep as much data available as possible.

In [136]:
class BinaryRelevanceWithUnderSampling(BinaryRelevance):
    '''
    Binary relevance in which every per-label model is trained on a balanced sample.

    For each label the majority class is randomly under-sampled down to the size
    of the minority class before the model is fitted. The full training dataframe
    is used again as the starting point for the next label, so as much data as
    possible stays available to each model.

    Note: only balance_classes_and_train() under-samples. The inherited train()
    still fits on all rows.
    '''

    def __init__(self, classifier, labels):
        super().__init__(classifier, labels)

    def balance_classes_and_train(self, data_to_train, random_state=42):
        '''
        Fit one model per class label on a class-balanced subset of the data.

        data_to_train: dataframe holding the feature columns and every column
                       named in class_labels (labels coded as 0 / 1).
        random_state:  seed used when sampling the majority class, so that the
                       under-sampling is reproducible.
        '''
        # the features are all the columns that are not the class labels
        feature_columns = data_to_train.columns[~data_to_train.columns.isin(self.labels_series)]

        for class_label in self.class_labels:
            # first check the distribution of classes
            with_label = data_to_train.loc[data_to_train[class_label] == 1]
            without_label = data_to_train.loc[data_to_train[class_label] == 0]

            # find the minority and majority class. On a tie the positives count as
            # the minority, so the two groups are never the same dataframe.
            if len(with_label) <= len(without_label):
                minority, majority = with_label, without_label
            else:
                minority, majority = without_label, with_label

            if len(minority) == 0:
                # only one class is present: there is nothing to balance against,
                # so train on every row rather than on an empty sample
                balanced = majority
            else:
                # take a subset of the majority class that is the same length as
                # the minority class and combine the two
                sub_sample = majority.sample(n=len(minority), random_state=random_state)
                balanced = pd.concat([minority, sub_sample])

            # create and fit the classifier for this label
            model = self.classifier()
            model.fit(balanced[feature_columns], balanced[class_label].values)
            self.models[class_label] = model


In [137]:
# plain binary relevance: every model is trained on all training rows
br_classifier = BinaryRelevance(GaussianNB, class_labels) 
br_classifier.train(train)
# binary relevance with undersampling: every model is trained on a class-balanced subset
br_undersampling_classifier = BinaryRelevanceWithUnderSampling(GaussianNB, class_labels) 
br_undersampling_classifier.balance_classes_and_train(train)

In [138]:
# keep only the feature columns of the test split (drop every class label column)
test_features = test.loc[:, ~test.columns.isin(class_labels)]
test_features

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103
1691,0.098609,0.021142,0.114095,-0.019634,0.067450,-0.033799,-0.079402,0.008213,0.028437,0.033689,...,-0.002545,-0.011628,-0.024093,-0.015627,-0.011493,-0.006109,-0.045359,-0.022919,0.026548,0.150880
1692,-0.012358,0.128307,0.166101,0.021330,0.063760,-0.029857,-0.032447,0.090522,-0.024394,-0.036240,...,-0.042393,-0.064163,0.185626,-0.060137,-0.066107,0.016158,-0.074661,-0.073637,-0.042459,-0.152053
1693,0.004204,0.059006,-0.001850,0.047904,-0.042538,0.076276,-0.055510,0.081995,0.061146,0.130098,...,-0.055143,0.206042,-0.034867,0.151249,0.144882,-0.098875,-0.053172,-0.077365,0.010706,0.035970
1694,-0.040390,0.386083,-0.111378,-0.196421,-0.104228,-0.204434,-0.033160,-0.054812,-0.072045,-0.113054,...,0.002193,-0.029572,-0.028112,-0.021491,-0.027989,-0.022536,0.137120,0.081262,0.014566,0.105380
1695,0.014664,0.349167,0.080306,0.147173,0.090360,0.014918,-0.050594,0.122519,0.039955,0.037153,...,0.144818,-0.076249,0.108530,-0.039793,-0.046487,-0.072883,0.100437,-0.048302,-0.067825,0.013865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0.024084,-0.055915,-0.055593,-0.049642,0.018571,0.068742,-0.061001,-0.081132,-0.065844,0.001267
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,-0.079992,-0.075444,0.294987,-0.076379,-0.076293,-0.072451,-0.052258,-0.040026,0.342176,-0.169668
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,-0.006624,-0.036850,-0.064831,-0.068696,-0.068521,-0.039841,0.274575,-0.066957,0.260121,-0.125303
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0.085087,0.033166,-0.012710,0.135359,0.213512,-0.107561,-0.081925,-0.122332,-0.022453,0.001953


In [139]:
# predicted labels (appended to the test features) from plain binary relevance
binary_relevance_without_undersampling = br_classifier.predict(test_features)

In [140]:
# predicted labels (appended to the test features) from binary relevance with undersampling
binary_relevance_with_undersampling = br_undersampling_classifier.predict(test_features)


## Task 3: Compare the Performance of Different Binary Relevance Approaches



In [127]:
'''
Evaluation of multi-label classifiers
-------------------------------------

Approaches we can take: 
- micro-averaging
    - for each label, find TPs, TNs, FPs and FNs
    - using the sums of those, derive an aggregated accuracy, precision, recall and F1 metric. 
- macro-averaging
    - compute each metric (accuracy, precision, recall, F1) separately for every label, then take the unweighted mean over the labels
- hamming loss
    - The fraction of incorrect labels to total labels. 

'''

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss




### i. Creating metrics for both approaches
In order to compare the performance of the algorithms with and without undersampling, I first need to evaluate the classifiers. For this I will use the held-out test set.

## Hamming Loss

In [157]:
# Binary Relevance: Hamming loss on the held-out test set, with and without undersampling.
# Hamming loss is the fraction of individual label assignments that are wrong, so lower is better.

# Select the label columns by name so that the true labels and both sets of
# predictions have exactly the same columns in the same order.
test_class_label_results = test[class_labels].reset_index(drop=True)
prediction_class_label_results_with_undersampling = binary_relevance_with_undersampling[class_labels]
prediction_class_label_results_without_undersampling = binary_relevance_without_undersampling[class_labels]

# The prediction frames already carry a fresh 0..n-1 index, so only the true
# labels needed their index reset above.
hamming_loss_without_undersampling = hamming_loss(
    test_class_label_results, prediction_class_label_results_without_undersampling
)  # 0.3018..
hamming_loss_with_undersampling = hamming_loss(
    test_class_label_results, prediction_class_label_results_with_undersampling
)  # 0.3798...

# report both values; a bare expression would only display the last one
print("Hamming loss without undersampling:", hamming_loss_without_undersampling)
print("Hamming loss with undersampling:", hamming_loss_with_undersampling)

0.37987012987012986

### Results:
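- without undersampling: Hamming loss ≈ 0.30, i.e. about 30% of the individual label assignments are incorrect (≈ 70% correct)
- with undersampling: Hamming loss ≈ 0.38, i.e. about 38% of the individual label assignments are incorrect (≈ 62% correct)

Hamming loss measures the fraction of wrong labels, so lower is better: on this split, undersampling made the predictions worse.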

## Task 4: Implement the Classifier Chains Algorithm

In [6]:
# Write your code here


## Task 5: Evaluate the Performance of the Classifier Chains Algorithm

In [7]:
# Write your code here


## Task 6: Reflect on the Performance of the Different Models Evaluated

*Write your reflection here (max 300 words)*