# COMP47590: Advanced Machine Learning
# Assignment 1: Multi-label Classification

Name(s): Raphael Hetherington

Student Number(s): 18200573

## Import Packages Etc

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score
# import other useful packages

## Task 0: Load the Yeast Dataset

In [2]:
# Write your code here
data = pd.read_csv("yeast.csv")
data


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.041850,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.007970,0.049113,-0.030580,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.007670,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,1,1,0


## Task 1: Implement the Binary Relevance Algorithm

In [9]:
'''
To use the Binary Relevance class, you need to pass in the specific list of labels that you want to choose from.
The algorithm will then go through each, create a model based on each, and then aggregate the findings. 

'''
class BinaryRelevance():
    def __init__(self, classifier, class_labels):
        # pass in a classifier object
        # the number of class labels is necessary so that we know where to start slicing from 
        self.classifier = classifier
        self.class_labels = class_labels # should be a list
        self.models = {}
        self.labels_series = pd.Series(self.class_labels)
         
    
    def train(self, data_to_train):
        # first step is simply to pass in a subset of the data. 
        # We just want one batch of the data and another batch
        '''
        Step 1: 
            For each class label we have to strip away the others, and make a model with the class label.
            We're ultimately going to be using an aggregation of these models, so we store the models that 
            we create in the models dictionary.
        ''' 
        # the features are all the columns that are not the class labels
        features = data_to_train[data_to_train.columns[~data_to_train.columns.isin(self.labels_series)]]
        
        # for each class label we create a new model
        for class_label in self.class_labels:
            # the model is stored in self.models
            self.models[class_label] = self.classifier()
            # we select the class label column as our y
            y = data_to_train[[class_label]]
            # we train the model
            self.models[class_label].fit(features, y)
    
#     # I'm not sure if a test method is necessary. I left this here because I was working on it, 
#.       but you can test the data simply by comparing the prediction with the test data set.
#     def test(self, data_to_test):
#         features = data_to_test[data_to_test.columns[~data_to_test.columns.isin(self.labels_series)]]
#         for class_label in self.class_labels:
#                 model = self.models[class_label]
#                 prediction = model.predict(features)
#                 print(prediction)
                

    # Inputs: this method receives a dataframe WITHOUT class labels and returns a dataframe with the class labels predicted
    def predict(self, features):
            return_frame = features
            
            for class_label in self.class_labels:
                model = self.models[class_label] # select the appropriate model from the dictionary
                prediction = model.predict(features) # predict with sklearn's classifier
                prediction_frame = pd.DataFrame(data=prediction, columns=[class_label]) # create a new df with the prediction
                return_frame = return_frame.reset_index(drop=True) # reset index
                prediction_frame = prediction_frame.reset_index(drop=True) # reset index
                return_frame = pd.concat([return_frame, prediction_frame], axis=1) # concatenate the class label with the features
            return return_frame
            
                


### Usage

In [10]:
train, test = np.split(data, [int(.7*len(data))]) # I split the dataframe into 70% train and 30% test
train # 1691 rows
test # 726 rows


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
1691,0.098609,0.021142,0.114095,-0.019634,0.067450,-0.033799,-0.079402,0.008213,0.028437,0.033689,...,0,0,0,1,1,0,0,0,0,0
1692,-0.012358,0.128307,0.166101,0.021330,0.063760,-0.029857,-0.032447,0.090522,-0.024394,-0.036240,...,0,1,1,0,0,0,0,0,0,0
1693,0.004204,0.059006,-0.001850,0.047904,-0.042538,0.076276,-0.055510,0.081995,0.061146,0.130098,...,1,0,0,0,0,0,0,1,1,0
1694,-0.040390,0.386083,-0.111378,-0.196421,-0.104228,-0.204434,-0.033160,-0.054812,-0.072045,-0.113054,...,1,1,0,0,0,0,0,1,1,0
1695,0.014664,0.349167,0.080306,0.147173,0.090360,0.014918,-0.050594,0.122519,0.039955,0.037153,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,1,1,0


In [11]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB # our test classifier
class_labels = ['Class1', 'Class2', 'Class3', 'Class4', 'Class5', 'Class6', 'Class7', 'Class8', 'Class9', 'Class10', 'Class11', 'Class12', 'Class13', 'Class14']
br_classifier = BinaryRelevance(GaussianNB, class_labels) 

In [12]:
# train
br_classifier.train(train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
features = test[test.columns[~test.columns.isin(class_labels)]]
new_df = br_classifier.predict(features)
new_df

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.098609,0.021142,0.114095,-0.019634,0.067450,-0.033799,-0.079402,0.008213,0.028437,0.033689,...,0,1,1,1,0,1,1,0,0,1
1,-0.012358,0.128307,0.166101,0.021330,0.063760,-0.029857,-0.032447,0.090522,-0.024394,-0.036240,...,0,1,1,1,0,1,1,0,0,0
2,0.004204,0.059006,-0.001850,0.047904,-0.042538,0.076276,-0.055510,0.081995,0.061146,0.130098,...,1,0,0,0,0,0,0,1,1,0
3,-0.040390,0.386083,-0.111378,-0.196421,-0.104228,-0.204434,-0.033160,-0.054812,-0.072045,-0.113054,...,0,1,1,1,1,1,1,0,0,0
4,0.014664,0.349167,0.080306,0.147173,0.090360,0.014918,-0.050594,0.122519,0.039955,0.037153,...,0,0,0,0,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,1,1,1,0,0,0,0,0
722,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,1,1,1,0
723,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,0,0,1,1,0,0,0
724,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,0,0,0


In [14]:
test

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
1691,0.098609,0.021142,0.114095,-0.019634,0.067450,-0.033799,-0.079402,0.008213,0.028437,0.033689,...,0,0,0,1,1,0,0,0,0,0
1692,-0.012358,0.128307,0.166101,0.021330,0.063760,-0.029857,-0.032447,0.090522,-0.024394,-0.036240,...,0,1,1,0,0,0,0,0,0,0
1693,0.004204,0.059006,-0.001850,0.047904,-0.042538,0.076276,-0.055510,0.081995,0.061146,0.130098,...,1,0,0,0,0,0,0,1,1,0
1694,-0.040390,0.386083,-0.111378,-0.196421,-0.104228,-0.204434,-0.033160,-0.054812,-0.072045,-0.113054,...,1,1,0,0,0,0,0,1,1,0
1695,0.014664,0.349167,0.080306,0.147173,0.090360,0.014918,-0.050594,0.122519,0.039955,0.037153,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,1,1,0


## Task 2: Implement the Binary Relevance Algorithm with Under-Sampling
Our objective is to balance the class distribution for each label. We can do this by:  
1) Evaluating the class distribution for each class  
2) If the classes are imbalanced, we can remove a subset of the data to balance them. 

To enact undersampling, I'm going to create a subclass of the BinaryRelevance class, called BinaryRelevanceWithUnderSampling. This class will have a method - balance_classes_and_train. This method will:
- assess each label and the distribution of classes
- produce a balanced dataframe 
- train a model
- revert back to the original "full" dataframe and repeat so as to keep as much data available as possible.

In [15]:
class BinaryRelevanceWithUnderSampling(BinaryRelevance):
    def __init__(self, classifier, labels):
        super().__init__(classifier, labels)
    
    
    def balance_classes_and_train(self, data_to_train):
        
    


## Task 3: Compare the Performance of Different Binary Relevance Approaches



In [55]:
'''
Evaluation of multi-label classifiers
-------------------------------------

Approaches we can take: 
- micro-averaging
    - for each label, find TPs, TNs, FPs and FNs
    - using the sums of those, derive an aggregated accuracy, precision, recall and F1 metric. 
- macro-averaging
    - Use the entire prediction set to determine accuracy and other metrics
- hamming loss
    - The fraction of incorrect labels to total labels. 

'''



from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss




0.3018496654860291

## Hamming Loss

In [56]:
# Binary Relevance: 
test_class_label_results = test.loc[:, 'Class1':]
prediction_class_label_results = new_df.loc[:, 'Class1':]
test_class_label_results
prediction_class_label_results.reset_index()
test_class_label_results = test_class_label_results.reset_index()
prediction_class_label_results
test_class_label_results = test_class_label_results.drop("index", axis=1)
test_class_label_results
hamming_loss(test_class_label_results, prediction_class_label_results)
# 0.302 - not great.

0.3018496654860291

## Task 4: Implement the Classifier Chains Algorithm

In [6]:
# Write your code here


## Task 5: Evaluate the Performance of the Classifier Chains Algorithm

In [7]:
# Write your code here


## Task 6: Reflect on the Performance of the Different Models Evaluated

*Write your reflection here (max 300 words)*