# Selection of data for the Binary model
With the binary model we want to predict whether the protein is in a specific (not so frequent) location or not. Therefore we need to collect all the pictures whose label includes this specific location and the same number of pictures whose label does not include it.

In [68]:
import pandas as pd
import numpy as np


In [9]:
# Load the training labels from train.csv into a DataFrame.
# The class below reads the `Id` (picture name) and `Target` (locations)
# columns of this frame.
labels_training = pd.read_csv('../data/human-protein-atlas-image-classification_data/train.csv')

#### Class for getting a list of the names of the pictures that were labelled with a given location

In [70]:
class Location_in_Target(object):
    ''' Class to get a list of names of pictures which are labeled with a
        specific location and the same number of names of pictures whose
        labels do not contain the specific location.
        ...
        Attributes:
        ----------
        location: int
            number which defines a location in a human cell
        pictures: pandas.DataFrame
            one column with the names of the pictures labeled with the
            location and one column with the same number of names of
            pictures without the label

        Methods:
        ----------
        save_pictures(pictures, column_name):
            Saves a list of picture names as a column of the attribute
        get_pictures()
            Returns the DataFrame with the picture names
        determine_pictures(labels, random_state=None)
            Finds pictures with and without the specific label and calls the
            save method for each of the two lists
    '''

    def __init__(self, location):
        '''Parameters
           ----------
           location: int
                number which defines a location in a human cell
        '''

        self.location = location
        self.pictures = pd.DataFrame()

    def save_pictures(self, pictures, column_name):
        ''' Saves the list of pictures as a column of the attribute pictures

            Parameters
            ----------
            pictures: list or array of str
                names of pictures to store; once the DataFrame already has
                rows, the length has to match the number of rows
            column_name: str
                Description of the data, used as the column name
        '''

        self.pictures[column_name] = pictures

    def get_pictures(self):
        '''Gives back the DataFrame of picture names which are labeled or not
            labeled with the given location
        '''

        # The attribute is called `pictures`; `self.picture` does not exist
        # and raised an AttributeError.
        return self.pictures

    def determine_pictures(self, labels, random_state=None):
        ''' First the names of the pictures labeled with the given location
            are collected and saved in a DataFrame column.
            Then the same number of names of pictures not labeled with the
            location is chosen randomly and saved in a second column.

            Parameters
            ----------
            labels: pandas.DataFrame
                needs a column `Id` (picture name) and a column `Target`
                (location numbers separated by whitespace)
            random_state: int, optional
                seed for a reproducible random choice; with the default None
                the global numpy random state is used

            Raises
            ------
            ValueError
                (from numpy) if pictures with the location exist but no
                picture without it is available to sample from
        '''

        location_token = str(self.location)
        with_location = []
        without_location = []
        for picture_id, target in zip(labels['Id'], labels['Target']):
            # Compare whole tokens: a plain substring test would let
            # location 2 also match the labels 12, 20, 21, ...
            if location_token in str(target).split():
                with_location.append(picture_id)
            else:
                without_location.append(picture_id)
        self.save_pictures(with_location,
                           f'pictures_with_location_{self.location}')

        # The number of pictures labeled with the given location is needed to
        # get the same number of pictures not labeled with the location
        number_needed_pictures = len(with_location)
        if random_state is None:
            sampler = np.random
        else:
            sampler = np.random.RandomState(random_state)
        # Draw without replacement so no picture is selected twice; only when
        # there are fewer candidates than needed, duplicates are unavoidable
        # to keep both columns the same length.
        allow_duplicates = number_needed_pictures > len(without_location)
        sampled = sampler.choice(without_location, number_needed_pictures,
                                 replace=allow_duplicates)
        self.save_pictures(sampled,
                           f'pictures_without_location_{self.location}')

#### Test the class with mitochondria, which are labelled with location 23

In [69]:
# Select picture names for location 23 from the training labels: all pictures
# labeled with it plus the same number of pictures without it.
mitochondria_pictures = Location_in_Target(location = 23)
mitochondria_pictures.determine_pictures(labels_training)
# Show the first rows of the resulting table as a quick check
mitochondria_pictures.get_pictures().head()

Unnamed: 0,pictures_with_location_23,pictures_without_location_23
0,002ff91e-bbb8-11e8-b2ba-ac1f6b6435d0,d9045d20-bbb8-11e8-b2ba-ac1f6b6435d0
1,00344514-bbc2-11e8-b2bb-ac1f6b6435d0,87b77dd2-bba2-11e8-b2b9-ac1f6b6435d0
2,004efaa2-bba5-11e8-b2ba-ac1f6b6435d0,6978e83a-bbbe-11e8-b2ba-ac1f6b6435d0
3,00626a32-bbab-11e8-b2ba-ac1f6b6435d0,1b9a8c16-bbbb-11e8-b2ba-ac1f6b6435d0
4,0062719a-bbbe-11e8-b2ba-ac1f6b6435d0,0a6eb934-bbb7-11e8-b2ba-ac1f6b6435d0
