## User input

In [1]:
path_to_image_files = '../data/train/'

image_file_name = 'ffc30612-bbbf-11e8-b2bb-ac1f6b6435d0' # without e.g. '_green.png'

model_dir = '../models/'

In [None]:
!! to do: take in path to image files, drop endings and get all unique filenames, loop through them



## Load packages and functions

In [7]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

class predict_protein_location(object):
    ''' Class to predict the location(s) of proteins in a given image set (3 colors)
        ...
        Attributes:
        ----------
        path: str 
            path to folder containing image files (ends with '/')
        filename: str
            the base of the image filenames, i.e., without e.g. '_green.png'
        model_dir: str
            path to the folder containing the pre-trained binary models (ends with '/')

        Methods:
        ----------
        read_and_combine_files():
            read and combine image files and return 3 layer array as attribute
        get_embeddings()
            run prediction with pre-trained model and return embeddings as attribute
        get_location_names()
            add the protein location names as an attribute
        load_binary_models(labels)  
            load the binary models and return a list of them as attribute
        predict_w_binary_models()
            return binary model predictions as an attribute
    '''

    def __init__(self, path, filename, model_dir): 
        '''Parameters
           ----------
           location: int 
                number which define a location in a human cell
        '''
        from efficientnet.tfkeras import EfficientNetB0
        #import tensorflow as tf

        self.path_filename = path + filename
        self.image_colors_to_include = ['_red.png', '_blue.png', '_green.png'] # colors to be used in analysis
        self.model_dir = model_dir+'*'
        self.embedding_model = EfficientNetB0(weights='imagenet', include_top=False, pooling="avg")
        #self.embedding_model = tf.keras.applications.EfficientNetB0(weights='imagenet', include_top=False, pooling="avg")

        self.read_and_combine_images()
        self.get_embeddings()
        self.get_location_names()
        self.load_binary_models()
        self.predict_w_binary_models()

    def read_and_combine_images(self):
        """reading in a png image file (all 3 colors) and combine into one array

        Args:
            path_to_image_files (string): path to location of images
            image_file_name (string): filename of the images, without the trailing e.g. "_green.png"
            image_colors_to_include(list): list of ends of filenames e.g. ['_red.png', '_blue.png', '_green.png'] 

        Returns:
            array: array containing all 3 images as different layers
        """
        # 
        import tensorflow as tf
        try:
            im_array = tf.concat([tf.io.decode_png(tf.io.read_file(self.path_filename+end_str)) 
                                    for end_str in self.image_colors_to_include], axis=2)        
            im_array = tf.image.convert_image_dtype(im_array, tf.float32)
        except FileNotFoundError:
            print("Couldn't read file  {}".format(path_to_image_files+image_file_name+end_str))
            #continue
        if len(im_array.shape) < 4: im_array = tf.expand_dims(im_array, axis=0)
        self.im_array = im_array

    def get_embeddings(self):
        """run prediction with pre-trained model (EfficientNetB0) to get embeddings
        """
        self.embeddings = self.embedding_model.predict(self.im_array, verbose=1)

    def get_location_names(self):
        """create a list of the name and number of each protein location
        Args:
        Returns:
            label_names: a dictionary of protein location numbers and names
        """
        # courtesy of https://www.kaggle.com/code/allunia/protein-atlas-exploration-and-baseline
        self.label_names = {
            0:  "Nucleoplasm",  
            1:  "Nuclear membrane",   
            2:  "Nucleoli",   
            3:  "Nucleoli fibrillar center",   
            4:  "Nuclear speckles",
            5:  "Nuclear bodies",   
            6:  "Endoplasmic reticulum",   
            7:  "Golgi apparatus",   
            8:  "Peroxisomes",   
            9:  "Endosomes",   
            10:  "Lysosomes",   
            11:  "Intermediate filaments",   
            12:  "Actin filaments",   
            13:  "Focal adhesion sites",   
            14:  "Microtubules",   
            15:  "Microtubule ends",   
            16:  "Cytokinetic bridge",   
            17:  "Mitotic spindle",   
            18:  "Microtubule organizing center",   
            19:  "Centrosome",   
            20:  "Lipid droplets",   
            21:  "Plasma membrane",   
            22:  "Cell junctions",   
            23:  "Mitochondria",   
            # 24:  "Aggresome",   
            # 25:  "Cytosol",   
            # 26:  "Cytoplasmic bodies",   
            # 27:  "Rods & rings"
        }   

    def load_binary_models(self):
        """loading the pre-trained binary model for each protein location, and its
        corresponding location number
        """
        # 
        import joblib, glob
        self.models = [joblib.load(f)[-1] for f in glob.glob(self.model_dir)] 
        # get the corresponding location number of each model
        self.model_location = [f.split('_')[-1] for f in glob.glob(self.model_dir)] 

    def predict_w_binary_models(self):
        """predicting the protein location of an image, using embeddings, and return all locations
        with a probability > 0.99
        Args:
        Returns:
            predictions_pd: a dataframe containing the protein location name and number and the probability that a protein
            is in that location
            predicted_locations: list of all predicted protein locations for this image (prob > 0.99)
        """
        import pandas as pd
        # create a dataframe containing location name, number and probability that it is in the image
        self.predictions_pd =  pd.DataFrame({'protein_location': [int(i) for i in self.model_location],
                                            'prediction_probability': [f.predict_proba(self.embeddings)[0][1] for f in self.models] })
        self.predictions_pd = self.predictions_pd.sort_values('protein_location').reset_index(drop=True) # sort to match location names
        self.predictions_pd.insert(0, 'protein_location_names', self.label_names.values()) # add location name as first column
        # get the predicted locations
        self.predicted_locations = self.predictions_pd.query('prediction_probability > 0.99').protein_location.ravel() 
    



## Run it

In [8]:
test = predict_protein_location(path_to_image_files, image_file_name, model_dir)
test.predictions_pd

2022-07-13 17:30:59.739410: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




Unnamed: 0,protein_location_names,protein_location,prediction_probability
0,Nucleoplasm,0,0.05503787
1,Nuclear membrane,1,0.9999762
2,Nucleoli,3,1.0
3,Nucleoli fibrillar center,4,1.0
4,Nuclear speckles,5,9.360457e-05
5,Nuclear bodies,6,0.9999991
6,Endoplasmic reticulum,7,0.0008580441
7,Golgi apparatus,8,1.302969e-09
8,Peroxisomes,9,1.0
9,Endosomes,11,1.0
