# Use Annotation Masks to Analyze Image Content

Author: Geneva Miller, Van Valen lab

The first part of this notebook allows users to apply segmentation masks to microscope images, extract information about the raw image, and save information about the raw image to a CSV file.

After that, users can visualize their data with the plots below.

For a given image, the number of cells will be recorded (each cell represented by a row in the CSV file). Information recorded about each cell will include: 
- filename of the image the cell was in
- size of cell (number of pixels it occupies)
- minimum intensity value within cell
- maximum intensity value within cell
- average intensity value of cell
- standard deviation of intensity values of cell

but more information can be added if needed.

"Cell 0" is the background of each image and should be excluded from plots. (This is also why cell 0 is likely to be much larger than the other cells.)

In [None]:
#imports
import re
import os
import stat
import sys
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go

from imageio import imread
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [None]:
#from deepcell_toolbox.utils.io_utils import get_img_names

def get_img_names(direc_name):
    """Return all .png or .tif image filenames in direc_name as sorted list

        Files are selected by their actual extension (.png, .tif or .tiff,
        compared case-insensitively), so a name that merely contains ".tif"
        or ".png" somewhere in the middle (e.g. "scan.tif.bak") is not
        picked up.

        Args:
            direc_name: Full path to directory containing images

        Returns:
            List of image file names (not full paths), ordered by
            sorted_nicely
    """
    image_extensions = (".tif", ".tiff", ".png")
    imgfiles = [
        name for name in os.listdir(direc_name)
        if os.path.splitext(name)[1].lower() in image_extensions
    ]
    return sorted_nicely(imgfiles)

def sorted_nicely(l):
    """Return the strings in l sorted so that runs of digits compare as numbers

        Args:
            l: Iterable of strings (for example file names)

        Returns:
            New list in which "img2" comes before "img10"
    """
    def _natural_key(name):
        # the capturing group makes re.split keep the digit runs, which are
        # then compared as integers instead of character by character
        return [int(piece) if piece.isdigit() else piece
                for piece in re.split('([0-9]+)', name)]

    return sorted(l, key=_natural_key)

### Set variables

In [None]:
#base_dir contains the image folders of interest (annotation_folder and every folder in raw_folders);
#analysis output is saved in its "CSV" subfolder
base_dir = "/gnv_home/data/analysis_test/set2"

#names (not full paths) of the raw image folders inside base_dir, one per channel
#if you had other channels for your data, you would list those folders here
#will add in option to extract channel info from a single image file later
raw_folders = ['raw_chopped_4_4']
#raw_folders = ["phase", "FITC", "mCherry", "Cy5"]

#name (not full path) of the folder inside base_dir that holds the annotation masks
annotation_folder ="annotations_renamed"

#identifier is something that is used in our other image processing pipelines
identifier = "analysis_example" #this will become part of the csv filename


#if you were going to run compile_flat_dicts_subfolders you would also need to include these variables:

#parts = 8
#num_x = 4
#num_y = 4
#parent_dir = "/gnv_home/example/movies"

### Functions to extract data from images
Basically all of these functions add information to dictionaries. Some of these functions flatten the dictionaries so they can be easily turned into dataframes (for plotting data in the notebook) and saved as .csv files; some of the functions are wrappers that will extract data from folders of images.


In [1]:
#function definitions

def find_cell_locations(annotation_path):
    '''Loads one annotated image and records where every distinct label value occurs.
    
        Args:
            annotation_path: Full path to annotated image
            
        Returns:
            A dictionary keyed by each unique value found in the annotation. Each entry is a
            dictionary with "location" (the index arrays returned by np.nonzero for the pixels
            holding that value) and "size" (how many pixels hold that value).
    '''

    mask = imread(annotation_path)
    cell_locations = {}
    for label in np.unique(mask):
        pixel_index = np.nonzero(mask == label)
        cell_locations[label] = {
            "location": pixel_index,
            # every index array has one entry per matching pixel
            "size": len(pixel_index[0]),
        }
    return cell_locations


def get_annotation_info(annotation_dir, img_num):
    '''Collects the file name, cell count and cell locations for one annotated image.
    
        Args:
            annotation_dir: Full path to directory containing annotated images
            img_num: Index of the image within the sorted list of image names in annotation_dir
            
        Returns:
            A dictionary with "file_name" (name of the annotation file), "cells" (output of
            find_cell_locations for that file) and "num_cells" (number of entries in "cells",
            which includes the background label if it is present).
    '''

    file_name = get_img_names(annotation_dir)[img_num]
    cells = find_cell_locations(os.path.join(annotation_dir, file_name))

    return {
        "file_name": file_name,
        "cells": cells,
        "num_cells": len(cells),
    }


def apply_cell_locations(raw_image_path, cell_locations):
    '''Reads one raw image and summarizes its pixel intensities under each annotation mask.
    
        Args:
            raw_image_path: Full path to an image to apply annotation masks to
            cell_locations: Dictionary containing pixel locations for each unique cell mask in an
            annotation (output of find_cell_locations)
            
        Returns:
            A dictionary with the same keys as cell_locations. Each entry holds "intensities"
            (the raw pixel values at that cell's locations) plus their "min", "max", "avg"
            and "stdev".
    '''

    raw_img = imread(raw_image_path)
    cell_intensity_info = {}

    for label, mask_info in cell_locations.items():
        # index the raw image with the stored locations to pull out this cell's pixels
        pixels = raw_img[mask_info["location"]]
        cell_intensity_info[label] = {
            "intensities": pixels,
            "min": np.min(pixels),
            "max": np.max(pixels),
            "avg": np.average(pixels),
            "stdev": np.std(pixels),
        }

    return cell_intensity_info


def get_channel_info(channel_dir, img_num, cell_locations):
    '''Picks one raw image from a channel folder and applies the annotation masks to it.
    
        Args:
            channel_dir: Full path to directory containing raw images from one channel
            img_num: Index of the image within the sorted list of image names in channel_dir
            cell_locations: Dictionary containing pixel locations for each unique cell mask in an
            annotation (output of find_cell_locations)
            
        Returns:
            A dictionary with "file_name" (name of the raw image used) and "cells" (output of
            apply_cell_locations for that image).
    '''

    file_name = get_img_names(channel_dir)[img_num]
    image_path = os.path.join(channel_dir, file_name)

    return {
        "file_name": file_name,
        "cells": apply_cell_locations(image_path, cell_locations),
    }


def get_all_info_folders(base_dir, annotation_folder, raw_folders, split = True):
    """Collects annotation mask data and per-channel intensity data for every image in a set
    of folders (only one annotation folder is supported).

        Args:
            base_dir: Full path to directory that contains the subfolders for annotations and raw
            images. This directory is also used in other functions to save data in a "CSV" subfolder.
            annotation_folder: Name of the folder that contains annotated images (not the full path).
            Usually "annotated" or "annotations".
            raw_folders: List of names of folders that contain unprocessed images (not full paths).
            Usually ["raw"] if there is only one channel of information, but may be a list of
            different channels to apply masks to, ie ["FITC", "TRITC"] or ["C1", "C2"]. Masks will
            be applied to all channels listed. Raw folders and annotation folder should all contain
            the same number of images, because images are paired by their position in each
            folder's sorted file list.
            split: Bool indicating if channel information should be extracted from different images,
            or from different channels in a multichannel image. Not yet supported; the value is
            currently ignored.
            
        Returns:
            A dictionary (full_dict) where each key is the image number that annotated and raw image
            data were extracted from, and each value is a dictionary with "annotations" (output of
            get_annotation_info) and "channels" (one get_channel_info result per raw folder name).
    """

    annotation_dir = os.path.join(base_dir, annotation_folder)
    # the number of annotation images decides how many image numbers are processed
    num_images = len(get_img_names(annotation_dir))

    full_dict = {}
    for img_num in range(num_images):
        annotation_info = get_annotation_info(annotation_dir, img_num)
        cell_locations = annotation_info["cells"]

        # apply the masks of this annotation to the matching image of every channel
        channel_info = {}
        for channel in raw_folders:
            channel_info[channel] = get_channel_info(
                os.path.join(base_dir, channel), img_num, cell_locations)

        full_dict[img_num] = {
            "annotations": annotation_info,
            "channels": channel_info,
        }

    return full_dict
        
    
def make_flat_cell_info(full_dict):
    '''Reshapes the nested output of get_all_info_folders into one flat record per cell, keeping
    only the summary values so the result converts directly into a pandas dataframe.
    
        Args:
            full_dict: Output of get_all_info_folders
            
        Returns:
            A dictionary keyed by "img_<image number>_cell_num_<cell number>" (both numbers
            zero-padded to at least three characters). Each value is a dictionary with
            "annotation_name", "size" and, for every channel, "<channel>_name", "<channel>_avg",
            "<channel>_stdev", "<channel>_min" and "<channel>_max". Pixel locations and raw
            intensities are not carried over.
    '''

    cell_info = {}

    for img_num, img_entry in full_dict.items():
        annotations = img_entry['annotations']

        for cell_num, mask_info in annotations["cells"].items():
            row = {
                "annotation_name": annotations["file_name"],
                "size": mask_info['size'],
            }

            for channel, channel_entry in img_entry["channels"].items():
                stats = channel_entry["cells"][cell_num]
                row[channel + "_name"] = channel_entry["file_name"]
                row[channel + "_avg"] = stats["avg"]
                row[channel + "_stdev"] = stats["stdev"]
                row[channel + "_min"] = stats["min"]
                row[channel + "_max"] = stats["max"]

            # image number plus cell number identifies the row uniquely within one folder set
            cell_id = "img_" + str(img_num).zfill(3) + "_cell_num_" + str(cell_num).zfill(3)
            cell_info[cell_id] = row

    return cell_info
        

def compile_flat_dicts_subfolders(parent_dir, annotation_folder, raw_folders, parts, num_x, num_y):
    """Walks the "part<N>/x_<XX>_y_<YY>" subfolders of parent_dir, extracts the image
    information of each one and stores it, flattened, under the subfolder's id. Used for analyzing
    images chopped out of montages (generated during Post-annotation for 3D data).

        Args:
            parent_dir: Full path to directory that contains the subfolders to loop over (different
            from base_dir used in other functions), usually "movies".
            annotation_folder: Name of the folder that contains annotated images (not the full path).
            Usually "annotated" or "annotations".
            raw_folders: List of names of folders that contain unprocessed images (not full paths).
            Usually ["raw"] if there is only one channel of information, but may be a list of
            different channels to apply masks to, ie ["FITC", "TRITC"] or ["C1", "C2"]. Masks will
            be applied to all channels listed. Raw folders and annotation folder should all contain
            the same number of images.
            parts: How many "part" subfolders to extract data from. (5 parts will loop over
            part0-part4.)
            num_x: How many columns the original image was chopped into to make montages.
            num_y: How many rows the original image was chopped into to make montages.
            
        Returns:
            A dictionary where each key is "part<N>_x_<XX>_y_<YY>" for the subfolder that image
            data was extracted from, and each value is the make_flat_cell_info result for that
            subfolder.
    """

    many_dicts = {}

    for part in range(parts):
        part_dir = os.path.join(parent_dir, "part{0}".format(part))

        for pos_x in range(num_x):
            for pos_y in range(num_y):
                sub_dir = os.path.join(part_dir, "x_{0:02d}_y_{1:02d}".format(pos_x, pos_y))
                sub_dir_id = "part{0}_x_{1:02d}_y_{2:02d}".format(part, pos_x, pos_y)

                # extract everything for this subfolder, then keep only the flat summary
                nested_info = get_all_info_folders(sub_dir, annotation_folder, raw_folders)
                many_dicts[sub_dir_id] = make_flat_cell_info(nested_info)

    return many_dicts
                
                
def flatten_folder_dict(many_dicts):
    """Merges the per-subfolder dictionaries from compile_flat_dicts_subfolders into a single
    flat dictionary that pandas can turn into a dataframe.

        Args:
            many_dicts: Dictionary generated by compile_flat_dicts_subfolders
            (dictionary where each key corresponds to one subfolder of data, and each value is a
            dictionary of dictionaries containing the information extracted from that subfolder)
            
        Returns:
            A dictionary where each key is "<subfolder key>_<cell id>" and each value is the
            (unchanged, not copied) dictionary of cell-specific information.
    """

    # prefixing the cell id with the subfolder key keeps ids unique across subfolders
    return {
        folder_num + "_" + cell_id: cell_data
        for folder_num, folder_cells in many_dicts.items()
        for cell_id, cell_data in folder_cells.items()
    }


def no_background_dict(flat_dict):
    '''Returns a copy of a flat dictionary without the background ("cell 0") entries.
    
        Args:
            flat_dict: Flat dictionary with unique cell identifier in each row. Can be output of
            make_flat_cell_info or flatten_folder_dict.

        Returns:
            New flat dictionary holding every entry whose key does not contain "num_000"
            (the marker the cell ids carry for cell 0). The input dictionary is not modified.
    '''

    return {
        cell_id: cell_data
        for cell_id, cell_data in flat_dict.items()
        if "num_000" not in cell_id
    }


def make_save_df(flat_dict, base_dir, identifier, include_background):
    '''Creates a dataframe from a flattened dictionary and saves as a .csv file for future use.
    
        Args:
            flat_dict: Flat dictionary with unique cell identifier in each row. Can be output of
            make_flat_cell_info or flatten_folder_dict.
            base_dir: Full path to directory that contains a "CSV" subfolder to save .csv file in. "CSV"
            subfolder will be created in this directory if it does not exist.
            identifier: User-decided string intended for organization of datasets; should correspond to
            identifier in crowd annotation pipelines, if used. Becomes part of .csv file name.
            include_background: Bool indicating whether information about the background should be included
            in the dataframe. If True the file is named
            "<identifier>_analysis_includes_background_info.csv"; if False the background rows are
            dropped with no_background_dict and the file is named
            "<identifier>_analysis_no_background_info.csv".
            
        Returns:
            A dataframe containing the information from flat_dict (one row per key), which is
            also what gets written to the .csv file.
    '''

    csv_dir = os.path.join(base_dir, "CSV")
    # exist_ok avoids the check-then-create race of testing isdir() first
    os.makedirs(csv_dir, exist_ok=True)
    # TODO: add folder permissions change

    if include_background:
        csv_name = identifier + "_analysis_includes_background_info.csv"
    else:
        flat_dict = no_background_dict(flat_dict)
        csv_name = identifier + "_analysis_no_background_info.csv"

    csv_path = os.path.join(csv_dir, csv_name)
    df = pd.DataFrame.from_dict(flat_dict, "index")
    # keep the index: it holds the unique cell ids
    df.to_csv(csv_path, index=True)

    return df


## Extract data from images and save csv

In [None]:
#read every annotation image in base_dir/annotation_folder and apply its masks to the matching
#image of each folder in raw_folders (split is accepted but not used yet)
full_dict = get_all_info_folders(base_dir, annotation_folder, raw_folders, split = True)

#flatten the dictionary and don't keep some of the information (all the pixel locations and intensities)
flat_dict = make_flat_cell_info(full_dict)

In [None]:
#optional, display it as dataframe before saving
#(one row per cell id; the columns are the fields stored for each cell in flat_dict)
df = pd.DataFrame.from_dict(flat_dict, "index")
df

In [None]:
#save .csv file, return df for you to look at

#set to True to keep the "cell 0" (background) rows in the saved .csv and in the returned dataframe
include_background = False
df = make_save_df(flat_dict, base_dir, identifier, include_background)

## Visualize data
### Plot information from columns of the dataframe
This notebook uses plotly because it makes cool, interactive graphs (and is supposed to be a lot more intuitive to use than matplotlib). Some samples of traces that I've used are provided. Since some column names in the dataframe depend on the channel names provided, it is recommended not to hard code in the column names when making new traces.

### From dictionary

In [None]:
#plot from the dataframe that is already in memory (no .csv file is read)
csv_data = df

### From CSV

In [None]:
#full path of a .csv file saved earlier; edit this to point at the file you want to plot
csv_path = "/gnv_home/data/analysis_test/set1/CSV/analysis_notebook_test_analysis.csv"
#make_save_df writes the cell ids as the first (index) column, so read that column back as the
#index instead of letting it show up as an extra "Unnamed: 0" data column
csv_data = pd.read_csv(csv_path, index_col=0)
csv_data

In [None]:
#name of the first raw channel folder; used below to build the channel-specific column names
raw0= raw_folders[0]

In [None]:
#scatter: cell size (x) against average intensity in the raw0 channel (y)
brightness_size_scatter = {'x' : csv_data['size'], 'y' : csv_data['{0}_avg'.format(raw0)], 'mode' : "markers"}

#scatter: cell size (x) against the ratio of average to maximum intensity in the raw0 channel (y)
ratio_scatter = {'x' : csv_data['size'], 'y' : csv_data['{0}_avg'.format(raw0)]/csv_data['{0}_max'.format(raw0)], 'mode' : 'markers'}

#histogram of cell sizes, bin width 50
size_hist = {'type': 'histogram', 'x' : csv_data['size'], 'xbins': {'size' :50}}

#histogram of average intensity in the raw0 channel, bin width 10
intensity_hist = {'type': 'histogram', 'x' : csv_data['{0}_avg'.format(raw0)], 'xbins': {'size' :10}}

In [None]:
#draw the listed trace(s) in the notebook; swap in or add any of the traces defined above
plotly.offline.iplot([
    intensity_hist
])