# Annotate merged single cells with metadata from platemap file for each plate

## Import libraries

In [1]:
import sys
import pathlib
import os
import yaml
import json

import pandas as pd
from pycytominer import annotate
from pycytominer.cyto_utils import output

sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# directory where the annotated parquet files are written
output_dir = pathlib.Path("./data/annotated_data")
# create the directory (and any missing parents); no error is raised if it already exists
output_dir.mkdir(parents=True, exist_ok=True)

# load the plate info dictionary from its yaml file
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path) as yaml_file:
    plate_info_dictionary = yaml.load(yaml_file, Loader=yaml.FullLoader)

# platemap file used to annotate each plate
# (plates 3 and 3 prime share the same platemap file, i.e. the same metadata)
platemap_paths = {
    "Plate_1": "../0.download_data/metadata/platemap_NF1_plate1.csv",
    "Plate_2": "../0.download_data/metadata/platemap_NF1_plate2.csv",
    "Plate_3": "../0.download_data/metadata/platemap_NF1_plate3.csv",
    "Plate_3_prime": "../0.download_data/metadata/platemap_NF1_plate3.csv",
}

# record each platemap path in the dictionary so the annotation step can find it
for plate_name, platemap_path in platemap_paths.items():
    plate_info_dictionary[plate_name]["platemap_path"] = str(
        pathlib.Path(platemap_path)
    )

# print the dictionary to check that all info was added correctly
print(json.dumps(plate_info_dictionary, indent=4))

{
    "Plate_1": {
        "annotated_path": "data/annotated_data/Plate_1_sc.parquet",
        "dest_path": "data/converted_data/Plate_1.parquet",
        "normalized_path": "data/normalized_data/Plate_1_sc_norm.parquet",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate1.csv",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite"
    },
    "Plate_2": {
        "annotated_path": "data/annotated_data/Plate_2_sc.parquet",
        "dest_path": "data/converted_data/Plate_2.parquet",
        "normalized_path": "data/normalized_data/Plate_2_sc_norm.parquet",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate2.csv",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite"
    },
    "Plate_3": {
        "annotated_path": "data/annotated_data/Plate_3_sc.parquet",
        "dest_path": "data/converted_data/Plate_3.parquet",
        "normalized_path": "data/normalized_data/Plate_3_sc_norm.parquet"

## Annotate merged single cells

In [3]:
for plate, info in plate_info_dictionary.items():
    # load the converted single cell profiles and the platemap for this plate
    single_cell_df = pd.read_parquet(info["dest_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    # destination of the annotated profiles; also recorded in the dictionary
    # so downstream steps can locate the file
    output_file = str(output_dir / f"{plate}_sc.parquet")
    info["annotated_path"] = output_file
    print(f"Adding annotations to merged single cells for {plate}!")

    # join the platemap metadata onto the extracted single cell features
    annotated_df = annotate(
        profiles=single_cell_df,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
    )

    # move the well and single cell count columns to positions 1 and 2 of the
    # dataframe (for easy visualization in python): pop both columns first,
    # then re-insert them in order
    front_columns = {
        column_name: annotated_df.pop(column_name)
        for column_name in ("Metadata_Well", "Metadata_number_of_singlecells")
    }
    for position, (column_name, column_values) in enumerate(
        front_columns.items(), start=1
    ):
        annotated_df.insert(position, column_name, column_values)

    # write the annotated dataframe out as a parquet file
    output(
        df=annotated_df,
        output_filename=output_file,
        output_type="parquet",
    )
    print(f"Annotations have been added to {plate} and saved!")

Adding annotations to merged single cells for Plate_1!
Annotations have been added to Plate_1 and saved!
Adding annotations to merged single cells for Plate_2!
Annotations have been added to Plate_2 and saved!
Adding annotations to merged single cells for Plate_3!
Annotations have been added to Plate_3 and saved!
Adding annotations to merged single cells for Plate_3_prime!
Annotations have been added to Plate_3_prime and saved!


In [4]:
# sanity check: `annotated_df` still holds the dataframe from the final iteration
# of the annotation loop, so print its shape and preview its first rows to confirm
# that the metadata columns were added
print(annotated_df.shape)
annotated_df.head()

(14495, 1597)


Unnamed: 0,Metadata_WellRow,Metadata_Well,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_ImageNumber,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,B,B1,42,1,NF1,WT,500,78,1,1,...,2278.660622,2216.518209,478.722598,449.485799,464.175505,475.434688,74.009363,72.723123,76.103825,76.127622
1,B,B1,42,1,NF1,WT,500,81,2,2,...,720.731162,722.051788,121.54087,129.189742,124.312256,123.659245,194.556687,194.525087,188.947644,189.180229
2,B,B1,42,1,NF1,WT,500,82,1,1,...,2464.118189,2568.315137,591.215759,470.246357,453.591037,455.722645,161.361597,148.654973,145.224151,148.543595
3,B,B1,42,1,NF1,WT,500,82,2,2,...,2886.052228,3117.874708,572.970287,539.569923,537.033409,476.209479,162.939002,160.198123,164.586236,155.469083
4,B,B1,42,1,NF1,WT,500,83,1,1,...,725.579676,707.478879,305.197153,305.705155,294.205789,301.477588,97.254067,95.867598,95.624585,95.350038


## Write updated dictionary to yaml file for use in downstream steps

In [5]:
# overwrite the yaml file with the updated dictionary, which now also holds the
# platemap and annotated file paths set earlier in this notebook
with dictionary_path.open("w") as yaml_file:
    yaml.dump(plate_info_dictionary, yaml_file)