# Load and Process Single Cell Data

This notebook will load, normalize, and perform feature selection on a single plate of data from the Resistance Mechanisms project. The data are single cell morphology profiles.

These data represent the first data type input into the technical artifacts experiment.

In [1]:
import sys
import pathlib
import sqlite3
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import output

In [2]:
# Load functions modified from:
# https://github.com/broadinstitute/profiling-resistance-mechanisms/blob/5f0ab0035705836af0438e84dd7c336fc566a015/4.single-cell/utils/single_cell_utils.py
def load_compartment(compartment, connection):
    """Read an entire compartment table into a DataFrame.

    Parameters
    ----------
    compartment : str
        Name of the table to read; it is interpolated directly into the SQL
        text, so it must be a trusted table name.
    connection
        Open database connection handed to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        Every row and column of the requested table.
    """
    return pd.read_sql_query(f"select * from {compartment}", connection)


def prefilter_features(df, flags):
    """Collect the column names that contain any of the given substrings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are scanned.
    flags : iterable of str
        Substrings to search for inside each column name.

    Returns
    -------
    list of str
        Unique matching column names, in no particular order.
    """
    flagged = {
        column
        for flag in flags
        for column in df.columns
        if flag in column
    }
    return list(flagged)

In [3]:
# Identify the project, batch, and single plate processed by this notebook
project_tag = "2018_05_30_ResistanceMechanisms_Kapoor"
batch = "2019_02_15_Batch1_20X"
plate = "HCT116bortezomib"

# Root of the project workspace; every input path below is derived from it
workspace_dir = f"/home/ubuntu/bucket/projects/{project_tag}/workspace/"

# Per-plate SQLite backend file that is queried for the single cell measurements
sqlite_file = pathlib.Path(
    f"{workspace_dir}/backend/{batch}/{plate}/{plate}.sqlite"
)

In [4]:
# Thresholds passed to feature_select later in the notebook.
# NOTE(review): no correlation-based operation is listed in feature_select_opts
# below, so corr_threshold may have no effect - confirm against pycytominer.
corr_threshold = 0.8
na_cutoff = 0

# Feature selection operations requested from pycytominer
feature_select_opts = [
    "variance_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers",
]

In [5]:
# Batch-specific folders inside the project workspace
batch_dir = pathlib.Path(workspace_dir) / "backend" / batch
metadata_dir = pathlib.Path(workspace_dir) / "metadata" / batch

# Table relating each assay plate barcode to the name of its plate map
barcode_plate_map_file = metadata_dir / "barcode_platemap.csv"
barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)

barcode_plate_map_df

Unnamed: 0,Assay_Plate_Barcode,Plate_Map_Name,Batch_Number
0,HCT116bortezomib,PlateMap_HCT116bortezomib,1


In [6]:
# Find the plate map name registered for the plate being processed
plate_rows = barcode_plate_map_df.query("Assay_Plate_Barcode == @plate")
plate_map_name = plate_rows.Plate_Map_Name.values[0]

# Read the tab-separated plate map for that plate
plate_map_file = metadata_dir / "platemap" / f"{plate_map_name}.txt"
plate_map_df = pd.read_csv(plate_map_file, sep="\t")

# Prefix every column with "Metadata_" unless it already carries the prefix
plate_map_df.columns = [
    column if column.startswith("Metadata_") else f"Metadata_{column}"
    for column in plate_map_df.columns
]
plate_map_df.head()

Unnamed: 0,Metadata_plate_map_name,Metadata_well_position,Metadata_CellLine,Metadata_Dosage
0,PlateMap_HCT116bortezomib,B03,WT,0.0
1,PlateMap_HCT116bortezomib,B04,WT,0.0
2,PlateMap_HCT116bortezomib,B05,WT,0.0
3,PlateMap_HCT116bortezomib,B06,CloneA,0.0
4,PlateMap_HCT116bortezomib,B07,CloneA,0.0


## Set up connection to the SQLite file

In [7]:
# sqlite3.connect() creates a new, empty database when the target file does not
# exist, which would only surface later as a confusing "no such table" error.
# Fail early with an explicit message instead.
if not sqlite_file.is_file():
    raise FileNotFoundError(f"SQLite backend file not found: {sqlite_file}")

# Connection shared by the image and compartment queries in the cells below
conn = sqlite3.connect(sqlite_file)

In [8]:
# Only the identifier columns of the image table are needed to link single
# cells to wells (plain string: there is nothing to interpolate here)
image_cols = "TableNumber, ImageNumber, Metadata_Plate, Metadata_Well"
image_query = f"select {image_cols} from image"

# Annotate every image with the plate map entry of its well. Several images
# share one well, but each well is expected to appear only once in the plate
# map; validate="many_to_one" raises a MergeError instead of silently
# duplicating image rows if that expectation is violated.
image_df = (
    pd.read_sql_query(image_query, conn)
    .merge(
        plate_map_df,
        left_on="Metadata_Well",
        right_on="Metadata_well_position",
        validate="many_to_one",
    )
    # The well is already stored in Metadata_Well, so drop the duplicate key
    .drop(["Metadata_well_position"], axis="columns")
)

print(image_df.shape)
image_df.head()

(324, 7)


Unnamed: 0,TableNumber,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage
0,129720723511626657887512435674571781382,1,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0
1,116132684245721901228619120141333231082,37,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0
2,286745203734055605567188434419624864775,73,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0
3,246692378387698541245451908220284470699,109,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0
4,264273443973354264124525475035660684704,145,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0


## Load compartment data

In [9]:
# Read the three compartment tables, in this order, from the open connection
cells_df, cytoplasm_df, nuclei_df = (
    load_compartment(compartment, conn)
    for compartment in ("cells", "cytoplasm", "nuclei")
)

## Merge compartment data

In [10]:
# Columns that identify the image an object was measured in
image_keys = ["TableNumber", "ImageNumber"]

# Link each cell to its cytoplasm through the cytoplasm's parent-cell number,
# then link the result to its nucleus through the cytoplasm's parent-nucleus
# number. Inner joins keep only objects present in all three tables.
merged_df = cells_df.merge(
    cytoplasm_df,
    how="inner",
    left_on=image_keys + ["ObjectNumber"],
    right_on=image_keys + ["Cytoplasm_Parent_Cells"],
).merge(
    nuclei_df,
    how="inner",
    left_on=image_keys + ["Cytoplasm_Parent_Nuclei"],
    right_on=image_keys + ["ObjectNumber"],
)

## Filter features

In [11]:
# Any column whose name contains one of these fragments is removed
feature_filter_flags = ["Object", "Location", "Count", "Parent"]
drop_features = prefilter_features(merged_df, feature_filter_flags)

# Remove the flagged columns and attach the image information in one step.
# The right join keeps every single cell row, whether or not a matching image
# record exists.
merged_df = image_df.merge(
    merged_df.drop(drop_features, axis="columns"),
    on=["TableNumber", "ImageNumber"],
    how="right",
)

print(merged_df.shape)
merged_df.head()

(383098, 3422)


Unnamed: 0,TableNumber,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,129720723511626657887512435674571781382,1,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,1168,1830.0,0.0,...,0.872203,1.009521,0.0,0.0,0.0,0.0,0.901371,0.850106,0.876822,0.999048
1,129720723511626657887512435674571781382,1,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,2135,337.0,26.0,...,2.506542,2.802913,3.372126,2.438606,3.056469,4.551637,2.676828,2.437096,2.568605,2.596497
2,129720723511626657887512435674571781382,1,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,2624,1220.0,0.0,...,5.430472,6.379557,2.799086,6.799375,7.888809,8.463018,5.614301,4.88388,4.74402,4.900678
3,129720723511626657887512435674571781382,1,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,316,1813.0,27.0,...,1.555363,1.756173,0.0,0.0,0.0,0.0,0.844085,1.147251,1.115915,1.089446
4,129720723511626657887512435674571781382,1,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,369,807.0,26.0,...,1.29654,1.384634,0.0,0.0,0.0,0.0,1.568844,1.581597,1.515306,1.400407


## Apply normalization, feature selection, and output data

In [12]:
# Normalize the merged single cell profiles with pycytominer.
# Feature and metadata columns are inferred by pycytominer rather than listed
# here, and all samples are used as the reference for the "standardize" method.
normalized_df = normalize(
    merged_df,
    method="standardize",
    samples="all",
    features="infer",
    meta_features="infer",
)

In [13]:
# Apply the configured feature selection operations to the normalized profiles.
# output_file="none" is passed so the result comes back as a DataFrame
# (presumably instead of being written to disk - confirm against pycytominer).
feature_select_df = feature_select(
    normalized_df,
    operation=feature_select_opts,
    features="infer",
    corr_threshold=corr_threshold,
    na_cutoff=na_cutoff,
    output_file="none",
)

print(feature_select_df.shape)
feature_select_df.head()

(383098, 691)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_Orientation,Cells_AreaShape_Solidity,...,Nuclei_Texture_SumEntropy_ER_5_00,Nuclei_Texture_SumEntropy_ER_5_01,Nuclei_Texture_SumEntropy_ER_5_02,Nuclei_Texture_SumEntropy_ER_5_03,Nuclei_Texture_SumEntropy_RNA_10_01,Nuclei_Texture_SumEntropy_RNA_10_03,Nuclei_Texture_SumEntropy_RNA_5_00,Nuclei_Texture_SumEntropy_RNA_5_01,Nuclei_Texture_SumEntropy_RNA_5_02,Nuclei_Texture_SumEntropy_RNA_5_03
0,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,1.007319,0.010347,-0.089296,0.207129,0.659845,...,-2.498233,-2.679183,-2.477797,-2.299643,-3.784741,-2.563536,-2.977224,-2.818551,-2.606318,-2.714702
1,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,-0.827385,1.5007,1.702837,-0.772167,1.529059,...,-0.346323,-0.211698,-0.132987,-0.470213,-1.335958,-1.314276,-1.62362,-1.607276,-1.498935,-1.580054
2,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,1.150905,2.239621,0.127975,-0.022297,0.579816,...,-0.494985,-0.527462,-0.812416,-0.817195,-0.912815,-0.658055,-0.937983,-1.024063,-1.10872,-1.132349
3,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,-1.631927,0.867117,2.104695,-0.182995,1.051667,...,-1.759445,-2.319458,-2.195132,-1.828297,-3.064756,-2.607123,-2.504772,-2.482257,-2.463292,-2.452835
4,HCT116bortezomib,B03,PlateMap_HCT116bortezomib,WT,0.0,-0.041428,1.519934,2.893819,-0.200788,1.282411,...,-1.357478,-1.440776,-1.479136,-1.0436,-2.924492,-1.666507,-2.08554,-1.907581,-2.386904,-2.445377


In [14]:
# Destination for the processed single cell profiles of this plate
output_filename = pathlib.Path(
    f"data/{batch}/{plate}_singlecell_normalized_feature_select.csv.gz"
)

# The batch-specific output folder may not exist yet; create it so the write
# below does not fail on a fresh checkout
output_filename.parent.mkdir(parents=True, exist_ok=True)

# Write the normalized AND feature selected profiles, as the file name states.
# The previous version wrote the normalized-only frame and left
# feature_select_df unused.
output(feature_select_df, output_filename, compression="gzip", float_format="%.5g")