# Plate Normalization Notebook

## Set Directory

In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

## Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

from ast import literal_eval
from scipy.spatial.distance import cosine
from scipy.stats.stats import pearsonr   

import utils

## Configs

In [3]:
# Experiment and plate selected for this run.
EXPERIMENT = 'HRCE-1'
PLATE_NUMBER = 20
# Name of the plate folder, e.g. 'Plate20'.
PLATE = f'Plate{PLATE_NUMBER}'
# When True, the normalized profiles are written to CSV by the save cells.
SAVE_NORMALIZED_PROFILE = True

## Data Paths

In [4]:
# Root folder of the shared input tables.
DATA_PATH = '../data'

# Per-plate feature folder: ../features/<EXPERIMENT>/<PLATE>.
PLATE_PATH = os.path.join('..', 'features', EXPERIMENT, PLATE)
# Input: site-level profile read by the "Load Dataset" cell.
PROFILE_PATH = os.path.join(PLATE_PATH, 'profile_median_site.csv')
# Outputs: written by the save cells when SAVE_NORMALIZED_PROFILE is True.
WELL_NORMALIZED_PROFILE = os.path.join(PLATE_PATH, 'normalized_well_profile.csv')
SITE_NORMALIZED_PROFILE = os.path.join(PLATE_PATH, 'normalized_site_profile.csv')

METADATA_PATH = os.path.join(DATA_PATH, 'metadata.csv')
# NOTE(review): the two paths below are not referenced by any cell visible in this notebook.
EMBEDDING_FEATURES_PATH = os.path.join(DATA_PATH, 'embeddings', 'embeddings.csv')
MOA_PATH = os.path.join(DATA_PATH, 'MOA.csv')

## Normalization

### Load Dataset

In [5]:
# Full metadata table; it is filtered down to one experiment/plate by get_well_metadata.
metadata = pd.read_csv(METADATA_PATH)

In [6]:
# Site-level profile for the selected plate; the first CSV column is used as the index.
profile = pd.read_csv(PROFILE_PATH, index_col=0)
profile.shape

(5356, 3915)

### Prepare Metadata

In [7]:
def label(disease_condition):
    """Map a disease-condition value to a control label.

    Returns 'negctrl' when the value equals 0 and 'posctrl' for any other value.
    """
    return 'negctrl' if disease_condition == 0 else 'posctrl'

def get_well_metadata(metadata, experiment, plate):
    """Select one plate's metadata and annotate it with group ids and labels.

    Args:
        metadata: DataFrame with at least the columns 'experiment', 'plate',
            'treatment', 'treatment_conc', 'well_id', 'site_id' and
            'disease_condition'.
        experiment: value matched against the 'experiment' column.
        plate: value matched against the 'plate' column.

    Returns:
        DataFrame restricted to the matching rows and the five kept columns,
        with missing values replaced by 0, plus:
          * 'groupid' - group number of each distinct
            (treatment, treatment_conc, disease_condition) combination;
          * 'label'   - result of ``label(disease_condition)`` for each row.
    """
    kept_columns = ['treatment', 'treatment_conc' , 'well_id', 'site_id','disease_condition']
    group_keys = ['treatment', 'treatment_conc', 'disease_condition']

    on_plate = (metadata.experiment == experiment) & (metadata.plate == plate)
    # Missing values are filled before grouping
    # (presumably so rows with a missing key still get a group id - confirm).
    plate_metadata = metadata[on_plate][kept_columns].fillna(0)
    plate_metadata = plate_metadata.assign(
        groupid=plate_metadata.groupby(group_keys).ngroup())
    plate_metadata['label'] = plate_metadata.apply(
        lambda row: label(row.disease_condition), axis=1)
    return plate_metadata

# Metadata rows for the configured experiment and plate number.
well_metadata = get_well_metadata(metadata, experiment=EXPERIMENT, plate=PLATE_NUMBER)

## Site Level

In [8]:
# Attach the plate metadata to every site profile, keep one row per site,
# and index the result by site_id.
site_profile = well_metadata.merge(profile, on='site_id')
site_profile = site_profile.drop_duplicates('site_id').set_index('site_id')
site_profile.shape

(5356, 3920)

In [9]:
# Remove every column that contains at least one missing value.
print(f'Before drop na profile shape: {site_profile.shape}')
site_profile = site_profile.dropna(axis=1)
print(f'After drop na profile shape: {site_profile.shape}')

Before drop na profile shape: (5356, 3920)
After drop na profile shape: (5356, 3905)


In [10]:
def get_group_info(data):
    """Summarise how many groups exist for each group size.

    Args:
        data: DataFrame with 'label' and 'groupid' columns.

    Returns:
        DataFrame indexed by 'group_size' (number of non-null 'label' rows in
        a group) with a single 'count' column (number of groups of that size).
    """
    # Rows per group: after the count, the 'label' column holds each group's size.
    rows_per_group = data[['label', 'groupid']].groupby('groupid').count()
    # Group by that size and count how many group ids share it.
    groups_per_size = rows_per_group.reset_index().groupby('label').count()
    groups_per_size = groups_per_size.rename_axis('group_size')
    return groups_per_size.rename(columns={'groupid': 'count'})

# Show, for the site-level profile, how many treatment-condition groups exist at each group size.
print('Profile treatment-condition groups information:')
get_group_info(site_profile)

Profile treatment-condition groups information:


Unnamed: 0_level_0,count
group_size,Unnamed: 1_level_1
6,1
7,2
8,585
120,1
160,2
216,1


In [11]:
def split_features_and_labels(dataset, labels_view, features_dropout):
    """Split a profile table into a feature frame and a label frame.

    Args:
        dataset: DataFrame holding both feature and annotation columns.
        labels_view: column names to keep in the label frame; names absent
            from ``dataset`` are ignored by ``DataFrame.filter``.
        features_dropout: column names to remove to obtain the feature frame;
            every name must exist in ``dataset``.

    Returns:
        Tuple ``(features, labels)``; both keep the index of ``dataset``.
    """
    label_frame = dataset.filter(items=labels_view)
    feature_frame = dataset.drop(columns=features_dropout)
    return feature_frame, label_frame

# Site level: keep groupid/label as labels and drop all annotation columns from the features.
profile_site_features, profile_site_labels = split_features_and_labels(
    dataset=site_profile,
    labels_view=['groupid', 'label'],
    features_dropout=[
        'groupid', 'label', 'treatment', 'treatment_conc', 'disease_condition', 'well_id',
    ],
)

In [12]:
def normalize_treatment_condition(features, labels):
    """Normalize the feature rows using the negative-control rows.

    Rows are split by index according to ``labels.label``: those labelled
    'negctrl' and all the others. Both subsets are handed to
    ``utils.normalize(others, negctrl)``, whose result is returned as is.

    Args:
        features: DataFrame of feature values.
        labels: DataFrame with a 'label' column, indexed like ``features``.

    Returns:
        Whatever ``utils.normalize`` returns for the two subsets.
    """
    is_negctrl = labels.label == 'negctrl'
    negctrl_ids = labels[is_negctrl].index.tolist()
    other_ids = labels[~is_negctrl].index.tolist()

    negctrl_features = features.loc[features.index.isin(negctrl_ids)]
    other_features = features.loc[features.index.isin(other_ids)]
    return utils.normalize(other_features, negctrl_features)

# Normalize the site-level features against the negative-control sites.
normalized_profile_features = normalize_treatment_condition(
    features=profile_site_features,
    labels=profile_site_labels,
)

normalized_profile_features.shape

(5356, 3787)

In [13]:
# Remove any feature column that contains a missing value after normalization.
print(f'Before drop na profile shape: {normalized_profile_features.shape}')
normalized_profile_features = normalized_profile_features.dropna(axis=1)
print(f'After drop na profile shape: {normalized_profile_features.shape}')

Before drop na profile shape: (5356, 3787)
After drop na profile shape: (5356, 3787)


In [14]:
# Write the normalized site-level features to CSV (labels are not included here).
if SAVE_NORMALIZED_PROFILE:
    normalized_profile_features.to_csv(SITE_NORMALIZED_PROFILE)

## Well Level

### Create Well Profiles

In [8]:
def aggregate_site_to_well_level(dataframe):
    """Average site-level rows into one row per well.

    The well id of a row is its ``site_id`` with the last two characters
    removed. Rows that share a well id are averaged column by column.

    The input frame is not modified: the well id is written to a renamed
    copy, so the caller's ``site_id`` column keeps its original values and
    the function can safely be called again on the same frame.

    Args:
        dataframe: DataFrame with a 'site_id' column of sliceable values
            (e.g. strings) plus the columns to average.

    Returns:
        DataFrame indexed by 'well_id' holding the per-well mean of every
        remaining column.
    """
    # rename() returns a new frame, so the assignment below never touches `dataframe`.
    well_level = dataframe.rename(columns={'site_id': 'well_id'})
    well_level['well_id'] = well_level['well_id'].apply(lambda site_id: site_id[:-2])
    return well_level.groupby('well_id').mean()

In [9]:
# Average the site rows of each well, then attach the plate metadata,
# keep one row per well, and index the result by well_id.
aggregate_profile = aggregate_site_to_well_level(profile)

well_profile = well_metadata.merge(aggregate_profile, how='left', on='well_id')
well_profile = well_profile.drop_duplicates('well_id').set_index('well_id')
well_profile.shape

(1340, 3920)

### Damaged Wells

In [10]:
# A well counts as damaged when AreaShape_BoundingBoxArea is missing;
# record the ids of those wells, then keep only the wells that have a value.
damaged_well_profile = (
    well_profile.loc[well_profile.AreaShape_BoundingBoxArea.isna()].index.tolist()
)
well_profile = well_profile.loc[well_profile.AreaShape_BoundingBoxArea.notna()]
damaged_well_profile

[]

### Drop Null Features

In [11]:
# Remove every column that contains at least one missing value.
print(f'Before drop na profile shape: {well_profile.shape}')
well_profile = well_profile.dropna(axis=1)
print(f'After drop na profile shape: {well_profile.shape}')

Before drop na profile shape: (1340, 3920)
After drop na profile shape: (1340, 3909)


### Treatment Group Information 

In [12]:
# NOTE(review): this repeats the get_group_info definition from the Site Level
# section; consider keeping a single definition (e.g. in utils).
def get_group_info(data):
    """Summarise how many groups exist for each group size.

    Args:
        data: DataFrame with 'label' and 'groupid' columns.

    Returns:
        DataFrame indexed by 'group_size' (number of non-null 'label' rows in
        a group) with a single 'count' column (number of groups of that size).
    """
    # Rows per group: after the count, the 'label' column holds each group's size.
    rows_per_group = data[['label', 'groupid']].groupby('groupid').count()
    # Group by that size and count how many group ids share it.
    groups_per_size = rows_per_group.reset_index().groupby('label').count()
    groups_per_size = groups_per_size.rename_axis('group_size')
    return groups_per_size.rename(columns={'groupid': 'count'})

# Show, for the well-level profile, how many treatment-condition groups exist at each group size.
print('Profile treatment-condition groups information:')
get_group_info(well_profile)

Profile treatment-condition groups information:


Unnamed: 0_level_0,count
group_size,Unnamed: 1_level_1
2,588
30,1
40,2
54,1


### Split Features and Labels

In [13]:
# NOTE(review): this repeats the split_features_and_labels definition from the
# Site Level section; consider keeping a single definition (e.g. in utils).
def split_features_and_labels(dataset, labels_view, features_dropout):
    """Split a profile table into a feature frame and a label frame.

    Args:
        dataset: DataFrame holding both feature and annotation columns.
        labels_view: column names to keep in the label frame; names absent
            from ``dataset`` are ignored by ``DataFrame.filter``.
        features_dropout: column names to remove to obtain the feature frame;
            every name must exist in ``dataset``.

    Returns:
        Tuple ``(features, labels)``; both keep the index of ``dataset``.
    """
    label_frame = dataset.filter(items=labels_view)
    feature_frame = dataset.drop(columns=features_dropout)
    return feature_frame, label_frame

# Well level: keep groupid/label as labels and drop all annotation columns from the features.
profile_well_features, profile_well_labels = split_features_and_labels(
    dataset=well_profile,
    labels_view=['groupid', 'label'],
    features_dropout=[
        'groupid', 'label', 'treatment', 'treatment_conc', 'disease_condition', 'site_id',
    ],
)

### Normalize Profile
Normalization function:
$$ \hat{X} = \frac{X - \bar{C}_{-}}{MAD(C_{-})}$$

In [17]:
# NOTE(review): this repeats the normalize_treatment_condition definition from
# the Site Level section; consider keeping a single definition (e.g. in utils).
def normalize_treatment_condition(features, labels):
    """Normalize the feature rows using the negative-control rows.

    Rows are split by index according to ``labels.label``: those labelled
    'negctrl' and all the others. Both subsets are handed to
    ``utils.normalize(others, negctrl)``, whose result is returned as is.

    Args:
        features: DataFrame of feature values.
        labels: DataFrame with a 'label' column, indexed like ``features``.

    Returns:
        Whatever ``utils.normalize`` returns for the two subsets.
    """
    is_negctrl = labels.label == 'negctrl'
    negctrl_ids = labels[is_negctrl].index.tolist()
    other_ids = labels[~is_negctrl].index.tolist()

    negctrl_features = features.loc[features.index.isin(negctrl_ids)]
    other_features = features.loc[features.index.isin(other_ids)]
    return utils.normalize(other_features, negctrl_features)

# Normalize the well-level features against the negative-control wells.
normalized_profile_features = normalize_treatment_condition(
    features=profile_well_features,
    labels=profile_well_labels,
)

normalized_profile_features.shape

(1340, 3791)

### Save Normalized Feature

In [22]:
# Write the normalized well-level features, joined with their groupid/label columns, to CSV.
if SAVE_NORMALIZED_PROFILE:
    normalized_profile_features.join(profile_well_labels).to_csv(WELL_NORMALIZED_PROFILE)