In [1]:
from datetime import date
import pandas as pd
import os
import glob

# Load the FMP HepG2 dataset

This notebook loads the aggregated data of the HepG2 dataset from the FMP.

In [2]:
# Root folder of the Bioactives project; every path below is derived from it.
# NOTE: this is an absolute, machine-specific location - adjust it when the
# notebook is run on another machine.
parent_directory = "/home/schmiedc/FMP_Docs/Projects/Bioactives_data/"

# Sub-folders used by the notebook (each one keeps its trailing slash because
# file names are appended to them by plain string concatenation).
annot_path = f"{parent_directory}annotations/"                # plate annotation tables
save_path = f"{parent_directory}results/"                     # CSV files written by this notebook
data_path = f"{parent_directory}aggregated_data/FMP_HepG2/"   # aggregated profiles, one file per plate

In [3]:
### Load the latest aggregated profile file of every batch/plate combination
### and stack them into one table.
Data = []
plates = ["B1001","B1002","B1003","B1004","B1005","B1006","B1007"]
batches = ["R1", "R2", "R3", "R4"]
path = data_path

for batch in batches:
    for plate in plates:

        # Pattern: anything ending in a digit, "_", anything, then
        # "<batch>_<plate>_CP_Profiles_Aggregated.csv".
        # (the leading part is presumably a date stamp - TODO confirm)
        file_type = '/*[0-9]_*' + batch + "_" + plate + "_CP_Profiles_Aggregated.csv"

        files_temp = glob.glob(path + file_type)
        if not files_temp:
            # Without this guard max() below fails with the opaque
            # "max() arg is an empty sequence", which names neither the
            # batch nor the plate that is missing.
            raise FileNotFoundError(
                "No aggregated profile found for batch " + batch
                + ", plate " + plate + " (pattern: " + path + file_type + ")"
            )

        # Several files may match the same plate; keep the one with the most
        # recent ctime.
        # NOTE(review): on Unix, ctime is the time of the last metadata change,
        # not the creation time, so copying or chmod-ing a file can change
        # which one counts as "latest" - confirm this is the intended criterion.
        max_file = max(files_temp, key=os.path.getctime)

        ### load file
        Data_Temp = pd.read_csv(max_file)
        print(batch, plate, "has shape", Data_Temp.shape)
        ### put to list
        Data.append(Data_Temp)

### concat all batches together; ignore_index gives a fresh 0..n-1 row index
Data_aggregated = pd.concat(Data, ignore_index = True)

print("Aggregated Data has shape ", Data_aggregated.shape)

R1 B1001 has shape (382, 2981)
R1 B1002 has shape (381, 2981)
R1 B1003 has shape (381, 2981)
R1 B1004 has shape (382, 2981)
R1 B1005 has shape (380, 2981)
R1 B1006 has shape (384, 2981)
R1 B1007 has shape (382, 2981)
R2 B1001 has shape (382, 2981)
R2 B1002 has shape (382, 2981)
R2 B1003 has shape (381, 2981)
R2 B1004 has shape (383, 2981)
R2 B1005 has shape (380, 2981)
R2 B1006 has shape (384, 2981)
R2 B1007 has shape (382, 2981)
R3 B1001 has shape (383, 2981)
R3 B1002 has shape (384, 2981)
R3 B1003 has shape (382, 2981)
R3 B1004 has shape (381, 2981)
R3 B1005 has shape (382, 2981)
R3 B1006 has shape (383, 2981)
R3 B1007 has shape (382, 2981)
R4 B1001 has shape (383, 2981)
R4 B1002 has shape (383, 2981)
R4 B1003 has shape (381, 2981)
R4 B1004 has shape (382, 2981)
R4 B1005 has shape (379, 2981)
R4 B1006 has shape (384, 2981)
R4 B1007 has shape (382, 2981)
Aggregated Data has shape  (10697, 2981)


# Load plate annotations

This loads the plate annotations that associate each well with its EOS number.

In [4]:
### Find the annotation tables: any file name that ends in
### "<digit>_Annotation_Bioactives_HepG2.csv" inside annot_path.
file_type = '/*[0-9]_' + 'Annotation_Bioactives_HepG2.csv'
files = glob.glob(annot_path + file_type)
if not files:
    # Fail with a message that names the pattern instead of the opaque
    # "max() arg is an empty sequence" raised by max() on an empty list.
    raise FileNotFoundError("No annotation file found matching " + annot_path + file_type)

### gets latest file (the one with the most recent ctime)
max_file = max(files, key=os.path.getctime)

### load file and show its dimensions as the cell output
Annotation_Bioactives = pd.read_csv(max_file)
Annotation_Bioactives.shape

(10752, 6)

## Merge in EOS number

In [5]:
# Inner join on the well coordinates: only wells that are present in both the
# annotation table and the aggregated profiles end up in the result.
Data_Bioactives = Annotation_Bioactives.merge(
    Data_aggregated,
    how = "inner",
    on = ["Metadata_Batch", "Metadata_Plate", "Metadata_Well"],
)

print("Aggregated and annotated Data has shape ", Data_Bioactives.shape)

Aggregated and annotated Data has shape  (10697, 2984)


# Save raw data

In [6]:
# Output file is stamped with today's date: "<save_path><YYYY-MM-DD>_FMP_HepG2_raw.csv"
filename = f"{save_path}{date.today()}_FMP_HepG2_raw.csv"
# The row index carries no information, so it is not written.
Data_Bioactives.to_csv(filename, index = False)

## Save missing data

Missing data here means that no segmentation is available for a well. Either the images were missing (unlikely), or segmentation was not possible or failed. Segmentation can fail because the well was empty from the beginning or because all cells died after the compound was applied.

Here we use it as an indication of toxicity, although for missing wells the other explanations (no images, no cells seeded) cannot be entirely excluded.

In [7]:
# Left join with an indicator column, so that wells which exist only in the
# annotation table (i.e. have no aggregated profile) can be identified.
Data_Bioactives_missing = pd.merge(
    Annotation_Bioactives, 
    Data_aggregated, 
    on = ["Metadata_Batch", "Metadata_Plate", "Metadata_Well"], 
    how = "left",
    indicator = True)

# Keep the annotated wells without a profile and, for those, only the
# annotation columns plus "_merge". The feature columns are all NaN for these
# rows and are left out. Selecting the columns by name (instead of
# dropna(axis='columns')) avoids silently discarding an annotation column
# just because one of the missing wells has an empty entry in it.
missing_columns = list(Annotation_Bioactives.columns) + ["_merge"]
Data_Bioactives_missing = Data_Bioactives_missing.loc[
    Data_Bioactives_missing["_merge"] == 'left_only', missing_columns]

In [8]:
# Write the wells without a profile to a date-stamped CSV next to the raw data.
filename_missing = f"{save_path}{date.today()}_FMP_HepG2_raw_missing_wells.csv"
Data_Bioactives_missing.to_csv(filename_missing, index = False)

In [9]:
# One row per annotated well that has no aggregated profile.
n_missing_wells = len(Data_Bioactives_missing)
print("There are", n_missing_wells, "missing wells.")

There are 55 missing wells.


In [10]:
# Number of missing wells per compound (EOS number).
Data_Bioactives_missing_counts = Data_Bioactives_missing['Metadata_EOS'].value_counts()

# Compounds that occur exactly 4 times among the missing wells.
# NOTE(review): reading this as "all replicates missing" assumes every compound
# occupies exactly one well in each of the 4 replicates - confirm.
n_fully_missing = int((Data_Bioactives_missing_counts == 4).sum())
print("There are", n_fully_missing, "compounds missing all 4 replicates.")

There are 5 compounds missing all 4 replicates.
