# CyTOF data processing

## Variable definitions

In [None]:
# Folder holding the data files to analyse
folder_path = "./test_cytof_data"

# Metadata columns, split between the ones to handle as strings and the others
metadata_string_columns = []
metadata_other_columns = ["metada"]
metadata_columns = [*metadata_string_columns, *metadata_other_columns]

# Additional columns to leave out of the processing
excluded_columns = ["exclu"]

# All columns that do not hold data (excluded columns first, then metadata),
# gathered once for easier later use
non_data_columns = [*excluded_columns, *metadata_columns]

['exclu', 'metada']

## Import common packages

In [None]:
import os
import pandas as pd
import numpy as np

# Raise the number of columns and rows pandas shows when displaying a DataFrame
pd.options.display.max_columns = 60
pd.options.display.max_rows = 1000

## Concatenate all files

In [None]:
# Per-file DataFrames, concatenated once after the loop: calling `pd.concat`
# inside the loop copies every previously loaded row again at each iteration
loaded_files = []

# Loop over all files in folder (sorted, because `os.listdir` returns the
# entries in arbitrary order and the row order would change between runs)
for filename in sorted(os.listdir(folder_path)):
    # Only consider files with '.txt' extension
    if not filename.endswith('.txt'):
        continue

    # Build the full path to file
    file_path = os.path.join(folder_path, filename)
    # Load the tab-separated file
    events = pd.read_csv(file_path, delimiter='\t')

    # Retrieve metadata from the filename (ex: WGANormalised_Pro_PDO21_01.fcs_file_internal...)
    # First split: ['WGANormalised_Pro_PDO21_01', '_file_internal']
    # Second split over first element: ['WGANormalised', 'Pro', 'PDO21', '01']
    metadata_from_filename = filename.split('.fcs')[0].split('_')
    # Store the condition name in the dataframe: second-to-last element
    events['Condition'] = metadata_from_filename[-2]
    # Store the replicate in the dataframe: last element
    events['Replicate'] = metadata_from_filename[-1]

    loaded_files.append(events)

# `pd.concat` raises on an empty list, so fall back to an empty DataFrame
# when the folder holds no '.txt' file
if loaded_files:
    all_events = pd.concat(loaded_files, ignore_index=True)
else:
    all_events = pd.DataFrame()

# Print all events
all_events

## EMD Generation

### Prepare the data

In [None]:
# Keep only the columns to normalise/transform by removing the metadata and
# excluded columns.
# NOTE(review): any non-numerical column left here would presumably break the
# later arcsinh step -- confirm `non_data_columns` lists all of them.
data = all_events.drop(columns=non_data_columns)
data

In [None]:
# Extract the metadata columns from the loaded events
metadata = all_events.filter(metadata_columns)

# Make sure the columns listed in `metadata_string_columns` hold strings
# (numerical values run into errors later on).
# `astype(str)` converts every value; `DataFrame.apply(str)` would instead call
# `str` on each whole column and return its printed representation.
# The guard skips the assignment when no string column is configured.
if metadata_string_columns:
    metadata[metadata_string_columns] = metadata[metadata_string_columns].astype(str)

metadata

### Select a subset of data (optional)

In [None]:
#Batches:
#Batch 1 = PDO27wt/ko exp B BM/MOPC21/B7C18
#Batch 2 = PDO27 ABCEDF7 Tr
#Batch 3 = PDO27 ABCDEF7 NT
#Batch 4 = PDO21/23/216 ABE7 Tr
#Batch 5 = PDO21/23/216 ABE7 NT 
#Batch 6 = PDO5/11 ABE7 Tr/NT
#Batch 7 = PDO75/99 ABE7 Tr/NT
#Batch 8 = PDO109/141 ABE7 Tr/NT
#Batch 9 = NT/eGFP/eGFP-stIL15 ABE7

#### Configuration

In [None]:
# To enable this process, set this variable to True, False otherwise
should_select_a_subset = False

# Define here the filter to apply
subset_condition = \
    metadata['Patient'].isin(['X','5','11','21','23','27','75','99','109','141','216']) & \
    metadata['gd_donor'].isin(['A','B','E','7']) & \
    metadata['Transduction'].isin(['eGFP-stIL15']) & \
    metadata['Treatment'].isin(['BM','B7C18']) & \
    metadata['Batch'].isin(['Batch2','Batch4','Batch6','Batch7','Batch8'])

#### Select the data

In [None]:
# Restrict the data to the rows matching the configured filter
# (e.g. eGFP-stIL15 / ABE7 / wt PDO / BM / B7C18)
if should_select_a_subset:
    data = data.loc[subset_condition]

# Displayed from the top level of the cell: an expression nested inside the
# `if` body is not echoed by the notebook
data

#### Select the metadata

In [None]:
# Restrict the metadata to the rows matching the configured filter, so that it
# stays aligned with the selected data
if should_select_a_subset:
    metadata = metadata.loc[subset_condition]

# Displayed from the top level of the cell: an expression nested inside the
# `if` body is not echoed by the notebook
metadata

### Arcsinh transformation

#### Configuration

In [None]:
# Cofactor the raw values are divided by before the arcsinh transformation
arcsinh_cofactor = 5

#### Data processing

In [None]:
# Divide all raw values by the cofactor, then apply the arcsinh transformation
data = np.arcsinh(data.div(arcsinh_cofactor))
data

### Batch effect correction

In [None]:
import scprep

# Data centering by batch to correct any CyTOF batch effect.
# Skipped when 'Batch' is not one of the metadata columns.
if 'Batch' in metadata.columns:
    data = scprep.normalize.batch_mean_center(data, sample_idx=metadata['Batch'])

# Displayed from the top level of the cell: an expression nested inside the
# `if` body is not echoed by the notebook
data

### Re-assemble processed data with metadata

#### Concatenate data with metadata

In [None]:
# Put the processed (arcsinh-transformed, mean-centered) data back side by
# side with its metadata
processed_data = pd.concat(objs=[data, metadata], axis='columns')
processed_data

#### Re-index the Dataframe

In [None]:
# Replace the index with consecutive integers starting at 0
row_count = len(processed_data.index)
processed_data.index = np.arange(row_count)

#### Ensure the metadata columns are strings

In [None]:

# Convert the columns listed in `metadata_string_columns` to strings.
# `astype(str)` converts every value; `DataFrame.apply(str)` would instead call
# `str` on each whole column and return its printed representation.
# The guard skips the assignment when no string column is configured.
if metadata_string_columns:
    processed_data[metadata_string_columns] = processed_data[metadata_string_columns].astype(str)

### Store the `Condition` information

#### Configuration

In [None]:
# Columns whose values are joined with '_' to build the `Condition` label of each event
condition_colmns = ['Patient', 'Culture', 'gd_donor', 'Transduction', 'Treatment', 'Batch', 'Date', 'Replicate']

#### Generate the `Condition` column

In [None]:
# Create a condition column for every cell in the experiment: the values of
# the configured columns, converted to strings and joined with '_'
processed_data['Condition'] = processed_data[condition_colmns].astype(str).agg('_'.join, axis=1)

# Add `Condition` to the list of metadata columns.
# The name is appended as a single element: `+=` with a string would extend the
# list with each character of the string. The membership check avoids
# duplicates when the cell is run more than once.
if 'Condition' not in metadata_columns:
    metadata_columns.append('Condition')

processed_data

### Store the `Control` information

#### Configuration

In [None]:
# Columns whose values are joined with '_' (after the "X_gd_" prefix) to build
# the `Control` label of each event: all gd monoculture controls including
# their transduction, treatment and batch.
control_columns = ['gd_donor', 'Transduction', 'Treatment', 'Batch', 'Date']

#### Generate the `Control` column

In [None]:
# Define control for pairwise EMD: the "X_gd_" prefix followed by the values
# of the configured columns, converted to strings and joined with '_'
processed_data['Control'] = "X_gd_" + processed_data[control_columns].astype(str).agg('_'.join, axis=1)

# Add `Control` to the list of metadata columns.
# The name is appended as a single element: `+=` with a string would extend the
# list with each character of the string. The membership check avoids
# duplicates when the cell is run more than once.
if 'Control' not in metadata_columns:
    metadata_columns.append('Control')

processed_data

### Initialise EMD dataframe

#### Compute the markers list

In [None]:
# Markers are the columns of the DataFrame that are not listed in `metadata_columns`
is_marker_column = ~processed_data.columns.isin(metadata_columns)
markers_list = processed_data.columns[is_marker_column].tolist()
markers_list

#### Compute the conditions list

In [None]:
# Get the unique conditions, in order of first appearance.
# `Series.unique()` is called directly because passing a plain list to
# `pd.unique` is deprecated.
conditions_list = processed_data['Condition'].unique()

#### Compute the controls list (unused)

In [None]:
# Get the unique controls, in order of first appearance.
# `Series.unique()` is called directly because passing a plain list to
# `pd.unique` is deprecated.
controls_list = processed_data['Control'].unique()

#### Create the DataFrame that will receive the EMD values

In [None]:
# Result table with one row per condition and one column per marker,
# filled with NaN until the EMD values are computed
emd_dataframe = pd.DataFrame(np.nan, index=conditions_list, columns=markers_list)


### Calculate EMD scores

In [None]:
# Loop over all the conditions
for condition in conditions_list:

    # Dataframe of all events for the current condition
    condition_events = processed_data.loc[processed_data["Condition"] == condition]

    # The control name is read from the first event of the condition
    control_name = condition_events['Control'].iloc[0]
    print(control_name)

    # Dataframe of all events from the control that will be compared with the events of the current condition:
    # every event whose condition name starts with the control name.
    # NOTE(review): prefix matching also selects any condition whose name merely
    # begins with the same characters -- confirm no control name is a prefix of
    # an unrelated condition name.
    control_df = processed_data.loc[processed_data["Condition"].str.startswith(control_name)]

    # Fail early with an explicit message rather than deep inside the distance
    # computation when no event matches the control
    if control_df.empty:
        raise ValueError(
            f"No control events found for condition {condition!r}: "
            f"no condition name starts with {control_name!r}"
        )

    # Loop over all the markers
    for marker in markers_list:
        condition_values = condition_events[marker]
        control_values = control_df[marker]

        # Check the sign by using the `median` values
        sign = np.sign(condition_values.median() - control_values.median())

        # In case the median values are equal, use the `mean` instead
        if sign == 0:
            sign = np.sign(condition_values.mean() - control_values.mean())

        # Compute the (unsigned) EMD score between the two distributions
        emd = scprep.stats.EMD(condition_values, control_values)

        # Store the signed EMD in the result Dataframe for the given (condition, marker) pair
        emd_dataframe.loc[condition, marker] = sign * emd

# Ensure that all (condition, marker) pairs have been properly computed
assert not emd_dataframe.isna().values.any()

In [None]:
# Display the signed EMD scores stored for each (condition, marker) pair
emd_dataframe