# CyTOF data processing

## Variables definition

In [None]:
# Folder holding the data files to analyse
folder_path = "./test_cytof_data"

# Metadata columns: the ones converted to strings in the metadata cell,
# followed by all the other ones
metadata_string_columns = []
metadata_other_columns = ["metada"]
metadata_columns = [*metadata_string_columns, *metadata_other_columns]

# Further columns to leave out of processing
excluded_columns = ["exclu"]

# Every column dropped from the data table, gathered once for the later cells
non_data_columns = [*excluded_columns, *metadata_columns]

['exclu', 'metada']

## Import common packages

In [None]:
import os

import pandas as pd
import scprep

# Raise the pandas display limits to 60 columns and 1000 rows
pd.options.display.max_columns = 60
pd.options.display.max_rows = 1000

## Concatenate all files

In [None]:
# Read every tab-separated '.txt' file of the folder, then build the table of
# all events with a single concatenation (concatenating inside the loop would
# copy the accumulated table again for every file).
event_tables = []

# os.listdir returns names in arbitrary order; sorting keeps the row order
# reproducible from one run to the next.
for filename in sorted(os.listdir(folder_path)):
    # Only consider files with '.txt' extension
    if filename.endswith('.txt'):
        # Build the full path to file and load it
        file_path = os.path.join(folder_path, filename)
        event_tables.append(pd.read_csv(file_path, delimiter='\t'))

# pd.concat raises on an empty list, so fall back to an empty table when the
# folder holds no '.txt' file
if event_tables:
    all_events = pd.concat(event_tables, ignore_index=True)
else:
    all_events = pd.DataFrame()

# Print all events
all_events

## EMD Generation

### Prepare the data

In [None]:
# Keep only the measurement columns: the metadata and excluded columns are
# removed so that normalisation/transformation works on numerical data only
data = all_events.drop(columns=non_data_columns)
data

In [None]:
# Extract the metadata columns and make sure the ones listed in
# metadata_string_columns hold strings (numerical values there run into errors
# later on).
metadata = all_events.filter(metadata_columns)
if metadata_string_columns:
    # astype(str) converts value by value; DataFrame.apply(str) would instead
    # call str() on each whole column.
    metadata[metadata_string_columns] = metadata[metadata_string_columns].astype(str)
metadata

### Select a subset of data (optional)

In [None]:
#Batches:
#Batch 1 = PDO27wt/ko exp B BM/MOPC21/B7C18
#Batch 2 = PDO27 ABCEDF7 Tr
#Batch 3 = PDO27 ABCDEF7 NT
#Batch 4 = PDO21/23/216 ABE7 Tr
#Batch 5 = PDO21/23/216 ABE7 NT 
#Batch 6 = PDO5/11 ABE7 Tr/NT
#Batch 7 = PDO75/99 ABE7 Tr/NT
#Batch 8 = PDO109/141 ABE7 Tr/NT
#Batch 9 = NT/eGFP/eGFP-stIL15 ABE7

#### Configuration

In [None]:
# To enable this process, set this variable to True, False otherwise
should_select_a_subset = False

# Define here the filter to apply: for each metadata column, the values to keep.
# A row is selected only when it matches on every listed column.
subset_filters = {
    'Patient': ['X', '5', '11', '21', '23', '27', '75', '99', '109', '141', '216'],
    'gd_donor': ['A', 'B', 'E', '7'],
    'Transduction': ['eGFP-stIL15'],
    'Treatment': ['BM', 'B7C18'],
    'Batch': ['Batch2', 'Batch4', 'Batch6', 'Batch7', 'Batch8'],
}

# The boolean row mask is only built when the selection is enabled, so a
# dataset without these metadata columns no longer fails on this cell while the
# step is switched off.
if should_select_a_subset:
    subset_condition = (
        metadata[list(subset_filters)].isin(subset_filters).all(axis=1)
    )
else:
    subset_condition = None

#### Select the data

In [None]:
# Keep only the data rows selected by subset_condition when the step is enabled
if should_select_a_subset:
    #Select eGFP-stIL15 / ABE7 / wt PDO / BM / B7C18 (I was just selecting the data I wanted to use)
    data = data.loc[subset_condition]

# Displayed at top level: the notebook does not show an expression nested in the `if`
data

#### Select the metadata

In [None]:
# Keep only the metadata rows selected by subset_condition when the step is enabled
if should_select_a_subset:
    #selecting the corresponding metadata
    metadata = metadata.loc[subset_condition]

# Displayed at top level: the notebook does not show an expression nested in the `if`
metadata

### Arcsinh transformation

#### Import package and configuration

In [None]:
import numpy as np

# Cofactor: the data is divided by this value before arcsinh is applied
# (see the "Data processing" cell below)
arcsinh_cofactor = 5

#### Data processing

In [None]:
# Scale all raw data down by the cofactor, then apply the arcsinh transformation
data = data.div(arcsinh_cofactor).pipe(np.arcsinh)
data

# HERE

In [None]:
#data centering by batch to correct any cytof batch effect
# `data` holds the arcsinh-transformed values and `metadata` the matching
# metadata rows (the names `as_data` / `subset_metadata` are not defined in
# this notebook).
as_data_centered = scprep.normalize.batch_mean_center(data, sample_idx=metadata['Batch'])
as_data_centered

In [None]:
#combine arcsinh-transformed and mean-centered data with metadata again
# `metadata` is the metadata table built earlier in this notebook
# (`subset_metadata` is not defined here); axis=1 puts the two tables side by side.
data_as_meta = pd.concat([as_data_centered, metadata], axis=1)
data_as_meta

In [None]:
# Renumber the rows from 0 to n-1
data_as_meta.index = np.arange(len(data_as_meta))

# Turn the values of these metadata columns into strings, one column at a time
for column_name in (
    'Date',
    'Patient',
    'Culture',
    'gd_donor',
    'Transduction',
    'Treatment',
    'Replicate',
    'Time',
    'Batch',
):
    data_as_meta[column_name] = data_as_meta[column_name].apply(str)

In [None]:
# full_data is another name for the same DataFrame object as data_as_meta (no
# copy is made), so columns added to full_data later also appear on data_as_meta
full_data = data_as_meta

In [None]:
# Create a condition label for every cell in the experiment, the per-condition
# means, and the list of all conditions.
condition_columns = [
    'Patient',
    'Culture',
    'gd_donor',
    'Transduction',
    'Treatment',
    'Batch',
    'Date',
    'Replicate',
]
# Join the columns with '_' in the order listed above
condition_labels = full_data[condition_columns[0]]
for column_name in condition_columns[1:]:
    condition_labels = condition_labels + '_' + full_data[column_name]
full_data['Condition'] = condition_labels

# numeric_only=True averages the numeric columns only; without it, pandas 2.0
# and later raise on the string metadata columns.
grouped = full_data.groupby("Condition", as_index=False).mean(numeric_only=True)

full_data.index = np.arange(full_data.shape[0])
# Series.unique() lists the labels in order of first appearance; passing a
# plain list to pd.unique is deprecated.
condition_list = full_data['Condition'].unique()
full_data

In [None]:
# Define the control used for pairwise EMD. The controls are all gd monoculture
# controls, identified by gd donor, transduction, treatment, batch and date.
control_labels = "X_gd"
for column_name in ("gd_donor", "Transduction", "Treatment", "Batch", "Date"):
    control_labels = control_labels + '_' + full_data[column_name]
full_data["Control"] = control_labels


In [None]:
# Display the table, which now includes the 'Condition' and 'Control' columns
full_data

In [None]:
# Copy of full_data without the metadata and label columns; the remaining
# columns are the ones used as markers afterwards
label_columns = [
    'Date',
    'Patient',
    'Culture',
    'gd_donor',
    'Transduction',
    'Treatment',
    'Replicate',
    'Time',
    'Batch',
    'Cell_type',
    'Control',
    'Condition',
]
no_metadata = full_data.drop(columns=label_columns)
no_metadata

In [None]:
# Names of the remaining columns, i.e. the markers to score
marker_list = no_metadata.columns.tolist()

In [None]:
# Display the unique condition labels
condition_list

In [None]:
# NaN-filled table, one row per condition and one column per marker, to be
# populated with the EMD values calculated afterwards
coculture_emds = pd.DataFrame(np.nan, index=condition_list, columns=marker_list)

In [None]:
# Unique control labels, in order of first appearance. Series.unique() is used
# because passing a plain list to pd.unique is deprecated.
control_list = full_data['Control'].unique()
#control_list

In [None]:
 #Calculates EMD scores. 'each_line' is a dataframe of all cells from one condition in the list. 'control_df' is a dataframe of all cells from the control that will be compared with 'each_line'. 
 
 for condition in condition_list:
    each_line = full_data.loc[(full_data["Condition"] == condition)]
    control_line = each_line['Control']
    print(control_line.values[0])
    control_df = full_data.loc[full_data["Condition"].str.startswith(control_line.values[0])]
#     print(condition,control_df["Control"].values[0])
    for marker in marker_list:
                    sign = np.sign(each_line[marker].median() - control_df[marker].median())
                    if sign == 0:
                        sign = np.sign(each_line[marker].mean() - control_df[marker].mean())
                    signed_emd = sign*scprep.stats.EMD(
                        each_line[marker], control_df[marker]
            )
                    coculture_emds.loc[condition, marker] = signed_emd

assert not coculture_emds.isna().values.any()

In [None]:
# Display the table of signed EMD scores (rows: conditions, columns: markers)
coculture_emds