# CyTOF data processing

## Requirements

For compatibility, the `numpy` version needs to be pinned:
```bash
pip3 install --upgrade pandas numpy==1.26.4 scprep
```

## Variables definition

In [7]:
# Folder holding the data files to analyse
folder_path = "./test_cytof_data"

# Columns that describe an event rather than measure it
metadata_columns = ['Cell_Index', 'Condition', 'Control', 'Replicate']

# Any further columns that must be left out of the processing
excluded_columns = []

# Every non-measurement column, gathered once in a new list for easier later use
non_data_columns = [*excluded_columns, *metadata_columns]

## Import common packages

In [8]:
import os
import numpy as np
import pandas as pd

# Raise the pandas display limits so that wide and long DataFrames are shown
for option_name, option_value in (
    ('display.max_columns', 60),
    ('display.max_rows', 1000),
):
    pd.set_option(option_name, option_value)

## Concatenate all files

### Configuration

In [9]:
# When True, the `Condition`, `Control` and `Replicate` columns of each loaded
# file are filled in from parts of its file name
get_condition_replicate_from_filename = True

### Concatenate

In [10]:
# Collect one DataFrame per file and concatenate them once at the end:
# growing `all_events` with pd.concat inside the loop copies every row that
# was already loaded again on each iteration.
events_per_file = []

# os.listdir returns entries in arbitrary order, so sort them to obtain the
# same row order on every run
for filename in sorted(os.listdir(folder_path)):
    # Only consider files with '.txt' extension
    if not filename.endswith('.txt'):
        continue

    # Build the full path to file
    file_path = os.path.join(folder_path, filename)
    # Load the tab-separated file
    events = pd.read_csv(file_path, delimiter='\t')

    if get_condition_replicate_from_filename:
        # Retrieve metadata from the filename (ex: WGANormalised_Pro_PDO21 + CAFs_01.fcs_file_internal...)
        # Keep what precedes '.fcs': 'WGANormalised_Pro_PDO21 + CAFs_01'
        # Then split it over '_': ['WGANormalised', 'Pro', 'PDO21 + CAFs', '01']
        metadata_from_filename = filename.split('.fcs')[0].split('_')

        # At least two parts are needed: the condition and the replicate
        if len(metadata_from_filename) < 2:
            raise ValueError(
                f"Cannot extract the condition and replicate from file name '{filename}'"
            )

        # Store the condition name in the dataframe: second-to-last element
        condition_name = metadata_from_filename[-2]
        events['Condition'] = condition_name
        # Store the control name in the dataframe: the condition name split over '+',
        # first part kept and stripped to remove whitespace from both sides
        events['Control'] = condition_name.split('+')[0].strip()
        # Store the replicate in the dataframe: last element
        events['Replicate'] = metadata_from_filename[-1]

    # Keep the file data to be concatenated with the others
    events_per_file.append(events)

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# the folder holds no '.txt' file
if events_per_file:
    all_events = pd.concat(events_per_file, ignore_index=True)
else:
    all_events = pd.DataFrame()

# Print all events
all_events

Unnamed: 0,Cell_Index,89Y_pHH3_S28_v7,96Ru_96Ru,98Ru_98Ru,99Ru_99Ru,100Ru_100Ru,101Ru_101Ru,102Ru_102Ru,104Ru_104Ru,111Cd_Vimentin RV202 (v67,112Cd_FAP (1) v2,113In_CD326 (EpCAM) (hu) (v6),114Cd_CK18 (v6),115In_Pan-CK_v9,116Cd_GFP_v4,127I_IdU,142Nd_cCaspase 3_D175_v6,143Nd_RRM2,144Nd_SOX2 v2,145Nd_pNDRG1 T346 v4,146Nd_L1CAM,147Sm_OPTN,148Nd_CDK1 (1),149Sm_p4E-BP1_T37,150Nd_pRB_S807_S811_v10,151Eu_sqstm1,153Eu_ANXA1,155Gd_pAKT [S473] v12,156Gd_pNF-kB p65 v8,157Gd_MOPC21,158Gd_pP38 MAPK v7,160Gd_KI67(3),161Dy_pLATS1,163Dy_H3K9Me3,164Dy_TOP2A (3),165Ho_AlexaFluor488,167Er_TROP 2(1),168Er_pSMAD2,169Tm_EphB2,170Er_CHGA v3,171Yb_CD55 v4,172Yb_BIRC3,173Yb_pS6,174Yb_cPARP [D214] (2) (v6),176Yb_CyclinB1 (2) (v7),191Ir_DNA 1,193Ir_DNA 2,209Bi_Me2HH3[K4],Condition,Control,Replicate
0,1,43.585557,1213.959853,3558.386958,0.000000,1458.946308,421.249051,707.013792,478.102051,112.645929,603.746433,50897.801994,1594.755375,53993.672474,10.087679,28051.093144,1690.470631,394.401844,1997.347396,31736.874224,365.299426,630.263993,6095.601789,3.853920e+04,2672.376730,46715.681153,372.530023,1727.848742,4939.136139,251.864645,15661.175919,979.785837,13075.133833,1090.477880,2.310142e+03,0.004876,390.414788,724.547763,11229.491117,438.423305,838.639540,102.624017,2751.651199,1059.449506,2758.233772,1.783434e+05,3.267123e+05,1.320918e+04,PDO21 + hIL8 12.5ng,PDO21,02
1,2,0.000000,0.000000,0.000000,0.000000,0.000000,1106.766267,483.146464,0.000000,0.000000,6212.441179,112524.540240,3376.703678,63105.115731,2026.345475,78076.598058,5378.106523,3483.206300,14774.307667,16404.612777,0.000000,6325.650928,15999.116990,9.359513e+04,19611.374816,114839.243925,7337.893752,3590.558857,51435.443529,1505.413488,38390.718929,5494.854223,51952.686435,6593.492500,9.707502e+03,0.000554,3019.134512,583.799483,57342.426824,0.000000,9371.121594,253.987820,12415.285065,1217.250647,3465.030644,1.138082e+06,2.033536e+06,1.273348e+05,PDO21 + hIL8 12.5ng,PDO21,02
2,3,0.000000,0.000000,0.000000,1731.737952,3995.250069,562.508254,0.000000,181.267014,0.000000,211.807458,86705.838045,2220.989030,39521.113254,449.903746,5294.075421,7258.527181,2360.692782,2230.335842,4746.109479,3430.414044,2534.428668,23916.033206,5.296711e+04,4278.013643,25919.317740,218.391683,1234.880145,13760.759509,1343.636866,15913.337755,2911.830422,26219.449966,6317.144171,3.358493e+05,0.001748,328.106387,39.487448,22343.036966,1658.007231,2055.223981,0.000000,2376.693641,3997.301349,1056.888078,4.014445e+05,7.436298e+05,5.654798e+04,PDO21 + hIL8 12.5ng,PDO21,02
3,4,0.000000,2672.189928,0.000000,94.561986,0.000000,0.000000,2129.264680,0.000000,0.000000,0.000000,49121.290729,443.273420,66948.519779,434.657481,0.000000,0.000000,5211.869347,993.812809,1650.902655,0.000000,3786.186858,8147.540472,6.239263e+04,17666.475488,57566.085523,1134.561648,5410.785732,14088.210476,602.732347,155766.258197,3576.587010,23858.022486,3329.592530,1.212069e+04,0.000901,4550.623410,8163.213997,1948.253067,0.000000,3575.725906,0.000000,1061.280373,0.000000,3629.147982,7.199613e+05,1.275512e+06,3.018831e+05,PDO21 + hIL8 12.5ng,PDO21,02
4,5,0.000000,8911.913747,0.000000,22469.635814,1624.563602,7755.338861,0.000000,8886.237503,0.000000,0.000000,790518.367838,26739.996798,392637.619681,0.000000,170897.126636,7479.105856,25645.619233,52787.568714,53050.173847,23978.255141,5048.944674,113671.838662,1.089417e+06,217428.443030,501488.031274,0.000000,6905.471330,137955.596855,0.000000,269985.782167,129138.747335,536115.645320,58814.594023,7.214307e+06,0.000078,10430.904898,58470.340866,44148.812264,6660.390475,5337.305140,0.000000,36145.172415,40954.813217,65518.278247,7.275846e+06,1.301274e+07,1.797610e+06,PDO21 + hIL8 12.5ng,PDO21,02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331313,3477,362.390948,992.681035,2068.121114,260.349544,316.508499,0.000000,298.312521,0.000000,112.635680,392.738686,50437.660151,4469.028381,92466.053505,117.430747,6044.766148,1035.111273,972.697009,993.158569,1250.776576,943.535093,2870.155451,2049.797486,1.320265e+04,2251.043351,38214.616564,371.768906,937.367092,6934.300288,888.597777,17701.803073,847.845783,15227.675305,3686.257977,6.900475e+02,0.004716,838.881231,938.125302,6682.272450,1052.873953,804.977090,489.496254,4155.716466,1741.168901,5001.590085,1.799795e+05,3.174157e+05,1.902812e+04,PDO21 + hWnt3a 50ng,PDO21,01
331314,3478,372.663608,724.611709,0.000000,0.000000,35.160427,0.000000,829.251021,608.686856,0.000000,0.000000,88656.799304,0.000000,13176.904401,1042.452642,577.555315,1010.994956,853.626242,658.551980,4850.466761,662.469186,0.000000,18824.572962,5.653982e+04,5864.233023,21422.392967,531.357432,2174.363275,6732.846478,879.626762,33613.033575,3089.377985,7933.601307,2556.600206,2.081537e+05,0.004529,597.363041,348.786817,9553.137868,479.746291,2543.308891,360.295789,4413.866547,925.472703,1162.140184,3.706437e+05,6.465320e+05,1.202575e+04,PDO21 + hWnt3a 50ng,PDO21,01
331315,3479,12.442780,0.000000,0.000000,0.000000,77.353153,133.785501,181.986353,254.284758,19.052205,14.854769,3424.966362,85.885916,1172.260957,245.247535,0.000000,138.028785,27.924660,300.020467,648.970828,0.000000,191.587576,231.235996,7.506321e+02,476.929853,2679.400691,124.067507,209.814890,952.209344,173.123581,1926.201105,183.520701,585.860364,271.621016,1.082461e+03,0.025516,118.110472,179.070468,702.866085,818.625625,79.789012,61.177457,159.688245,73.685651,63.775938,2.602718e+04,4.795381e+04,1.629098e+03,PDO21 + hWnt3a 50ng,PDO21,01
331316,3480,59.816495,559.819517,899.679879,448.216709,0.000000,2819.140971,2075.140984,1882.903506,4726.652404,2184.649715,11797.323451,893.115372,111251.297405,1149.331822,7077.478208,33031.760020,0.000000,3585.608417,189.923909,506.207177,630.546302,0.000000,3.442723e+02,1805.169366,29501.166536,673.086272,1867.260163,5871.036522,2330.769157,4417.229336,705.743482,0.000000,648.056076,3.099320e+03,0.001744,575.907007,995.043718,2106.405927,0.000000,0.000000,203.907664,456.791537,4348.993633,2457.059358,7.204364e+05,1.381995e+06,7.844661e+02,PDO21 + hWnt3a 50ng,PDO21,01


## EMD Generation

### Prepare the data

In [11]:
# Keep only the numerical measurements for normalisation/transformation:
# the metadata (and excluded) columns are removed
data = all_events.drop(columns=non_data_columns)
data

Unnamed: 0,89Y_pHH3_S28_v7,96Ru_96Ru,98Ru_98Ru,99Ru_99Ru,100Ru_100Ru,101Ru_101Ru,102Ru_102Ru,104Ru_104Ru,111Cd_Vimentin RV202 (v67,112Cd_FAP (1) v2,113In_CD326 (EpCAM) (hu) (v6),114Cd_CK18 (v6),115In_Pan-CK_v9,116Cd_GFP_v4,127I_IdU,142Nd_cCaspase 3_D175_v6,143Nd_RRM2,144Nd_SOX2 v2,145Nd_pNDRG1 T346 v4,146Nd_L1CAM,147Sm_OPTN,148Nd_CDK1 (1),149Sm_p4E-BP1_T37,150Nd_pRB_S807_S811_v10,151Eu_sqstm1,153Eu_ANXA1,155Gd_pAKT [S473] v12,156Gd_pNF-kB p65 v8,157Gd_MOPC21,158Gd_pP38 MAPK v7,160Gd_KI67(3),161Dy_pLATS1,163Dy_H3K9Me3,164Dy_TOP2A (3),165Ho_AlexaFluor488,167Er_TROP 2(1),168Er_pSMAD2,169Tm_EphB2,170Er_CHGA v3,171Yb_CD55 v4,172Yb_BIRC3,173Yb_pS6,174Yb_cPARP [D214] (2) (v6),176Yb_CyclinB1 (2) (v7),191Ir_DNA 1,193Ir_DNA 2,209Bi_Me2HH3[K4]
0,43.585557,1213.959853,3558.386958,0.000000,1458.946308,421.249051,707.013792,478.102051,112.645929,603.746433,50897.801994,1594.755375,53993.672474,10.087679,28051.093144,1690.470631,394.401844,1997.347396,31736.874224,365.299426,630.263993,6095.601789,3.853920e+04,2672.376730,46715.681153,372.530023,1727.848742,4939.136139,251.864645,15661.175919,979.785837,13075.133833,1090.477880,2.310142e+03,0.004876,390.414788,724.547763,11229.491117,438.423305,838.639540,102.624017,2751.651199,1059.449506,2758.233772,1.783434e+05,3.267123e+05,1.320918e+04
1,0.000000,0.000000,0.000000,0.000000,0.000000,1106.766267,483.146464,0.000000,0.000000,6212.441179,112524.540240,3376.703678,63105.115731,2026.345475,78076.598058,5378.106523,3483.206300,14774.307667,16404.612777,0.000000,6325.650928,15999.116990,9.359513e+04,19611.374816,114839.243925,7337.893752,3590.558857,51435.443529,1505.413488,38390.718929,5494.854223,51952.686435,6593.492500,9.707502e+03,0.000554,3019.134512,583.799483,57342.426824,0.000000,9371.121594,253.987820,12415.285065,1217.250647,3465.030644,1.138082e+06,2.033536e+06,1.273348e+05
2,0.000000,0.000000,0.000000,1731.737952,3995.250069,562.508254,0.000000,181.267014,0.000000,211.807458,86705.838045,2220.989030,39521.113254,449.903746,5294.075421,7258.527181,2360.692782,2230.335842,4746.109479,3430.414044,2534.428668,23916.033206,5.296711e+04,4278.013643,25919.317740,218.391683,1234.880145,13760.759509,1343.636866,15913.337755,2911.830422,26219.449966,6317.144171,3.358493e+05,0.001748,328.106387,39.487448,22343.036966,1658.007231,2055.223981,0.000000,2376.693641,3997.301349,1056.888078,4.014445e+05,7.436298e+05,5.654798e+04
3,0.000000,2672.189928,0.000000,94.561986,0.000000,0.000000,2129.264680,0.000000,0.000000,0.000000,49121.290729,443.273420,66948.519779,434.657481,0.000000,0.000000,5211.869347,993.812809,1650.902655,0.000000,3786.186858,8147.540472,6.239263e+04,17666.475488,57566.085523,1134.561648,5410.785732,14088.210476,602.732347,155766.258197,3576.587010,23858.022486,3329.592530,1.212069e+04,0.000901,4550.623410,8163.213997,1948.253067,0.000000,3575.725906,0.000000,1061.280373,0.000000,3629.147982,7.199613e+05,1.275512e+06,3.018831e+05
4,0.000000,8911.913747,0.000000,22469.635814,1624.563602,7755.338861,0.000000,8886.237503,0.000000,0.000000,790518.367838,26739.996798,392637.619681,0.000000,170897.126636,7479.105856,25645.619233,52787.568714,53050.173847,23978.255141,5048.944674,113671.838662,1.089417e+06,217428.443030,501488.031274,0.000000,6905.471330,137955.596855,0.000000,269985.782167,129138.747335,536115.645320,58814.594023,7.214307e+06,0.000078,10430.904898,58470.340866,44148.812264,6660.390475,5337.305140,0.000000,36145.172415,40954.813217,65518.278247,7.275846e+06,1.301274e+07,1.797610e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331313,362.390948,992.681035,2068.121114,260.349544,316.508499,0.000000,298.312521,0.000000,112.635680,392.738686,50437.660151,4469.028381,92466.053505,117.430747,6044.766148,1035.111273,972.697009,993.158569,1250.776576,943.535093,2870.155451,2049.797486,1.320265e+04,2251.043351,38214.616564,371.768906,937.367092,6934.300288,888.597777,17701.803073,847.845783,15227.675305,3686.257977,6.900475e+02,0.004716,838.881231,938.125302,6682.272450,1052.873953,804.977090,489.496254,4155.716466,1741.168901,5001.590085,1.799795e+05,3.174157e+05,1.902812e+04
331314,372.663608,724.611709,0.000000,0.000000,35.160427,0.000000,829.251021,608.686856,0.000000,0.000000,88656.799304,0.000000,13176.904401,1042.452642,577.555315,1010.994956,853.626242,658.551980,4850.466761,662.469186,0.000000,18824.572962,5.653982e+04,5864.233023,21422.392967,531.357432,2174.363275,6732.846478,879.626762,33613.033575,3089.377985,7933.601307,2556.600206,2.081537e+05,0.004529,597.363041,348.786817,9553.137868,479.746291,2543.308891,360.295789,4413.866547,925.472703,1162.140184,3.706437e+05,6.465320e+05,1.202575e+04
331315,12.442780,0.000000,0.000000,0.000000,77.353153,133.785501,181.986353,254.284758,19.052205,14.854769,3424.966362,85.885916,1172.260957,245.247535,0.000000,138.028785,27.924660,300.020467,648.970828,0.000000,191.587576,231.235996,7.506321e+02,476.929853,2679.400691,124.067507,209.814890,952.209344,173.123581,1926.201105,183.520701,585.860364,271.621016,1.082461e+03,0.025516,118.110472,179.070468,702.866085,818.625625,79.789012,61.177457,159.688245,73.685651,63.775938,2.602718e+04,4.795381e+04,1.629098e+03
331316,59.816495,559.819517,899.679879,448.216709,0.000000,2819.140971,2075.140984,1882.903506,4726.652404,2184.649715,11797.323451,893.115372,111251.297405,1149.331822,7077.478208,33031.760020,0.000000,3585.608417,189.923909,506.207177,630.546302,0.000000,3.442723e+02,1805.169366,29501.166536,673.086272,1867.260163,5871.036522,2330.769157,4417.229336,705.743482,0.000000,648.056076,3.099320e+03,0.001744,575.907007,995.043718,2106.405927,0.000000,0.000000,203.907664,456.791537,4348.993633,2457.059358,7.204364e+05,1.381995e+06,7.844661e+02


In [19]:
# Extract the metadata columns and make sure they all hold strings
# (numerical metadata values would run into errors later on).
# The selection and the cast are done in one step on a new DataFrame:
# `DataFrame.applymap` is deprecated since pandas 2.1, and assigning into the
# result of `filter` may trigger a chained-assignment warning.
metadata = all_events[metadata_columns].astype(str)
metadata

Unnamed: 0,Cell_Index,Condition,Control,Replicate
0,1,PDO21 + hIL8 12.5ng,PDO21,02
1,2,PDO21 + hIL8 12.5ng,PDO21,02
2,3,PDO21 + hIL8 12.5ng,PDO21,02
3,4,PDO21 + hIL8 12.5ng,PDO21,02
4,5,PDO21 + hIL8 12.5ng,PDO21,02
...,...,...,...,...
331313,3477,PDO21 + hWnt3a 50ng,PDO21,01
331314,3478,PDO21 + hWnt3a 50ng,PDO21,01
331315,3479,PDO21 + hWnt3a 50ng,PDO21,01
331316,3480,PDO21 + hWnt3a 50ng,PDO21,01


### Select a subset of data (optional)

In [None]:
#Batches:
#Batch 1 = PDO27wt/ko exp B BM/MOPC21/B7C18
#Batch 2 = PDO27 ABCEDF7 Tr
#Batch 3 = PDO27 ABCDEF7 NT
#Batch 4 = PDO21/23/216 ABE7 Tr
#Batch 5 = PDO21/23/216 ABE7 NT 
#Batch 6 = PDO5/11 ABE7 Tr/NT
#Batch 7 = PDO75/99 ABE7 Tr/NT
#Batch 8 = PDO109/141 ABE7 Tr/NT
#Batch 9 = NT/eGFP/eGFP-stIL15 ABE7

#### Configuration

In [None]:
# To enable this process, set this variable to True, False otherwise
should_select_a_subset = False

# Define here the filter to apply: each metadata column is mapped to the
# values to keep, and a row is selected only when it matches every entry
subset_filters = {
    'Patient': ['X','5','11','21','23','27','75','99','109','141','216'],
    'gd_donor': ['A','B','E','7'],
    'Transduction': ['eGFP-stIL15'],
    'Treatment': ['BM','B7C18'],
    'Batch': ['Batch2','Batch4','Batch6','Batch7','Batch8'],
}

# Start from a mask that keeps every row
subset_condition = pd.Series(True, index=metadata.index)

# Only look the filter columns up when the selection is enabled: they may not
# exist in `metadata`, in which case building the mask unconditionally raises
# a KeyError even though the selection is switched off
if should_select_a_subset:
    for column_name, allowed_values in subset_filters.items():
        subset_condition = subset_condition & metadata[column_name].isin(allowed_values)

#### Select the data

In [None]:
if should_select_a_subset:
    #Select eGFP-stIL15 / ABE7 / wt PDO / BM / B7C18 (I was just selecting the data I wanted to use)
    data = data.loc[subset_condition]

# Displayed from the top level of the cell: an expression nested inside the
# `if` block is never rendered by Jupyter
data

#### Select the metadata

In [None]:
if should_select_a_subset:
    #selecting the corresponding metadata
    metadata = metadata.loc[subset_condition]

# Displayed from the top level of the cell: an expression nested inside the
# `if` block is never rendered by Jupyter
metadata

### Arcsinh transformation

#### Configuration

In [None]:
# Value the raw data is divided by before the arcsinh transformation is applied
arcsinh_cofactor = 5

#### Data processing

In [None]:
# Scale all the raw data by the cofactor, then apply the arcsinh transformation
scaled_data = data / arcsinh_cofactor
data = np.arcsinh(scaled_data)
data

### Batch effect correction

In [None]:
import scprep

# Data centering by batch to correct any cytof batch effect
# Only if 'Batch' is a metadata
if 'Batch' in metadata.columns:
    data = scprep.normalize.batch_mean_center(data, sample_idx=metadata['Batch'])

# Displayed from the top level of the cell: an expression nested inside the
# `if` block is never rendered by Jupyter
data

### Re-assemble processed data with metadata

#### Concatenate data with metadata

In [None]:
# Put the processed data (arcsinh-transformed and, when applicable,
# mean-centered) side by side with its metadata again
frames_to_combine = [data, metadata]
processed_data = pd.concat(frames_to_combine, axis='columns')
processed_data

#### Re-index the Dataframe

In [None]:
# Replace the index with consecutive integers starting at 0
row_count = len(processed_data)
processed_data.index = np.arange(row_count)

#### Ensure the metadata columns are of type string

In [None]:

# Cast every metadata column to string.
# `astype(str)` replaces `DataFrame.applymap(str)`, which is deprecated since
# pandas 2.1, and matches the cast used when building the `Condition` column.
processed_data[metadata_columns] = processed_data[metadata_columns].astype(str)

### Store the `Condition` information (optional)

#### Configuration

In [None]:
# Columns whose values are joined with '_' to build the `Condition` label,
# used only when `Condition` is not already part of the metadata
condition_colmns = ['Patient', 'Culture', 'gd_donor', 'Transduction', 'Treatment', 'Batch', 'Date', 'Replicate']

#### Generate the `Condition` column

In [None]:
if 'Condition' not in metadata.columns:
    # Create a condition column for every cell in the experiment
    processed_data['Condition'] = processed_data[condition_colmns].astype(str).agg('_'.join, axis=1)

    # Add `Condition` to the list of metadata columns.
    # `list += 'Condition'` would extend the list with each single character
    # ('C', 'o', 'n', ...), so the name is appended instead; the membership
    # check keeps the cell safe to run more than once.
    if 'Condition' not in metadata_columns:
        metadata_columns.append('Condition')

processed_data

### Store the `Control` information (optional)

#### Configuration

In [None]:
# All gd monoculture controls including their transduction, treatment and batch.
control_columns = ['gd_donor', 'Transduction', 'Treatment', 'Batch', 'Date']

#### Generate the `Control` column

In [None]:
if 'Control' not in metadata.columns:
    # Define control for pairwise EMD.
    processed_data['Control'] = "X_gd_" + processed_data[control_columns].astype(str).agg('_'.join, axis=1)

    # Add `Control` to the list of metadata columns.
    # `list += 'Control'` would extend the list with each single character
    # ('C', 'o', 'n', ...), so the name is appended instead; the membership
    # check keeps the cell safe to run more than once.
    if 'Control' not in metadata_columns:
        metadata_columns.append('Control')

processed_data

### Initialise EMD dataframe

#### Compute the markers list

In [None]:
# For each column in the Dataframe, keep only the ones not in the `metadata_columns` variable
markers_list = [col for col in processed_data.columns if col not in metadata_columns]
# marker_list = list(processed_data.columns.values)
markers_list

#### Compute the conditions list

In [None]:
# Get the list of unique conditions
conditions_list = pd.unique(processed_data['Condition'].tolist())

#### Compute the controls list (unused)

In [None]:
# Get the list of unique controls
controls_list = pd.unique(processed_data['Control'].tolist())

#### Create the DataFrame that will receive the EMD values

In [None]:
# Empty df with NaN values to populate with the EMD values
emd_dataframe = pd.DataFrame(
    np.full(
        (len(conditions_list), len(markers_list)), 
        np.nan),
    columns = markers_list,
    index = conditions_list)


### Calculate EMD scores

In [None]:
# Loop over all the conditions
for condition in conditions_list:

    # Dataframe of all events for the condition in the list
    condition_events = processed_data.loc[(processed_data["Condition"] == condition)]

    control_name = condition_events['Control'].values[0]
    print(control_name)

    # Dataframe of all events from the control that will be compared with the events of the current condition
    control_df = processed_data.loc[processed_data["Condition"].str.startswith(control_name)]

    # Loop over all the markers
    for marker in markers_list:

        # Check the sign by using the `median` values
        sign = np.sign(condition_events[marker].median() - control_df[marker].median())

        # In case the median values are equal, use the `mean` instead
        if sign == 0:
            sign = np.sign(condition_events[marker].mean() - control_df[marker].mean())

        # Compute the EMD by multiplying the sign by the EMD score
        emd = scprep.stats.EMD(
            condition_events[marker], 
            control_df[marker]
        )

        # Store the signed EMD in the result Dataframe for the given (condition, marke) pair
        emd_dataframe.loc[condition, marker] = sign * emd

# Ensure that all (condition, marke) pairs have been properly computed
assert not emd_dataframe.isna().values.any()

In [None]:
# Display the signed EMD values: one row per condition, one column per marker
emd_dataframe