# Data Extraction

In [37]:
import pandas as pd
import numpy as np
# CSV files processed by the matrix helpers in this notebook, one per sample.
filelist = [
    "LTSvsSTS-Data/NU00759_LTSvsSTS.csv",
    "LTSvsSTS-Data/NU02514_LTSvsSTS.csv",
    "LTSvsSTS-Data/NU01405_LTSvsSTS.csv",
    "LTSvsSTS-Data/NU00908_LTSvsSTS.csv",
]

# Per-file cut-off for every marker column, keyed by the same paths as
# `filelist`. threshold_data() turns a value into 1 when it is greater than or
# equal to the cut-off listed here and into 0 otherwise.
thresholddict = {
    "LTSvsSTS-Data/NU00759_LTSvsSTS.csv": dict(
        CD11c_R=1.62, CD163_R=1.39, CD205_R=1.74, CD206_R=1.69, CD8_R=3.91,
        CD4_R=1.42, CD103_R=1.64, FOXP3_R=2.2, GFAP_R=6.5, GRZMB_R=6.42,
        HLADR_R=2.63, INFgamma_R=4.94, Ki67_R=2.80, NFAT1_R=2.20, NFAT2_R=4.49,
        P2RY12_R=1.43, PD1_R=2.12, PDL1_R=1.57, Perforin_R=3.0, SOX2_R=1.48,
        TIM3_R=1.40, TNFa_R=3.78, cCasp3_R=1.38, pLCK_R=2.55, pSTAT3_R=1.42,
        CD68_R=1.89,
    ),
    "LTSvsSTS-Data/NU02514_LTSvsSTS.csv": dict(
        CD11c_R=1.87, CD163_R=1.50, CD205_R=1.49, CD206_R=1.64, CD8_R=1.74,
        CD4_R=1.405, CD103_R=1.64, FOXP3_R=1.506, GFAP_R=2.44, GRZMB_R=1.60,
        HLADR_R=1.76, INFgamma_R=2.62, Ki67_R=1.91, NFAT1_R=2.18, NFAT2_R=2.00,
        P2RY12_R=1.54, PD1_R=1.55, PDL1_R=1.54, Perforin_R=1.70, SOX2_R=2.00,
        TIM3_R=1.63, TNFa_R=2.40, cCasp3_R=1.42, pLCK_R=1.85, pSTAT3_R=1.48,
        CD68_R=1.74,
    ),
    "LTSvsSTS-Data/NU01405_LTSvsSTS.csv": dict(
        CD11c_R=1.47, CD163_R=1.36, CD205_R=1.63, CD206_R=1.71, CD8_R=2.22,
        CD4_R=1.47, CD103_R=1.32, FOXP3_R=1.53, GFAP_R=3.05, GRZMB_R=1.75,
        HLADR_R=1.80, INFgamma_R=1.78, Ki67_R=1.90, NFAT1_R=1.74, NFAT2_R=1.80,
        P2RY12_R=1.37, PD1_R=1.64, PDL1_R=1.61, Perforin_R=1.73, SOX2_R=1.52,
        TIM3_R=1.37, TNFa_R=1.86, cCasp3_R=1.32, pLCK_R=1.62, pSTAT3_R=1.34,
        CD68_R=1.45,
    ),
    "LTSvsSTS-Data/NU00908_LTSvsSTS.csv": dict(
        CD11c_R=1.50, CD163_R=1.25, CD205_R=1.42, CD206_R=1.49, CD8_R=1.96,
        CD4_R=1.40, CD103_R=1.31, FOXP3_R=1.51, GFAP_R=2.87, GRZMB_R=3.44,
        HLADR_R=1.62, INFgamma_R=1.64, Ki67_R=1.80, NFAT1_R=2.04, NFAT2_R=1.78,
        P2RY12_R=1.18, PD1_R=1.38, PDL1_R=1.40, Perforin_R=2.70, SOX2_R=1.35,
        TIM3_R=1.18, TNFa_R=2.20, cCasp3_R=1.17, pLCK_R=1.68, pSTAT3_R=1.25,
        CD68_R=1.53,
    ),
}

# Phenotype name -> list of marker columns that define it. The counting helpers
# treat a row (cell) as belonging to a phenotype when every listed column is 1
# after thresholding.
#
# Each name is a run of marker names, each followed by "+", and every marker
# lives in the column "<marker>_R", so the column list is derived from the name
# itself instead of being spelled out a second time.
phenotypedict = {
    phenotype: [f"{marker}_R" for marker in phenotype.rstrip("+").split("+")]
    for phenotype in (
        # Single marker phenotypes
        "CD11c+", "CD163+", "CD205+", "CD206+", "CD8+",
        "CD4+", "CD103+", "FOXP3+", "GFAP+", "GRZMB+",
        "HLADR+", "INFgamma+", "Ki67+", "NFAT1+", "NFAT2+",
        "P2RY12+", "PD1+", "PDL1+", "Perforin+", "SOX2+",
        "TIM3+", "TNFa+", "cCasp3+", "pLCK+", "pSTAT3+",
        "CD68+",

        # GFAP phenotypes
        "GFAP+Ki67+", "GFAP+SOX2+", "GFAP+cCasp3+",
        "GFAP+pSTAT3+", "GFAP+PD1+", "GFAP+TIM3+",

        # Helper T-cell phenotypes
        "CD4+Ki67+", "CD4+pSTAT3+", "CD4+cCasp3+",
        "CD4+FOXP3+", "CD4+NFAT1+", "CD4+NFAT2+",
        "CD4+pLCK+", "CD4+NFAT1+pLCK+", "CD4+NFAT2+pLCK+",
        "CD4+PD1+", "CD4+CD103+",

        # Killer T-cell phenotypes
        "CD8+Ki67+", "CD8+pSTAT3+", "CD8+cCasp3+",
        "CD8+FOXP3+", "CD8+NFAT1+", "CD8+NFAT2+",
        "CD8+pLCK+", "CD8+NFAT1+pLCK+", "CD8+NFAT2+pLCK+",
        "CD8+PD1+", "CD8+CD103+", "CD8+Perforin+",
        "CD8+GRZMB+",

        # Macrophage/Microglia phenotypes
        "CD68+CD163+", "CD68+CD163+CD206+", "CD163+CD206+",
        "CD68+CD11c+", "CD11c+CD205+", "CD11c+CD103+",
        "CD11c+P2RY12+", "CD68+CD163+CD11c+",
    )
}

# Macrophage/Microglia populations crossed with one additional marker each.
# The outer loop runs over the additional marker and the inner loop over the
# population, which yields one 13-entry section per marker in the order
# pSTAT3, Ki67, cCasp3, HLADR, INFgamma, TNFa, CD4, CD8, TIM3, PDL1.
phenotypedict.update(
    {
        f"{population}{marker}+": [
            f"{name}_R" for name in f"{population}{marker}".split("+")
        ]
        for marker in (
            "pSTAT3", "Ki67", "cCasp3", "HLADR", "INFgamma",
            "TNFa", "CD4", "CD8", "TIM3", "PDL1",
        )
        for population in (
            "CD68+CD163+", "CD68+CD163+CD206+", "CD163+CD206+",
            "CD68+CD11c+", "CD11c+CD205+", "CD11c+CD103+",
            "P2RY12+", "CD11c+P2RY12+",
            "CD68+", "CD163+", "CD206+", "CD11c+",
            "CD68+CD163+CD11c+",
        )
    }
)

# Distance column -> {bin label: [lower, upper]}. The counting helpers treat a
# bin as lower-inclusive and upper-exclusive. Bins are built from consecutive
# edge values, so each label is "<lower>-<upper>".
distancedict = {
    metric: {
        f"{lower}-{upper}": [lower, upper]
        for lower, upper in zip(edges, edges[1:])
    }
    for metric, edges in (
        ("cCasp3+GFAP+ Distance", (0, 100, 250, 500)),
        ("cCasp3+GFAP- Distance", (0, 250, 500)),
    )
}
    

# Load the first sample and show its leading rows as a quick look at the
# available columns.
df = pd.read_csv("LTSvsSTS-Data/NU00759_LTSvsSTS.csv")
print(df.head(n=5))


    CD11c_R   CD163_R   CD205_R   CD206_R     CD4_R    CD68_R     CD8_R  \
0  1.718650  1.279580  1.291313  1.320160  1.271361  1.480703  1.792970   
1  1.408775  1.185836  1.162614  1.277001  1.158582  1.323625  1.511998   
2  1.375834  1.215163  1.214893  1.241554  1.201734  1.326974  1.669916   
3  1.291155  1.157090  1.143222  1.193628  1.137302  1.338515  1.589854   
4  1.361338  1.217770  1.226446  1.240057  1.220055  1.607614  2.043101   

     GFAP_R  P2RY12_R    TIM3_R  ...   NFAT2_R     PD1_R    PDL1_R  \
0  6.366225  1.238896  1.300555  ...  1.361937  1.248473  1.211943   
1  1.475472  1.144202  1.177474  ...  1.267683  1.180220  1.136906   
2  2.159774  1.193256  1.216544  ...  1.311968  1.182485  1.176295   
3  1.290747  1.128667  1.168130  ...  1.233378  1.155979  1.111266   
4  1.452176  1.200380  1.216018  ...  1.526071  1.209795  1.177958   

   Perforin_R    pLCK_R  pSTAT3_R    SOX2_R    TNFa_R  cCasp3+GFAP+ Distance  \
0    1.279095  2.016448  1.141481  1.121672  1.6

In [30]:
def threshold_data(df, thresholds):
    """Binarise the marker columns of *df* in place.

    Every column named in *thresholds* is overwritten with 1 where its value
    is greater than or equal to that column's cut-off and with 0 otherwise.
    Missing values compare as False and therefore become 0. Columns that are
    not named in *thresholds* are left untouched.

    The operation is not idempotent: calling it a second time compares the
    0/1 values produced by the first call against the cut-offs again.

    Args:
        df: DataFrame containing one column per key of *thresholds*; it is
            modified in place.
        thresholds: Mapping of column name to cut-off value.

    Returns:
        None.

    Raises:
        KeyError: If a column named in *thresholds* is missing from *df*.
    """
    for column, cutoff in thresholds.items():
        # One vectorised comparison per column instead of a Python-level
        # lambda call per cell.
        df[column] = (df[column] >= cutoff).astype(int)

def quantify_phenotypes(df, phenotypedict):
    """Count the rows of *df* that match each phenotype.

    A row matches a phenotype when the row-wise sum over the phenotype's
    columns equals the number of those columns, i.e. when every column holds
    1 in 0/1 data such as that produced by threshold_data().

    Args:
        df: DataFrame containing every column referenced by *phenotypedict*.
        phenotypedict: Mapping of phenotype name to its list of column names.

    Returns:
        A list with one count per phenotype, in the iteration order of
        *phenotypedict*.
    """
    counts = []
    for marker_columns in phenotypedict.values():
        positives_per_row = df[marker_columns].sum(axis=1)
        is_match = positives_per_row == len(marker_columns)
        counts.append(is_match.sum())
    return counts

def phenotype_matrix(filelist, thresholddict, phenotypedict, proportion=False):
    """Build a phenotype-by-sample table of counts or percentages.

    Each file is read, thresholded with its own cut-offs via threshold_data(),
    and counted with quantify_phenotypes().

    Args:
        filelist: Paths of the CSV files to process; each path must also be a
            key of *thresholddict*.
        thresholddict: Mapping of file path to that file's column cut-offs.
        phenotypedict: Mapping of phenotype name to its list of column names.
        proportion: When True, report each count as a percentage of the number
            of rows in the file instead of the raw count.

    Returns:
        A DataFrame with one row per phenotype and one column per file. The
        column label is the file path with the "LTSvsSTS-Data/" prefix and the
        "_LTSvsSTS.csv" suffix removed.
    """
    per_sample = {}

    for path in filelist:
        cells = pd.read_csv(path)
        threshold_data(cells, thresholddict[path])
        counts = quantify_phenotypes(cells, phenotypedict)

        sample = path.replace("LTSvsSTS-Data/", "").replace("_LTSvsSTS.csv", "")
        if not proportion:
            per_sample[sample] = counts
            continue

        # Percentage of all rows in this file that match each phenotype.
        total_rows = len(cells.index)
        per_sample[sample] = np.array(counts) / total_rows * 100

    return pd.DataFrame(per_sample, index=list(phenotypedict.keys()))

def count_matrix(filelist):
    """Report the number of rows in each file.

    Args:
        filelist: Paths of the CSV files to read.

    Returns:
        A single-row DataFrame indexed by "Cells" with one column per file.
        The column label is the file path with the "LTSvsSTS-Data/" prefix
        and the "_LTSvsSTS.csv" suffix removed.
    """
    rows_per_sample = {
        path.replace("LTSvsSTS-Data/", "").replace("_LTSvsSTS.csv", ""): len(
            pd.read_csv(path).index
        )
        for path in filelist
    }
    return pd.DataFrame(rows_per_sample, index=["Cells"])

# Raw phenotype counts per sample.
print(
    phenotype_matrix(
        filelist=filelist,
        thresholddict=thresholddict,
        phenotypedict=phenotypedict,
        proportion=False,
    )
)
# The same phenotypes as a percentage of each sample's rows.
print(
    phenotype_matrix(
        filelist=filelist,
        thresholddict=thresholddict,
        phenotypedict=phenotypedict,
        proportion=True,
    )
)
# Number of rows per sample.
print(count_matrix(filelist=filelist))

                   NU00759  NU02514  NU01405  NU00908
CD11c+                8761    12725    15041     6340
CD163+                7073      766      910      111
CD205+                  86      563      262       91
CD206+                7752     1442    19068      615
CD8+                    71       68       22       48
...                    ...      ...      ...      ...
CD68+CD11c+           2769       92     1967      992
CD11c+CD205+            43      151       36       77
CD11c+CD103+          2510      154     2226      702
CD11c+P2RY12+          182     4969    13760     5067
CD68+CD163+CD11c+     2237       45      315       33

[64 rows x 4 columns]
                     NU00759    NU02514    NU01405    NU00908
CD11c+             32.151639  25.338006  22.065252  10.640262
CD163+             25.956916   1.525258   1.334976   0.186288
CD205+              0.315608   1.121045   0.384356   0.152723
CD206+             28.448750   2.871309  27.972890   1.032139
CD8+               

In [38]:
def quantify_phenotypes_distance(df, phenotypedict, distancedict, metric):
    """Count the rows matching each phenotype within each distance bin.

    A row matches a phenotype when the row-wise sum over the phenotype's
    columns equals the number of those columns (every column is 1 in 0/1
    data). A row falls in a bin when ``lower <= df[metric] < upper``.

    Args:
        df: DataFrame containing the phenotype columns and the *metric*
            column.
        phenotypedict: Mapping of phenotype name to its list of column names.
        distancedict: Mapping of metric name to ``{bin label: [lower, upper]}``.
        metric: Name of the distance column in *df* and key of *distancedict*.

    Returns:
        A flat list of counts ordered phenotype-major: all bins of the first
        phenotype, then all bins of the second, and so on.

    Raises:
        KeyError: If *metric* is missing from *distancedict* or *df*, or if a
            phenotype column is missing from *df*.
    """
    distances = distancedict[metric]
    distance_values = df[metric]

    # The bin membership of a row does not depend on the phenotype, so the
    # masks are built once here rather than once per phenotype.
    bin_masks = [
        (distance_values >= bounds[0]) & (distance_values < bounds[1])
        for bounds in distances.values()
    ]

    file_counts = []
    for cols in phenotypedict.values():
        is_match = df[cols].sum(axis=1) == len(cols)
        file_counts.extend((is_match & in_bin).sum() for in_bin in bin_masks)

    return file_counts

def phenotype_distance_matrix(filelist, thresholddict, phenotypedict, distancedict, metric, proportion=False):
    """Build a (phenotype, distance bin)-by-sample table of counts or percentages.

    Each file is read, thresholded with its own cut-offs via threshold_data(),
    and counted with quantify_phenotypes_distance().

    Args:
        filelist: Paths of the CSV files to process; each path must also be a
            key of *thresholddict*.
        thresholddict: Mapping of file path to that file's column cut-offs.
        phenotypedict: Mapping of phenotype name to its list of column names.
        distancedict: Mapping of metric name to ``{bin label: [lower, upper]}``.
        metric: Name of the distance column and key of *distancedict*.
        proportion: When True, report each count as a percentage of all rows
            of the file that fall in the same distance bin.

    Returns:
        A DataFrame with one row per "<phenotype> <bin label>" pair
        (phenotype-major) and one column per file. The column label is the
        file path with the "LTSvsSTS-Data/" prefix and the "_LTSvsSTS.csv"
        suffix removed. With ``proportion=True`` a bin that contains no rows
        yields NaN.
    """
    bins = distancedict[metric]
    row_names = [f"{phenotype} {label}" for phenotype in phenotypedict for label in bins]

    matrix = {}
    for file in filelist:
        df = pd.read_csv(file)
        threshold_data(df, thresholddict[file])

        file_counts = quantify_phenotypes_distance(df, phenotypedict, distancedict, metric)
        column_name = file.replace("LTSvsSTS-Data/", "").replace("_LTSvsSTS.csv", "")

        if not proportion:
            matrix[column_name] = file_counts
            continue

        # Total number of rows per distance bin, regardless of phenotype.
        distance_values = df[metric]
        bin_totals = np.array([
            ((distance_values >= bounds[0]) & (distance_values < bounds[1])).sum()
            for bounds in bins.values()
        ])

        # file_counts is phenotype-major, so the per-bin totals repeat once per
        # phenotype. Tiling by the phenotype count (rather than dividing the
        # two lengths) also works when the metric defines no bins at all.
        denominators = np.tile(bin_totals, len(phenotypedict))

        # An empty bin gives 0 / 0; the resulting NaN is the intended
        # "undefined" marker, so the numpy warning is silenced explicitly.
        with np.errstate(divide="ignore", invalid="ignore"):
            matrix[column_name] = (np.array(file_counts) / denominators) * 100

    return pd.DataFrame(matrix, index=row_names)

def distance_matrix(filelist, distancedict, metric):
    """Count the rows of each file that fall in each distance bin.

    A row falls in a bin when ``lower <= value < upper`` for the value in the
    *metric* column.

    Args:
        filelist: Paths of the CSV files to read.
        distancedict: Mapping of metric name to ``{bin label: [lower, upper]}``.
        metric: Name of the distance column and key of *distancedict*.

    Returns:
        A DataFrame with one row per bin label and one column per file. The
        column label is the file path with the "LTSvsSTS-Data/" prefix and the
        "_LTSvsSTS.csv" suffix removed.
    """
    bins = distancedict[metric]
    per_sample = {}

    for path in filelist:
        distance_values = pd.read_csv(path)[metric]
        sample = path.replace("LTSvsSTS-Data/", "").replace("_LTSvsSTS.csv", "")
        per_sample[sample] = [
            ((distance_values >= bounds[0]) & (distance_values < bounds[1])).sum()
            for bounds in bins.values()
        ]

    return pd.DataFrame(per_sample, index=list(bins))
        
# Raw phenotype counts per distance bin and sample.
print(
    phenotype_distance_matrix(
        filelist=filelist,
        thresholddict=thresholddict,
        phenotypedict=phenotypedict,
        distancedict=distancedict,
        metric="cCasp3+GFAP+ Distance",
        proportion=False,
    )
)
# The same counts as a percentage of the rows in each distance bin.
print(
    phenotype_distance_matrix(
        filelist=filelist,
        thresholddict=thresholddict,
        phenotypedict=phenotypedict,
        distancedict=distancedict,
        metric="cCasp3+GFAP+ Distance",
        proportion=True,
    )
)
# Number of rows per distance bin and sample.
print(
    distance_matrix(
        filelist=filelist,
        distancedict=distancedict,
        metric="cCasp3+GFAP+ Distance",
    )
)

                           NU00759  NU02514  NU01405  NU00908
CD11c+ 0-100                  1930     3905    12297     5550
CD11c+ 100-250                2879     4828      303      245
CD11c+ 250-500                1837     2651       15        0
CD163+ 0-100                  1647      256      723       85
CD163+ 100-250                2617      319       19       11
...                            ...      ...      ...      ...
CD11c+P2RY12+ 100-250           93     1630      280      207
CD11c+P2RY12+ 250-500            9      794       15        0
CD68+CD163+CD11c+ 0-100        412       11      220       32
CD68+CD163+CD11c+ 100-250      649       21        9        0
CD68+CD163+CD11c+ 250-500      566       11        0        0

[192 rows x 4 columns]
                             NU00759    NU02514    NU01405    NU00908
CD11c+ 0-100               38.022065  28.553671  21.428945  10.786753
CD11c+ 100-250             41.484150  26.582975  24.162679   8.623724
CD11c+ 250-500        