# Data Extraction

In [3]:
import pandas as pd
import numpy as np
# Load the CSV for one file (NU00759) into a DataFrame and print its first
# rows as a quick look at the available columns.
df = pd.read_csv("LTSvsSTS-Data/NU00759_LTSvsSTS.csv")
print(df.head())


    CD11c_R   CD163_R   CD205_R   CD206_R     CD4_R    CD68_R     CD8_R  \
0  1.718650  1.279580  1.291313  1.320160  1.271361  1.480703  1.792970   
1  1.408775  1.185836  1.162614  1.277001  1.158582  1.323625  1.511998   
2  1.375834  1.215163  1.214893  1.241554  1.201734  1.326974  1.669916   
3  1.291155  1.157090  1.143222  1.193628  1.137302  1.338515  1.589854   
4  1.361338  1.217770  1.226446  1.240057  1.220055  1.607614  2.043101   

     GFAP_R  P2RY12_R    TIM3_R  ...   NFAT2_R     PD1_R    PDL1_R  \
0  6.366225  1.238896  1.300555  ...  1.361937  1.248473  1.211943   
1  1.475472  1.144202  1.177474  ...  1.267683  1.180220  1.136906   
2  2.159774  1.193256  1.216544  ...  1.311968  1.182485  1.176295   
3  1.290747  1.128667  1.168130  ...  1.233378  1.155979  1.111266   
4  1.452176  1.200380  1.216018  ...  1.526071  1.209795  1.177958   

   Perforin_R    pLCK_R  pSTAT3_R    SOX2_R    TNFa_R  cCasp3+GFAP+ Distance  \
0    1.279095  2.016448  1.141481  1.121672  1.6

In [20]:
def threshold_counts(filelist, thresholddict, phenotypedict,
                     prefix="LTSvsSTS-Data/", suffix="_LTSvsSTS.csv"):
    """Count phenotype-positive rows in each CSV file.

    Each file is read with ``pd.read_csv``. Every column that has a threshold
    for that file is binarized (1 if value >= threshold, else 0). A row counts
    toward a phenotype when the values of all of that phenotype's columns add
    up to the number of columns, i.e. every thresholded marker is positive.

    Parameters
    ----------
    filelist : iterable of str
        Paths of the CSV files to process; each yields one output column.
    thresholddict : dict
        Maps each path in ``filelist`` to a ``{column name: threshold}`` dict.
    phenotypedict : dict
        Maps a phenotype label to the list of column names that define it.
        Columns that have no threshold are summed with their raw values.
    prefix, suffix : str, optional
        Substrings removed from a path (via ``str.replace``) to form its
        column label in the result. The defaults match the layout of the
        ``LTSvsSTS-Data`` directory.

    Returns
    -------
    pandas.DataFrame
        One row per phenotype (in ``phenotypedict`` order) and one column per
        file; each value is the number of matching rows.

    Raises
    ------
    KeyError
        If a file has no entry in ``thresholddict`` or a referenced column is
        missing from the file.
    """
    row_names = list(phenotypedict.keys())
    count_matrix = {}

    for file in filelist:
        table = pd.read_csv(file)
        # Thresholds are specific to each file.
        thresholds = thresholddict[file]

        # Vectorized binarization. NaN compares False and so becomes 0, which
        # is what a row-wise `1 if x >= t else 0` yields as well.
        for marker, cutoff in thresholds.items():
            table[marker] = (table[marker] >= cutoff).astype(int)

        file_counts = []
        for markers in phenotypedict.values():
            # Keep the per-row total in a local Series. Storing it in a 'sum'
            # column of the frame would overwrite a real data column with that
            # name and corrupt any later phenotype that refers to it.
            positives_per_row = table[markers].sum(axis=1)
            file_counts.append((positives_per_row == len(markers)).sum())

        # Derive a short column label from the file path.
        column_name = file.replace(prefix, "").replace(suffix, "")
        count_matrix[column_name] = file_counts

    return pd.DataFrame(count_matrix, index=row_names)

# Input CSV paths. threshold_counts() produces one output column per entry,
# in this order.
filelist = ["LTSvsSTS-Data/NU00759_LTSvsSTS.csv", "LTSvsSTS-Data/NU02514_LTSvsSTS.csv", "LTSvsSTS-Data/NU01405_LTSvsSTS.csv", "LTSvsSTS-Data/NU00908_LTSvsSTS.csv"]
# Per-file cutoffs: maps each CSV path to {column name: threshold}.
# threshold_counts() treats a value >= its threshold as positive (1), else 0.
# Keys must match the entries of `filelist` exactly, since they are looked up
# by path.
# NOTE(review): the origin of these cutoff values is not recorded here;
# presumably they were chosen per file by hand. Confirm and document.
thresholddict = {
    "LTSvsSTS-Data/NU00759_LTSvsSTS.csv": {"CD11c_R": 1.62, "CD163_R": 1.39, "CD205_R": 1.74, "CD206_R": 1.69, "CD8_R": 3.91,
                                           "CD4_R": 1.42, "CD103_R": 1.64, "FOXP3_R": 2.2, "GFAP_R": 6.5, "GRZMB_R": 6.42,
                                           "HLADR_R": 2.63, "INFgamma_R": 4.94, "Ki67_R": 2.80, "NFAT1_R": 2.20, "NFAT2_R": 4.49,
                                           "P2RY12_R": 1.43, "PD1_R": 2.12, "PDL1_R": 1.57, "Perforin_R": 3.0, "SOX2_R": 1.48,
                                           "TIM3_R": 1.40, "TNFa_R": 3.78, "cCasp3_R": 1.38, "pLCK_R": 2.55, "pSTAT3_R": 1.42,
                                           "CD68_R": 1.89},
    "LTSvsSTS-Data/NU02514_LTSvsSTS.csv": {"CD11c_R": 1.87, "CD163_R": 1.50,"CD205_R": 1.49, "CD206_R": 1.64, "CD8_R": 1.74, 
                                           "CD4_R": 1.405, "CD103_R": 1.64, "FOXP3_R": 1.506, "GFAP_R": 2.44, "GRZMB_R": 1.60,
                                           "HLADR_R": 1.76, "INFgamma_R": 2.62, "Ki67_R": 1.91, "NFAT1_R": 2.18, "NFAT2_R": 2.00,
                                           "P2RY12_R": 1.54, "PD1_R": 1.55, "PDL1_R": 1.54, "Perforin_R": 1.70, "SOX2_R": 2.00, 
                                           "TIM3_R": 1.63, "TNFa_R": 2.40, "cCasp3_R": 1.42, "pLCK_R": 1.85, "pSTAT3_R": 1.48, 
                                           "CD68_R": 1.74},
    "LTSvsSTS-Data/NU01405_LTSvsSTS.csv": {"CD11c_R": 1.47, "CD163_R": 1.36, "CD205_R": 1.63, "CD206_R": 1.71, "CD8_R": 2.22,
                                           "CD4_R": 1.47, "CD103_R": 1.32, "FOXP3_R": 1.53, "GFAP_R": 3.05, "GRZMB_R": 1.75,
                                           "HLADR_R": 1.80, "INFgamma_R": 1.78, "Ki67_R": 1.90, "NFAT1_R": 1.74, "NFAT2_R": 1.80,
                                           "P2RY12_R": 1.37, "PD1_R": 1.64, "PDL1_R": 1.61, "Perforin_R": 1.73, "SOX2_R": 1.52,
                                           "TIM3_R": 1.37, "TNFa_R": 1.86, "cCasp3_R": 1.32, "pLCK_R": 1.62, "pSTAT3_R": 1.34,
                                           "CD68_R": 1.45},
    "LTSvsSTS-Data/NU00908_LTSvsSTS.csv": {"CD11c_R": 1.50, "CD163_R": 1.25, "CD205_R": 1.42, "CD206_R": 1.49, "CD8_R": 1.96, 
                                           "CD4_R": 1.40, "CD103_R": 1.31, "FOXP3_R": 1.51, "GFAP_R": 2.87, "GRZMB_R": 3.44,
                                           "HLADR_R": 1.62, "INFgamma_R": 1.64, "Ki67_R": 1.80, "NFAT1_R": 2.04, "NFAT2_R": 1.78, 
                                           "P2RY12_R": 1.18, "PD1_R": 1.38, "PDL1_R": 1.40, "Perforin_R": 2.70, "SOX2_R": 1.35,
                                           "TIM3_R": 1.18, "TNFa_R": 2.20, "cCasp3_R": 1.17, "pLCK_R": 1.68, "pSTAT3_R": 1.25,
                                           "CD68_R": 1.53},
}

# Maps a phenotype label (used as a row name in the result) to the list of
# columns that define it. threshold_counts() counts a row for a phenotype when
# all listed columns are positive after thresholding.
phenotypedict = {
    # Single marker phenotypes: each label maps to exactly one column.
    "CD11c+": ["CD11c_R"], "CD163+": ["CD163_R"], "CD205+": ["CD205_R"], "CD206+": ["CD206_R"], "CD8+": ["CD8_R"],
    "CD4+": ["CD4_R"], "CD103+": ["CD103_R"], "FOXP3+": ["FOXP3_R"], "GFAP+": ["GFAP_R"], "GRZMB+": ["GRZMB_R"],
    "HLADR+": ["HLADR_R"], "INFgamma+": ["INFgamma_R"], "Ki67+": ["Ki67_R"], "NFAT1+": ["NFAT1_R"], "NFAT2+": ["NFAT2_R"],
    "P2RY12+": ["P2RY12_R"], "PD1+": ["PD1_R"], "PDL1+": ["PDL1_R"], "Perforin+": ["Perforin_R"], "SOX2+": ["SOX2_R"],
    "TIM3+": ["TIM3_R"], "TNFa+": ["TNFa_R"], "cCasp3+": ["cCasp3_R"], "pLCK+": ["pLCK_R"], "pSTAT3+": ["pSTAT3_R"],
    "CD68+": ["CD68_R"],
}

# Print the phenotype-by-file table of positive-row counts.
print(threshold_counts(filelist, thresholddict, phenotypedict))

           NU00759  NU02514  NU01405  NU00908
CD11c+        8761    12725    15041     6340
CD163+        7073      766      910      111
CD205+          86      563      262       91
CD206+        7752     1442    19068      615
CD8+            71       68       22       48
CD4+           489      831       97      169
CD103+        3246      273     2810      819
FOXP3+           0        9        2        4
GFAP+         3777     9761    33479    31502
GRZMB+           1        8        0        0
HLADR+        1706     1800      629     4759
INFgamma+        4        1       18       18
Ki67+          819     3403      629      681
NFAT1+          38       11        2        3
NFAT2+           1        0        0        5
P2RY12+        200     6260    20751     8843
PD1+            21       33       11       13
PDL1+           27      103      399       27
Perforin+        0        0        0        0
SOX2+          277        0        3       10
TIM3+         7560     9178     98