In [1]:
import os
# Request 48 threads from each native thread pool. These variables are set
# before the numerical libraries are imported further down in this cell
# (presumably so the libraries pick the values up when they load).
os.environ["OMP_NUM_THREADS"] = "48" # shell equivalent: export OMP_NUM_THREADS=48
os.environ["OPENBLAS_NUM_THREADS"] = "48" # shell equivalent: export OPENBLAS_NUM_THREADS=48
os.environ["MKL_NUM_THREADS"] = "48" # shell equivalent: export MKL_NUM_THREADS=48
os.environ["VECLIB_MAXIMUM_THREADS"] = "48" # shell equivalent: export VECLIB_MAXIMUM_THREADS=48
os.environ["NUMEXPR_NUM_THREADS"] = "48" # shell equivalent: export NUMEXPR_NUM_THREADS=48


from umap import UMAP
import umap.plot

import flowkit as fk
import FlowCal as fc
import seaborn as sns
import bokeh
from bokeh.plotting import show
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import pandas as pd
import numpy as np
import skbio
import sys
import os
import glob
import joypy

import hdbscan
from sklearn.cluster import KMeans
from numba import njit, set_num_threads
from sklearn.preprocessing import RobustScaler, MinMaxScaler
# Numba thread count; same value (48) as the thread-pool environment variables
# set at the top of this cell.
# NOTE(review): presumably this must not exceed the cores available on the host - confirm.
set_num_threads(48)

# Route bokeh and matplotlib output into the notebook.
bokeh.io.output_notebook()
%matplotlib inline

# Turn matplotlib interactive mode off; the plotting cells below call
# plt.show() explicitly.
_ = plt.ioff()

# NOTE(review): absolute, user-specific path. Later cells build file paths by
# string concatenation onto this value, so keep the trailing "/".
main_dir = "/home/n10853499/01-projects/00-allison_microscope/" # change this to be the top folder where the flow cyto data is stored

2022-09-20 09:27:30.604525: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-20 09:27:30.604572: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:

# Known misspellings of genus names found in file names, mapped to the correct spelling.
_GENUS_SPELLING = {
    "Prochloroccocus": "Prochlorococcus",
    "Synecoccocus": "Synechococcus",
    "Synechoccocus": "Synechococcus",
}


def _split_dilution(name):
    """Return ``(name_without_dilution_marker, dilution_level)`` for a sample name.

    The level is 2 when the name contains "diluted heap(s)", 1 when it contains
    "diluted" otherwise, and 0 when "diluted" does not occur. The marker is
    removed only when it is a true suffix of the name; ``str.strip`` is not used
    because it removes *any* of the given characters from both ends and can
    therefore eat into the strain name itself.
    """
    if "diluted" not in name:
        return name, 0
    if "diluted heap" in name:
        level, markers = 2, (" diluted heaps", " diluted heap")
    else:
        level, markers = 1, ("_diluted", " diluted")
    for marker in markers:
        if name.endswith(marker):
            name = name[:-len(marker)]
            break
    return name, level


def reference_cultures(files = [main_dir + 'reference_cultures/Pro_and_syn/fixed/PFA_40_Prochlorococcus_MIT9312.fcs', main_dir + "reference_cultures/Pro_and_syn/fixed/PFA_15_Synechococcus_PH4_0_D8.fcs"], random_state=None):
    """Load, gate and label reference-culture FCS files and return a species-balanced subsample.

    For every file the metadata (fixation, light exposure, dilution, species and
    strain) is parsed from the file name. Files whose name contains "high" or
    "med", and a hard-coded list of strain/condition combinations, are skipped.
    Each remaining file is read with a 50000-event subsample, asinh-transformed,
    gated with ``fc.gate.density2d`` on FSC-H/SSC-H (gate_fraction=0.75) and then
    filtered with ``scatter_filter``.

    Parameters
    ----------
    files : list of str
        Paths of the FCS files to read.
    random_state : int or None, optional
        Forwarded to the per-species ``groupby(...).sample`` call. The default
        ``None`` keeps the draw unseeded.

    Returns
    -------
    scaled_df : numpy.ndarray
        ``MinMaxScaler().fit_transform`` of the selected channel columns of the
        balanced subsample.
    min_df : pandas.Series
        Column minima of the six scatter columns, computed before they are shifted.
    subsampled_df : pandas.DataFrame
        Channel columns plus the label columns ``fixed``, ``dilution``, ``strain``,
        ``strain_name``, ``species`` and ``light_exp``; every species contributes
        the same number of rows (the size of the smallest species group).
        ``strain`` and ``species`` are integer codes assigned in order of first
        appearance, starting at 1.

    Raises
    ------
    ValueError
        If no file is left after the name-based filtering.
    """
    asinh_xform = fk.transforms.AsinhTransform("asinh", param_t=262144, param_m=4.0, param_a=0.0)
    scatter_cols = ['FSC-A', 'SSC-A', 'SSC-B-A', 'FSC-H', 'SSC-H', 'SSC-B-H']

    feature_frames = []
    label_frames = []

    strains = {}       # strain name  -> integer code (1, 2, ...)
    species_map = {}   # species name -> integer code (1, 2, ...)
    exps = {0: 0, 5: 1, 15: 2, 40: 3}  # light exposure -> integer code

    for file in files:
        # Remove the extension properly; str.strip(".fcs") would also remove
        # leading/trailing 'f', 'c', 's' and '.' characters from the name itself.
        name = os.path.splitext(os.path.basename(file))[0]

        # skip non-slow speeds
        if any(speed in name for speed in ["high", "med"]):
            continue

        # get dilution level
        name, dilution = _split_dilution(name)

        tokens = name.split("_")
        print(tokens)

        light_exp = 0
        fixed = False
        if "CAS85pct" in tokens:
            species = "Picoeukaryote"
            strain = "_".join(tokens[1:])
        elif "LightExp" in tokens or "PFA" in tokens:
            # get light exposure and strain name
            fixed = "PFA" in tokens
            light_exp = int(tokens[1])
            # rename badly named files
            tokens[2] = _GENUS_SPELLING.get(tokens[2], tokens[2])
            # NOTE(review): the original also rewrote tokens[0] when "MARCIA" was
            # present here, but tokens[0] is never read in this branch, so that
            # assignment had no effect and was dropped. Confirm no MARCIA-specific
            # renaming is needed for fixed / light-exposure files.
            strain = "_".join(tokens[2:])
            species = tokens[2]
        else:
            # rename badly named files
            tokens[0] = _GENUS_SPELLING.get(tokens[0], tokens[0])
            if "MARCIA" in tokens:
                tokens[0] = "Synechococcus_MARCIA"
            if len(tokens) > 1:  # single-token names previously raised IndexError
                tokens[1] = "_".join(tokens[1].split(" ")).upper()
            strain = "_".join(tokens)
            species = tokens[0]

        print(species, strain)
        excluded = (
            strain in ["Prochlorococcus_CC9605", "Synechococcus_BBP2", "Ostreococcus_TAURI_RCC4221", "Synechococcus_CC9311", "Synechococcus_MARCIA_R1"]
            or (strain == "Synechococcus_WH8016" and fixed)
            # or (strain == "Prochlorococcus_MIT9313" and not fixed)
            or (strain == "Prochlorococcus_NATL2A" and light_exp == 40)
            or (strain == "Prochlorococcus_SS120" and light_exp in [5, 15])
            or (strain == "Synechococcus_MITS9220" and light_exp == 0 and dilution == 0)  # potentially contaminated?
        )
        if excluded:
            continue

        fk_file = fk.Sample(file, subsample=50000)
        fk_file.pns_labels = fk_file.pnn_labels
        fk_file.apply_transform(asinh_xform)

        df = fk_file.as_dataframe(source='xform', subsample=True)
        df.columns = df.columns.get_level_values(0)

        # Density gate on the FSC-H / SSC-H plane.
        gate = fc.gate.density2d(df[['FSC-H', 'SSC-H']].values, gate_fraction=0.75, full_output=True)
        print("Mask: ", gate.mask.shape, gate.mask.sum(), gate.mask)

        # Keep the "-A" columns and the scatter ("SC-") columns that are not "-W".
        to_select = [("-A" in col or ("SC-" in col and "-W" not in col)) for col in df.columns]
        df = df.loc[gate.mask, to_select]

        to_filter = scatter_filter(df, use_size=True, use_iqr=True)
        print("To filter: ", to_filter.shape, to_filter.sum(), to_filter)

        df = df.loc[~to_filter].copy()
        print("Remaining shape: ", df.shape)

        # Assign integer codes on first sight. Each lookup is independent, so a
        # missing species can no longer cause an existing strain to be re-numbered.
        if strain not in strains:
            strains[strain] = len(strains) + 1
        if species not in species_map:
            species_map[species] = len(species_map) + 1
        if light_exp not in exps:
            # Next free code. The original restarted at 0 here, which collided
            # with the code already used for exposure 0.
            exps[light_exp] = len(exps)

        label_frames.append(pd.DataFrame(
            {
                'fixed': fixed,
                'dilution': dilution,
                'strain': strains[strain],
                'strain_name': strain,
                'species': species_map[species],
                'light_exp': exps[light_exp],
            },
            index=df.index,
        ))
        feature_frames.append(df)

    if not feature_frames:
        raise ValueError("No reference culture files left after filtering; nothing to concatenate.")

    main_df = pd.concat(feature_frames)

    # Shift the scatter columns by the absolute value of their minimum; min_df is
    # returned so the same shift can be applied to other data sets.
    min_df = main_df[scatter_cols].min()
    main_df[scatter_cols] = main_df[scatter_cols] + abs(min_df)

    main_df = main_df.reset_index(drop=True)
    labels = pd.concat(label_frames).reset_index(drop=True)

    subsampled_df = pd.concat([main_df, labels], axis=1)
    # Balance the species: draw as many rows per species as the smallest group has.
    rows_per_species = int(subsampled_df.groupby(['species']).size().min())
    subsampled_df = subsampled_df.groupby(['species']).sample(n=rows_per_species, random_state=random_state)

    to_select = [("-A" in col or ("SC-" in col and "-W" not in col)) for col in subsampled_df.columns]
    scaler = MinMaxScaler()
    scaled_df = scaler.fit_transform(subsampled_df.loc[:, to_select])

    return scaled_df, min_df, subsampled_df

def scatter_filter(df, use_size=True, use_iqr=True, size_tolerance=2, iqr_tolerance=1.5):
    """Return a boolean array marking the rows of ``df`` that should be removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table. With ``use_size`` it must contain the columns ``FSC-A``,
        ``SSC-A``, ``FSC-H`` and ``SSC-H``. With ``use_iqr`` every column is
        converted to float and tested.
    use_size : bool
        Flag rows whose |area| / (|height| + 10e-6) ratio, for either FSC or SSC,
        is greater than ``size_tolerance`` or smaller than ``1 / size_tolerance``.
    use_iqr : bool
        Additionally flag rows that, in any column, lie outside
        ``[Q1 - iqr_tolerance * IQR, Q3 + iqr_tolerance * IQR]``.
    size_tolerance : float
        Ratio bound used by the size test.
    iqr_tolerance : float
        IQR multiplier used by the outlier test.

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row; ``True`` means "filter this row out".
        An empty frame yields an empty array.
    """
    n_events = df.shape[0]
    to_filter = np.zeros(n_events, dtype=bool)
    if n_events == 0:
        # Nothing to test; also avoids taking quantiles of an empty column.
        return to_filter

    if use_size:
        area = df[['FSC-A', 'SSC-A']].abs().values
        # The small constant keeps the division finite when a height is exactly 0.
        height = df[['FSC-H', 'SSC-H']].abs().values + 10e-6
        scatter_ratio = area / height
        print(scatter_ratio.mean(axis=0))
        scatter_array = np.concatenate([scatter_ratio > size_tolerance, scatter_ratio < 1/size_tolerance], axis=1)
        print(scatter_array.sum(axis=0))
        to_filter = scatter_array.any(axis=1)

    if use_iqr:
        # All columns are tested at once. The per-column shift the original applied
        # first is omitted: adding a constant moves the data and both quartiles by
        # the same amount, so it cannot change which rows fall outside the fences.
        values = df.to_numpy(dtype=float)
        q1, q3 = np.quantile(values, [0.25, 0.75], axis=0)
        iqr = q3 - q1
        lower_range = q1 - iqr_tolerance * iqr
        upper_range = q3 + iqr_tolerance * iqr
        outliers = ((values < lower_range) | (values > upper_range)).any(axis=1)
        to_filter = to_filter | outliers

    return to_filter
    


In [None]:
# read in reference culture data
# Each pattern is expanded separately and sorted: glob returns files in arbitrary
# order, and reference_cultures numbers strains/species in the order it meets
# them, so an unsorted list makes those codes differ between runs/machines.
reference_patterns = [
    "reference_cultures/Pro_and_syn/fixed/final/*cus*fcs",
    "reference_cultures/Pro_and_syn/fresh/*cus*fcs",
    "reference_cultures/Algae/final/*fcs",
]
reference_files = []
for pattern in reference_patterns:
    reference_files.extend(sorted(glob.glob(main_dir + pattern)))

if not reference_files:
    raise FileNotFoundError(f"No reference culture FCS files found under {main_dir}")

s_df, min_df, unscaled_df = reference_cultures(reference_files)

In [None]:
def read_transect_data(paths=None, minima=min_df):
    """Read transect FCS files and return scaled events, labels and unscaled events.

    Every path that contains "fcs" and whose file name does not contain "test" or
    "beads" is read with a 25000-event subsample, asinh-transformed and filtered
    with ``scatter_filter(use_iqr=False, size_tolerance=3)``. The six scatter
    columns are shifted by ``abs(minima)`` and the channel columns are then
    min-max scaled.

    Parameters
    ----------
    paths : iterable of str or None
        FCS file paths. ``None`` is treated as an empty list.
    minima : pandas.Series
        Shift applied to the scatter columns; the default is the ``min_df`` that
        exists when this function is defined.

    Returns
    -------
    main_df : numpy.ndarray
        Row-wise concatenation of the scaled channel values of all samples.
    labels : pandas.DataFrame
        One row per event with the columns ``index``, ``sample``, ``sample_idx``
        and ``transect_label``.
    unscaled_df : pandas.DataFrame
        The shifted but unscaled channel values, preceded by an ``index`` column.

    Raises
    ------
    ValueError
        If no usable sample was found in ``paths``.
    """
    asinh_xform = fk.transforms.AsinhTransform("asinh", param_t=262144, param_m=4.0, param_a=0.0)
    scaler = MinMaxScaler()
    scatter_cols = ['FSC-A', 'SSC-A', 'SSC-B-A', 'FSC-H', 'SSC-H', 'SSC-B-H']

    scaled_parts = []
    unscaled_parts = []
    label_parts = []
    sample_idx = 0
    for file in (paths or []):
        print(f"Processing {file}")
        if "fcs" not in file:
            print(f"Skipping {file}")
            continue

        # Remove the extension properly; str.strip(".fcs") would also remove
        # leading/trailing 'f', 'c', 's' and '.' characters from the name itself.
        name = os.path.splitext(os.path.basename(file))[0]

        if any(check in name.lower() for check in ["test", "beads"]):
            print(f"Skipping {file}")
            continue

        fk_file = fk.Sample(file, subsample=25000) # increase the subsample value if you want to include more cells in each transect
        fk_file.pns_labels = fk_file.pnn_labels
        fk_file.apply_transform(asinh_xform)
        df = fk_file.as_dataframe(source='xform', subsample=True)
        df.columns = df.columns.get_level_values(0)
        # Keep the "-A" columns and the scatter ("SC-") columns that are not "-W".
        to_select = [("-A" in col or ("SC-" in col and "-W" not in col)) for col in df.columns]

        to_filter = scatter_filter(df, use_iqr=False, size_tolerance=3)
        # .copy() so the column assignments below act on an independent frame.
        df = df.loc[~to_filter, to_select].copy()
        if df.empty:
            # MinMaxScaler cannot be fitted on zero rows.
            print(f"Skipping {file}: no events left after scatter filtering")
            continue

        df['sample'] = name.lower().replace(" ", "_")
        df['sample_idx'] = sample_idx
        df[scatter_cols] = df[scatter_cols] + abs(minima)
        sample_idx += 1

        # Channel names contain "-"; the label columns added above do not.
        is_channel = ["-" in col for col in df.columns]
        channels = df.loc[:, is_channel]
        unscaled_parts.append(channels)
        # NOTE(review): the scaler is re-fitted for every file, so each sample is
        # scaled to its own min/max rather than to a shared range - confirm this
        # per-sample normalisation is intended.
        scaled_parts.append(scaler.fit_transform(channels))
        label_parts.append(df.loc[:, [not flag for flag in is_channel]])

    if not scaled_parts:
        raise ValueError("No usable FCS files found in `paths`.")

    main_df = np.concatenate(scaled_parts, axis=0)
    unscaled_df = pd.concat(unscaled_parts).reset_index()

    labels = pd.concat(label_parts)
    # Third "_"-separated token of the sample name, with leading/trailing 'f'/'t'
    # characters removed, parsed as an integer.
    labels['transect_label'] = labels['sample'].apply(lambda s: int(s.split("_")[2].strip('ft')))
    labels = labels.reset_index()
    return main_df, labels, unscaled_df
    

In [None]:
# Sorted so sample_idx and the row order of the returned tables do not depend on
# the arbitrary order in which glob lists the directory. os.path.join also avoids
# the doubled "/" that main_dir + "/transect_data/..." produced.
transect_files = sorted(glob.glob(os.path.join(main_dir, "transect_data", "trimmed", "*.fcs")))
if not transect_files:
    raise FileNotFoundError(f"No transect FCS files found under {main_dir}")

transect_data, transect_labels, transect_unscaled = read_transect_data(
    transect_files,
    min_df
)

In [None]:
# UMAP embedding for transect data
# NOTE(review): no random_state is passed, while the next cell applies fixed
# coordinate cut-offs to this embedding - confirm the cut-offs after every re-fit.

umap_params = {"n_neighbors": 100, "a": 1.48, "b": 0.4}
embedder_transect = umap.UMAP(**umap_params)
embedder_transect.fit(transect_data)

In [None]:
# Set the TRUTH clusters for the UMAP data. Check that these clusters look right via the plot
# The cut-offs below are fixed coordinates in the embedding fitted in the previous cell.

umap_x = embedder_transect.embedding_[:, 0]
umap_y = embedder_transect.embedding_[:, 1]

transect_labels['Pro'] = (umap_x < 0) & (umap_y < 10)
transect_labels['Syn'] = (umap_x > 10) & (umap_y < 10)
transect_labels['Pico'] = (umap_x > 5) & (umap_y > 15)

# Everything outside the three regions stays 'Debris'. .loc writes into
# transect_labels itself; the chained ['Truth'].iloc[...] form may write to a
# temporary copy instead.
transect_labels['Truth'] = 'Debris'
for population in ['Pro', 'Pico', 'Syn']:
    transect_labels.loc[transect_labels[population], 'Truth'] = population
# pd.concat([pd.DataFrame(transect_data), transect_labels[['Truth', 'transect_label']].reset_index()], axis=1).drop("index", axis=1).to_csv("00-jupyter/10-allison_umap/transect_labels_umap.tsv", sep='\t', index=False)

# There is no 'Debris' column in transect_labels, so grouping by that name raised
# KeyError. The flag is derived here and used only as a grouping key, which keeps
# the columns of transect_labels unchanged.
is_debris = transect_labels['Truth'].eq('Debris').rename('Debris')
print(transect_labels.groupby(['Pro', 'Syn', 'Pico', is_debris]).count())
umap.plot.points(embedder_transect, labels=transect_labels['Truth'], theme="fire", alpha=0.5)
plt.show()

In [None]:
# Biplot of above clustering

# plot_transect is not defined in any earlier cell, so this cell raised NameError
# on a fresh kernel. It is rebuilt here from the unscaled channel table and the
# Truth labels; read_transect_data resets the index of both, so rows line up.
# NOTE(review): confirm the unscaled (not the scaled) channels were intended.
plot_transect = pd.concat([transect_unscaled, transect_labels[['Truth']]], axis=1)

sns.scatterplot(
    data=plot_transect,
    x = "YG1-A",
    y = "B8-A",
    hue = "Truth",
    alpha= 0.1
)
plt.show()

In [None]:
# Correlation heatmap of truth clusters

# Scaled channel values next to the two grouping keys.
scaled_features = pd.DataFrame(transect_data)
group_keys = transect_labels[['Truth', 'transect_label']].reset_index()
combined = pd.concat([scaled_features, group_keys], axis=1)
combined = combined.drop("index", axis=1)

# One column per (Truth, transect_label) pair holding the mean of every feature.
group_means = combined.groupby(['Truth', 'transect_label']).mean()
transect_joy_df = group_means.transpose()

sns.heatmap(transect_joy_df.corr())
plt.show()

In [None]:
# set species clusters to -1 before we do internal clustering
# The column is created with object dtype because the following cells write
# string labels such as "pro_0" into it; writing strings into an integer column
# is an incompatible-dtype assignment in recent pandas.

transect_labels['cluster'] = pd.Series(-1, index=transect_labels.index, dtype=object)

In [None]:
# Prochlorococcus embedding
is_pro = (transect_labels["Truth"] == "Pro").values
species_data = transect_data[is_pro]
embedder_pro = umap.UMAP(n_neighbors=100, a=1.48, b=0.4)
embedder_pro.fit(species_data)
clusterer_pro = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=100, gen_min_span_tree=True, prediction_data=True)
clusterer_pro.fit(embedder_pro.embedding_)

# Write the labels with a single .loc call; the chained
# transect_labels['cluster'][mask] = ... form may assign to a temporary copy.
# The cast lets the column hold the string labels.
transect_labels['cluster'] = transect_labels['cluster'].astype(object)
transect_labels.loc[is_pro, 'cluster'] = [f"pro_{c}" for c in clusterer_pro.labels_]
umap.plot.points(embedder_pro, labels=clusterer_pro.labels_, theme="fire", alpha=0.5)
plt.show()

In [None]:
# Synechococcus embedding
is_syn = (transect_labels["Truth"] == "Syn").values
embedder_syn = umap.UMAP(n_neighbors=100, a=1.48, b=0.4)
embedder_syn.fit(transect_data[is_syn])
clusterer_syn = hdbscan.HDBSCAN(min_cluster_size=100, gen_min_span_tree=True)
clusterer_syn.fit(embedder_syn.embedding_)

# umap.plot.points(embedder_syn, labels=transect_labels[transect_labels['Truth']=="Syn"]["transect_label"].values, theme="fire", alpha=0.5)
umap.plot.points(embedder_syn, labels=clusterer_syn.labels_, theme="fire", alpha=0.5)
# Write the labels with a single .loc call; the chained
# transect_labels['cluster'][mask] = ... form may assign to a temporary copy.
# The cast lets the column hold the string labels.
transect_labels['cluster'] = transect_labels['cluster'].astype(object)
transect_labels.loc[is_syn, 'cluster'] = [f"syn_{c}" for c in clusterer_syn.labels_]
plt.show()

In [None]:
# Picoeukaryote embedding
is_pico = (transect_labels["Truth"] == "Pico").values
embedder_pico = umap.UMAP(n_neighbors=100, a=1.48, b=0.4)
embedder_pico.fit(transect_data[is_pico])
clusterer_pico = hdbscan.HDBSCAN(min_cluster_size=200, gen_min_span_tree=True)
clusterer_pico.fit(embedder_pico.embedding_)

# umap.plot.points(embedder_pico, labels=transect_labels[transect_labels['Truth']=="Pico"]["transect_label"].values, theme="fire", alpha=0.1)
umap.plot.points(embedder_pico, labels=clusterer_pico.labels_, theme="fire", alpha=0.1)
# Write the labels with a single .loc call; the chained
# transect_labels['cluster'][mask] = ... form may assign to a temporary copy.
# The cast lets the column hold the string labels.
transect_labels['cluster'] = transect_labels['cluster'].astype(object)
transect_labels.loc[is_pico, 'cluster'] = [f"pico_{c}" for c in clusterer_pico.labels_]
plt.show()

In [None]:
# Debris embedding
is_debris_event = (transect_labels["Truth"] == "Debris").values
embedder_debris = umap.UMAP(n_neighbors=100, a=1.48, b=0.4)
embedder_debris.fit(transect_data[is_debris_event])

clusterer_debris = hdbscan.HDBSCAN(min_cluster_size=200, gen_min_span_tree=True)
clusterer_debris.fit(embedder_debris.embedding_)

# umap.plot.points(embedder_debris, labels=transect_labels[transect_labels['Truth']=="Debris"]["transect_label"].values, theme="fire", alpha=0.5)
umap.plot.points(embedder_debris, labels=clusterer_debris.labels_, theme="fire", alpha=0.5)
# Write the labels with a single .loc call; the chained
# transect_labels['cluster'][mask] = ... form may assign to a temporary copy.
# The cast lets the column hold the string labels.
transect_labels['cluster'] = transect_labels['cluster'].astype(object)
transect_labels.loc[is_debris_event, 'cluster'] = [f"debris_{c}" for c in clusterer_debris.labels_]
plt.show()

In [None]:
# write transect labels to file. Change directory to your current directory
# The path is relative to the notebook's working directory; the folder is created
# first so the export does not fail when it is missing.

output_path = "00-jupyter/10-allison_umap/transect_labels_clustered.tsv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
transect_labels.to_csv(output_path, sep='\t', index=False)