# file 1

In [16]:
import pandas as pd

# Otsu Thresholds
# Per-marker intensity cut-offs for file 1. A marker counts as expressed for a
# cell when the cell's value is strictly greater than the cut-off (see
# get_expressed_markers). Presumably produced by otsu_thresholding on this
# sample — TODO confirm provenance.
otsu_thresholds = {
    'CD66b': 11.399895365332032,
    'CD56': 3.4436964739765625,
    'CD4': 4.638822374882812,
    'CTLA4': 1.7203896906113283,
    'CD8': 2.993261530101562,
    'CD20': 1.8819881087285157
}

# IsoData Thresholds
# Same six markers, same "strictly greater than" usage as the Otsu dict above.
isodata_thresholds = {
    'CD66b': 11.508348076880885,
    'CD56': 3.468594647486471,
    'CD4': 4.659877346459998,
    'CTLA4': 1.7343134856874514,
    'CD8': 3.0127165920619516,
    'CD20': 1.885931165886635
}

# GMM Thresholds
# Same six markers, same "strictly greater than" usage as the Otsu dict above.
gmm_thresholds = {
    'CD66b': 9.627310739032016,
    'CD56': 7.673191841326574,
    'CD4': 7.454448342344508,
    'CTLA4': 1.6143088896363227,
    'CD8': 5.6576234619684165,
    'CD20': 1.414940867048797
}

# Minimum Cross-Entropy Thresholds
# NOTE(review): the CD66b cut-off here (~0.014) is far below the CD66b values
# in the three dicts above (~9.6 to ~11.5) — verify it is intended.
cross_entropy_thresholds = {
    'CD66b': 0.013985693,
    'CD56': 3.255864025,
    'CD4': 2.382756981,
    'CTLA4': 1.889522099,
    'CD8': 2.7746521,
    'CD20': 1.802490234
}

# Load the previous output file as the new input file
# The CSV must contain one column per marker named in the threshold dicts,
# because row[marker] is read for every threshold key.
cells = pd.read_csv('/content/umap_filtered_002_TU2_Immune_2_thresholded_encoded.csv')

# Predefined one-hot mapping (updated for all previous combinations)
# Maps a comma-separated marker combination to an integer cluster ID.
# one_hot_encode looks combinations up with ', '.join(sorted(expressed)), so
# only keys whose markers appear in Python string sort order
# (CD20 < CD4 < CD56 < CD66b < CD8 < CTLA4) can ever match.
# NOTE(review): 'CD8, CD20' (3), 'CD56, CD4' (10), 'CTLA4, CD8' (11),
# 'CD4, CTLA4, CD8' (12), 'CD4, CD20' (13) and 'CD56, CD4, CTLA4, CD8' (14)
# are not in that order, so those IDs are never returned; the sorted spellings
# of the same combinations appear further down with different IDs.
# ID 0 is not listed here; one_hot_encode returns it for "no marker expressed".
one_hot_mapping = {
    'CD20': 1,
    'CD66b': 2,
    'CD8, CD20': 3,
    'CTLA4': 4,
    'CD4': 5,
    'CD4, CTLA4': 6,
    'CD8': 7,
    'CD56': 8,
    'CD4, CD8': 9,
    'CD56, CD4': 10,
    'CTLA4, CD8': 11,
    'CD4, CTLA4, CD8': 12,
    'CD4, CD20': 13,
    'CD56, CD4, CTLA4, CD8': 14,
    'CD20, CD8, CTLA4': 15,
    'CD20, CD56, CD8, CTLA4': 16,
    'CD20, CD8': 17,
    'CD66b, CD8': 18,
    'CD20, CD66b': 19,
    'CD20, CTLA4': 20,
    'CD20, CD66b, CD8': 21,
    'CD56, CD8, CTLA4': 22,
    'CD20, CD56, CD8': 23,
    'CD20, CD56, CTLA4': 24,
    'CD20, CD56': 25,
    'CD20, CD4, CD8': 26,
    'CD8, CTLA4': 27,
    'CD56, CTLA4': 28,
    'CD4, CD56, CTLA4': 29,
    'CD56, CD8': 30,
    'CD20, CD56, CD66b, CD8': 31,
    'CD20, CD4, CTLA4': 32,
    'CD20, CD4, CD56, CD8, CTLA4': 33,
    'CD20, CD56, CD66b, CD8, CTLA4': 34,
    'CD20, CD56, CD66b, CTLA4': 35,
    'CD56, CD66b': 36,
    'CD4, CD56, CD8': 37,
    'CD20, CD4, CD8, CTLA4': 38,
    'CD20, CD4': 39,
    'CD4, CD8, CTLA4': 40,
    'CD4, CD56': 41,
    'CD20, CD4, CD56, CTLA4': 42,
    'CD56, CD66b, CD8': 43,
    'CD20, CD4, CD56': 44,
    'CD20, CD66b, CTLA4': 45,
    'CD20, CD56, CD66b': 46,
    'CD4, CD56, CD8, CTLA4': 47,
    'CD20, CD4, CD56, CD8': 48,
    'CD20, CD4, CD66b, CD8, CTLA4': 49,
    'CD20, CD4, CD56, CD66b, CD8, CTLA4': 50,
    'CD20, CD4, CD66b, CD8': 51,
    'CD20, CD66b, CD8, CTLA4': 52,
    'CD20, CD4, CD66b': 53,
    'CD20, CD4, CD56, CD66b, CD8': 54,
    'CD4, CD66b': 55,
    'CD4, CD56, CD66b, CD8': 56,
    'CD4, CD66b, CTLA4': 57,
    'CD4, CD66b, CD8': 58,
    'CD20, CD4, CD66b, CTLA4': 59,
    'CD4, CD56, CD66b': 60,
    'CD20, CD4, CD56, CD66b, CTLA4': 61,
    'CD66b, CD8, CTLA4': 62,
    'CD4, CD56, CD66b, CTLA4': 63,
    'CD20, CD4, CD56, CD66b': 64,
    'CD4, CD66b, CD8, CTLA4': 65,
    'CD56, CD66b, CTLA4': 66,
    'CD66b, CTLA4': 67,
    'CD4, CD56, CD66b, CD8, CTLA4': 68,
    'CD56, CD66b, CD8, CTLA4': 69
}

# Initialize cluster ID for any new combinations (if needed)
# With the mapping above the first free ID is 70 (largest predefined ID + 1).
new_cluster_id = max(one_hot_mapping.values()) + 1

# Track any new combinations not in the predefined one-hot mapping
# Filled by one_hot_encode: combination key -> newly assigned cluster ID.
new_combinations = {}

# Collect the markers whose value in a row exceeds the per-marker threshold
def get_expressed_markers(row, thresholds):
    """Return the markers of ``row`` that are strictly above their threshold.

    Args:
    row: Mapping-like object (e.g. a DataFrame row) indexed by marker name.
    thresholds (dict): Marker name -> cut-off value.

    Returns:
    list: Marker names in the iteration order of ``thresholds``; the list is
    not sorted here.
    """
    above_cutoff = []
    for name, cutoff in thresholds.items():
        if row[name] > cutoff:
            above_cutoff.append(name)
    return above_cutoff

# Function to map a set of expressed markers to its integer cluster ID
def one_hot_encode(expressed):
    """Return the cluster ID for a list of expressed markers.

    The markers are sorted and joined with ', ' to build the lookup key, so the
    order of ``expressed`` does not matter.

    Args:
    expressed (list): Names of the markers expressed by one cell.

    Returns:
    int: 0 when no marker is expressed, the predefined ID from
    ``one_hot_mapping`` when the combination is known, otherwise an ID
    allocated from the module-level ``new_cluster_id`` counter. A combination
    that was already allocated an ID gets that same ID again.
    """
    global new_cluster_id

    if not expressed:
        return 0

    expressed_key = ', '.join(sorted(expressed))
    if expressed_key in one_hot_mapping:
        return one_hot_mapping[expressed_key]

    # Reuse the ID handed out the first time this combination was seen.
    # Without this check every cell with the same unknown combination would
    # receive a fresh ID and the counter would grow once per cell.
    if expressed_key in new_combinations:
        return new_combinations[expressed_key]

    new_combinations[expressed_key] = new_cluster_id
    print(f"New combination found: {expressed_key}. Assigning new cluster ID: {new_cluster_id}")
    new_cluster_id += 1
    return new_combinations[expressed_key]

# Process cells for a given thresholding method
def process_cells(cells, thresholds, method_name,
                  output_path='/content/output/002_TU2_Immune_2_NEW_thresholded_encoded.csv'):
    """Annotate ``cells`` with one method's expressed markers and cluster IDs, then save it.

    Two columns are added to (or overwritten in) ``cells`` in place:
    ``{method_name}_expressed_markers`` (markers joined with ', ', in the order
    returned by ``get_expressed_markers``) and ``{method_name}_cluster`` (the ID
    returned by ``one_hot_encode``). The whole DataFrame is then written to
    ``output_path``, so repeated calls with the same path overwrite the file
    with the cumulative set of columns.

    Args:
    cells (pd.DataFrame): One row per cell, with a column for every marker in ``thresholds``.
    thresholds (dict): Marker name -> cut-off value.
    method_name (str): Prefix for the two new column names.
    output_path (str): Destination CSV; defaults to the path this notebook has always used.

    Returns:
    None
    """
    import os  # local import: the surrounding cell only imports pandas

    expressed_markers_col = f'{method_name}_expressed_markers'
    cluster_col = f'{method_name}_cluster'

    # Collect results positionally and assign each column once. Writing through
    # cells.at[label, col] inside iterrows is ambiguous when index labels repeat
    # and mutates the frame while it is being iterated.
    marker_labels = []
    cluster_ids = []
    for _, row in cells.iterrows():
        expressed = get_expressed_markers(row, thresholds)
        marker_labels.append(', '.join(expressed))
        cluster_ids.append(one_hot_encode(expressed))

    cells[expressed_markers_col] = marker_labels
    cells[cluster_col] = cluster_ids

    # Save the modified DataFrame, creating the target directory if needed
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    cells.to_csv(output_path, index=False)
    print(f"\nModified CSV saved as {output_path}")

# Run every thresholding method over the same DataFrame, in the original order.
# Each call adds that method's two columns and rewrites the same output CSV.
for method_name, method_thresholds in (
    ('otsu', otsu_thresholds),
    ('isodata', isodata_thresholds),
    ('gmm', gmm_thresholds),
    ('cross_entropy', cross_entropy_thresholds),
):
    process_cells(cells, method_thresholds, method_name)



Modified CSV saved as /content/output/002_TU2_Immune_2_NEW_thresholded_encoded.csv

Modified CSV saved as /content/output/002_TU2_Immune_2_NEW_thresholded_encoded.csv

Modified CSV saved as /content/output/002_TU2_Immune_2_NEW_thresholded_encoded.csv

Modified CSV saved as /content/output/002_TU2_Immune_2_NEW_thresholded_encoded.csv


# file 2

In [18]:
import pandas as pd

# Otsu Thresholds for File 2
# Per-marker intensity cut-offs for the second sample. A marker counts as
# expressed when the cell's value is strictly greater than the cut-off (see
# get_expressed_markers).
otsu_thresholds_2 = {
    'CD66b': 11.526461259472656,
    'CD56': 9.262819014326173,
    'CD4': 2.8504523409902345,
    'CTLA4': 1.933787454142578,
    'CD8': 6.184541287667969,
    'CD20': 1.1763268841464845
}

# IsoData Thresholds for File 2
isodata_thresholds_2 = {
    'CD66b': 16.296615047849976,
    'CD56': 9.377226506620305,
    'CD4': 2.8735494212507433,
    'CTLA4': 1.9447332524313428,
    'CD8': 6.220227254473769,
    'CD20': 1.1617175959772181
}

# GMM Thresholds for File 2
gmm_thresholds_2 = {
    'CD66b': 52.387916334350905,
    'CD56': 18.918180984619028,
    'CD4': 4.931560153011872,
    'CTLA4': 3.1347858918554303,
    'CD8': 7.472530599598466,
    'CD20': 5.566258443848004
}

# Minimum Cross-Entropy Thresholds for File 2
# NOTE(review): the CD66b and CD20 cut-offs here (~0.002) are far below the
# values the other three methods give for the same markers — verify.
cross_entropy_thresholds_2 = {
    'CD66b': 0.001524847,
    'CD56': 4.812113444,
    'CD4': 1.469428628,
    'CTLA4': 1.758620447,
    'CD8': 3.468239724,
    'CD20': 0.002003243
}

# Load the second file
cells_2 = pd.read_csv('/content/umap_filtered_CC_OC_585_TU1_Immune_1_thresholded_encoded.csv')

# Initialize cluster ID for new combinations
# one_hot_mapping is not defined in this cell; it must already exist in the
# kernel from the file 1 cell. The counter restarts at the same value the
# file 1 cell started from, so an ID allocated here can coincide with an ID
# that file 1 allocated to a different new combination.
new_cluster_id = max(one_hot_mapping.values()) + 1

# Track any new combinations not in the predefined one-hot mapping
# Rebinding to an empty dict discards whatever the file 1 run recorded.
new_combinations = {}

# Select the markers whose value in a row exceeds the per-marker threshold
def get_expressed_markers(row, thresholds):
    """Return the markers of ``row`` that are strictly above their threshold.

    Args:
    row: Mapping-like object (e.g. a DataFrame row) indexed by marker name.
    thresholds (dict): Marker name -> cut-off value.

    Returns:
    list: Marker names in the iteration order of ``thresholds`` (not sorted).
    """
    def is_expressed(marker):
        return row[marker] > thresholds[marker]

    return list(filter(is_expressed, thresholds.keys()))

# Function to map a set of expressed markers to its integer cluster ID
def one_hot_encode(expressed):
    """Return the cluster ID for a list of expressed markers.

    The markers are sorted and joined with ', ' to build the lookup key, so the
    order of ``expressed`` does not matter.

    Args:
    expressed (list): Names of the markers expressed by one cell.

    Returns:
    int: 0 when no marker is expressed, the predefined ID from
    ``one_hot_mapping`` when the combination is known, otherwise an ID
    allocated from the module-level ``new_cluster_id`` counter. A combination
    that was already allocated an ID gets that same ID again.
    """
    global new_cluster_id

    if not expressed:
        return 0

    expressed_key = ', '.join(sorted(expressed))
    if expressed_key in one_hot_mapping:
        return one_hot_mapping[expressed_key]

    # Reuse the ID handed out the first time this combination was seen.
    # Without this check every cell with the same unknown combination would
    # receive a fresh ID and the counter would grow once per cell.
    if expressed_key in new_combinations:
        return new_combinations[expressed_key]

    new_combinations[expressed_key] = new_cluster_id
    print(f"New combination found: {expressed_key}. Assigning new cluster ID: {new_cluster_id}")
    new_cluster_id += 1
    return new_combinations[expressed_key]

# Process cells for a given thresholding method
def process_cells(cells, thresholds, method_name,
                  output_path='/content/output/CC_OC_585_TU1_Immune_1_NEW_thresholded_encoded.csv'):
    """Annotate ``cells`` with one method's expressed markers and cluster IDs, then save it.

    Two columns are added to (or overwritten in) ``cells`` in place:
    ``{method_name}_expressed_markers`` (markers joined with ', ', in the order
    returned by ``get_expressed_markers``) and ``{method_name}_cluster`` (the ID
    returned by ``one_hot_encode``). The whole DataFrame is then written to
    ``output_path``, so repeated calls with the same path overwrite the file
    with the cumulative set of columns.

    Args:
    cells (pd.DataFrame): One row per cell, with a column for every marker in ``thresholds``.
    thresholds (dict): Marker name -> cut-off value.
    method_name (str): Prefix for the two new column names.
    output_path (str): Destination CSV; defaults to the path this cell has always used.

    Returns:
    None
    """
    import os  # local import: the surrounding cell only imports pandas

    expressed_markers_col = f'{method_name}_expressed_markers'
    cluster_col = f'{method_name}_cluster'

    # Collect results positionally and assign each column once. Writing through
    # cells.at[label, col] inside iterrows is ambiguous when index labels repeat
    # and mutates the frame while it is being iterated.
    marker_labels = []
    cluster_ids = []
    for _, row in cells.iterrows():
        expressed = get_expressed_markers(row, thresholds)
        marker_labels.append(', '.join(expressed))
        cluster_ids.append(one_hot_encode(expressed))

    cells[expressed_markers_col] = marker_labels
    cells[cluster_col] = cluster_ids

    # Save the modified DataFrame, creating the target directory if needed
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    cells.to_csv(output_path, index=False)
    print(f"\nModified CSV saved as {output_path}")

# Run every thresholding method over file 2, in the original order.
# Each call adds that method's two columns and rewrites the same output CSV.
for method_name, method_thresholds in (
    ('otsu', otsu_thresholds_2),
    ('isodata', isodata_thresholds_2),
    ('gmm', gmm_thresholds_2),
    ('cross_entropy', cross_entropy_thresholds_2),
):
    process_cells(cells_2, method_thresholds, method_name)



Modified CSV saved as /content/output/CC_OC_585_TU1_Immune_1_NEW_thresholded_encoded.csv

Modified CSV saved as /content/output/CC_OC_585_TU1_Immune_1_NEW_thresholded_encoded.csv

Modified CSV saved as /content/output/CC_OC_585_TU1_Immune_1_NEW_thresholded_encoded.csv

Modified CSV saved as /content/output/CC_OC_585_TU1_Immune_1_NEW_thresholded_encoded.csv


# Below are the methods

# otsu

In [1]:
import numpy as np
import pandas as pd
from skimage.filters import threshold_otsu

def otsu_thresholding(df, marker_columns):
    """
    Compute one Otsu threshold per marker with skimage's ``threshold_otsu``.

    Args:
    df (pd.DataFrame): DataFrame containing marker intensity values.
    marker_columns (list): Marker columns to compute thresholds for.

    Returns:
    dict: Marker name -> value returned by ``threshold_otsu`` for that
    column's raw values.
    """
    # Each column is handed to threshold_otsu as a plain array of intensities.
    return {
        marker: threshold_otsu(df[marker].values)
        for marker in marker_columns
    }

# Example usage:
# df = pd.read_csv("your_data.csv")
# marker_columns = ['CD66b', 'CD56', 'CD4', 'CTLA4', 'CD8', 'CD20']
# thresholds = otsu_thresholding(df, marker_columns)
# print(thresholds)


# IsoData

In [2]:
import numpy as np
import pandas as pd

def isodata_thresholding(df, marker_columns, epsilon=1e-5, max_iter=1000):
    """
    Compute one IsoData (iterative intermeans) threshold per marker.

    Starting from the column mean, the values are split at the current
    threshold and the threshold is moved to the midpoint of the two class
    means until the move is smaller than ``epsilon``.

    Args:
    df (pd.DataFrame): DataFrame containing marker intensity values.
    marker_columns (list): Marker columns to compute thresholds for.
    epsilon (float): Convergence tolerance for the threshold update.
    max_iter (int): Maximum number of iterations allowed.

    Returns:
    dict: Marker name -> final threshold.
    """

    def _refine(values):
        # Start from the overall mean of the column.
        estimate = np.mean(values)

        for _step in range(max_iter):
            lower_class = values[values <= estimate]
            upper_class = values[values > estimate]

            # A one-sided split cannot be refined further.
            if len(lower_class) == 0 or len(upper_class) == 0:
                return estimate

            midpoint = (np.mean(lower_class) + np.mean(upper_class)) / 2

            # On convergence the last accepted estimate is reported,
            # not the midpoint that triggered the stop.
            if np.abs(midpoint - estimate) < epsilon:
                return estimate

            estimate = midpoint

        return estimate

    return {marker: _refine(df[marker].values) for marker in marker_columns}

# Example usage:
# df = pd.read_csv("your_data.csv")
# marker_columns = ['CD66b', 'CD56', 'CD4', 'CTLA4', 'CD8', 'CD20']
# thresholds_isodata = isodata_thresholding(df, marker_columns)
# print(thresholds_isodata)


# A Modified Version of GMM

In [7]:
from sklearn.mixture import GaussianMixture
import numpy as np

def modified_gmm_thresholding(data_df, markers, max_components=10, random_state=42):
    """
    Derive one threshold per marker from a BIC-selected Gaussian mixture.

    For each marker, mixtures with 2..``max_components`` components are fitted
    and the one with the lowest BIC is kept. With two components the threshold
    is the mean of the component means; with more, it is the midpoint of the
    widest gap between neighbouring (sorted) component means.

    Args:
    data_df (pd.DataFrame): DataFrame containing marker intensity values.
    markers (list): Marker columns to compute thresholds for.
    max_components (int): Largest number of mixture components to try.
    random_state (int): Seed passed to every GaussianMixture.

    Returns:
    dict: Marker name -> threshold.
    """
    thresholds = {}

    for marker in markers:
        # Double brackets keep the column two-dimensional for fit()/bic().
        column = data_df[[marker]].values

        selected_model = None
        selected_k = 2
        selected_bic = np.inf

        # Keep the first model that achieves the strictly lowest BIC.
        for k in range(2, max_components + 1):
            candidate = GaussianMixture(n_components=k, random_state=random_state)
            candidate.fit(column)
            candidate_bic = candidate.bic(column)
            if candidate_bic < selected_bic:
                selected_model, selected_k, selected_bic = candidate, k, candidate_bic

        if selected_k > 2:
            # Split at the centre of the largest gap between sorted means.
            ordered_means = np.sort(selected_model.means_.flatten())
            widest_gap = np.argmax(np.diff(ordered_means))
            thresholds[marker] = (ordered_means[widest_gap] + ordered_means[widest_gap + 1]) / 2
        else:
            # Two components: halfway between the two means.
            thresholds[marker] = np.mean(selected_model.means_)

    return thresholds

# Example usage:
# data_df = pd.read_csv("your_data.csv")
# markers = ['CD66b', 'CD56', 'CD4', 'CTLA4', 'CD8', 'CD20']
# thresholds = modified_gmm_thresholding(data_df, markers)
# print("GMM Thresholds:", thresholds)


# minimum cross-entropy

In [11]:
import numpy as np
from scipy.optimize import minimize
from scipy.stats import norm, lognorm, gamma, expon  # For fitting distributions

# Ensure you have defined or imported the following functions:
# - fit_distribution: to fit the distribution (normal, lognormal, gamma, exponential)
# - cross_entropy: to calculate cross-entropy between distributions

# Example fit_distribution function (you may need to adjust this to your actual implementation)
def fit_distribution(data, dist_name='normal'):
    """Fit a simple parametric distribution to ``data`` by matching moments.

    Args:
    data: Array of intensity values.
    dist_name (str): One of 'normal', 'lognormal', 'gamma', 'exponential'.

    Returns:
    A frozen scipy.stats distribution, or None when ``dist_name`` is not
    supported or the data cannot be described by the requested family
    (non-positive values for 'lognormal'; non-positive mean or zero variance
    for 'gamma').
    """
    if dist_name == 'normal':
        return norm(loc=np.mean(data), scale=np.std(data))
    elif dist_name == 'lognormal':
        # log() is undefined for values <= 0; report "no fit" instead of
        # returning a distribution with NaN/inf parameters.
        if np.any(np.asarray(data) <= 0):
            return None
        log_data = np.log(data)
        return lognorm(s=np.std(log_data), scale=np.exp(np.mean(log_data)))
    elif dist_name == 'gamma':
        # Method of moments: mean = a * scale and variance = a * scale**2,
        # hence a = mean**2 / variance and scale = variance / mean.
        # (Passing the mean as shape and the std as scale does not reproduce
        # the sample mean or variance.)
        mean = np.mean(data)
        variance = np.var(data)
        if not (mean > 0 and variance > 0):
            return None
        return gamma(a=mean ** 2 / variance, scale=variance / mean)
    elif dist_name == 'exponential':
        return expon(scale=np.mean(data))
    else:
        return None

# Example cross_entropy function (you may need to adjust based on your implementation)
def cross_entropy(data, probabilities):
    """Return the negative sum of log-probabilities.

    Args:
    data: Not used by the computation; kept for the call signature.
    probabilities: Array of probability (density) values.

    Returns:
    The negative sum of ``log(probabilities + 1e-9)``.
    """
    # The 1e-9 offset keeps log() finite when a probability is exactly zero.
    log_terms = np.log(probabilities + 1e-9)
    return -np.sum(log_terms)


In [12]:
def min_cross_entropy_thresholding(data_df, marker_columns, candidate_thresholds, dist_name='normal',
                                   max_candidates=None):
    """
    Apply Minimum Cross-Entropy Thresholding for each marker in the DataFrame.

    Every candidate threshold is scored by splitting the marker values at the
    candidate, fitting ``dist_name`` to each side with ``fit_distribution`` and
    summing the two ``cross_entropy`` values; the candidate with the lowest
    finite score wins.

    The score only changes when the threshold crosses a data value, so it is
    piecewise constant. A gradient-based optimiser therefore sees a zero
    gradient and stays at its starting point, which is why the candidates are
    searched exhaustively instead.

    Args:
    data_df: DataFrame containing the marker intensity values.
    marker_columns: List of marker columns to apply the thresholding.
    candidate_thresholds: Dictionary of candidate thresholds for each marker.
    dist_name: Distribution to use ('normal', 'lognormal', 'gamma', 'exponential').
    max_candidates: Optional cap on the number of candidates scored per
        marker. When set and exceeded, an evenly spaced subset (by position
        in the candidate array) is scored. None scores every candidate, which
        costs one pass over the data per candidate.

    Returns:
    dict: A dictionary with the optimal threshold for each marker, or None for
    a marker when no candidate produces a finite score.
    """
    thresholds = {}

    for marker in marker_columns:
        data = data_df[marker].values
        candidates = np.asarray(candidate_thresholds[marker], dtype=float)

        if max_candidates is not None and len(candidates) > max_candidates:
            picks = np.unique(np.linspace(0, len(candidates) - 1, max_candidates).astype(int))
            candidates = candidates[picks]

        def objective(threshold):
            non_expressing = data[data <= threshold]
            expressing = data[data > threshold]

            if len(non_expressing) == 0 or len(expressing) == 0:
                return np.inf  # Avoid empty class case

            dist_non_exp = fit_distribution(non_expressing, dist_name)
            dist_exp = fit_distribution(expressing, dist_name)

            if dist_non_exp is None or dist_exp is None:
                return np.inf

            # Compute cross-entropy
            p0 = dist_non_exp.pdf(non_expressing)
            p1 = dist_exp.pdf(expressing)

            ce_non_exp = cross_entropy(non_expressing, p0)
            ce_exp = cross_entropy(expressing, p1)

            return ce_non_exp + ce_exp

        best_threshold = None
        best_score = np.inf

        # Degenerate splits (e.g. a single-value class with zero spread) can
        # yield NaN scores; silence the numeric warnings and skip them.
        with np.errstate(divide='ignore', invalid='ignore'):
            for candidate in candidates:
                score = objective(candidate)
                if np.isfinite(score) and score < best_score:
                    best_score = score
                    best_threshold = float(candidate)

        # Store the best threshold
        thresholds[marker] = best_threshold

    return thresholds


# generate candidate thresholds

In [13]:
def generate_candidate_thresholds(df, marker_columns):
    """
    Build the candidate threshold grid for each marker from its observed values.

    Args:
    df (pd.DataFrame): DataFrame containing marker intensity values.
    marker_columns (list): Marker columns to build candidates for.

    Returns:
    dict: Marker name -> array of that column's distinct values in ascending order.
    """
    # unique() drops repeats; np.sort puts the survivors in ascending order.
    return {
        marker: np.sort(df[marker].unique())
        for marker in marker_columns
    }

# Example usage:
# df = pd.read_csv("your_data.csv")
# marker_columns = ['CD66b', 'CD56', 'CD4', 'CTLA4', 'CD8', 'CD20']
# candidate_thresholds = generate_candidate_thresholds(df, marker_columns)
# print(candidate_thresholds)


# load and run data

In [19]:
 import pandas as pd

# Load your data
df = pd.read_csv("/content/umap_filtered_CC_OC_585_TU1_Immune_1_thresholded_encoded.csv")

marker_columns = ['CD66b', 'CD56', 'CD4', 'CTLA4', 'CD8', 'CD20']

thresholds_otsu = otsu_thresholding(df, marker_columns)
print("Otsu Thresholds:", thresholds_otsu)


thresholds_isodata = isodata_thresholding(df, marker_columns)
print("IsoData Thresholds:", thresholds_isodata)

thresholds_gmm = modified_gmm_thresholding(df, marker_columns)
print("GMM Thresholds:", thresholds_gmm)

candidate_thresholds = generate_candidate_thresholds(df, marker_columns)


# Run one marker at a time so each result is printed as soon as it is ready,
# but accumulate the results so thresholds_mce ends up holding every marker
# (like thresholds_otsu / thresholds_isodata / thresholds_gmm) rather than
# only the last one processed.
thresholds_mce = {}
for marker in marker_columns:
    marker_threshold = min_cross_entropy_thresholding(df, [marker], candidate_thresholds)
    thresholds_mce.update(marker_threshold)
    print(f"Minimum Cross-Entropy Threshold for {marker}: {marker_threshold}")



FileNotFoundError: [Errno 2] No such file or directory: '/content/umap_filtered_CC_OC_585_TU1_Immune_1_NEW_thresholded_encoded.csv'