In [None]:
##### The spatial function values were generated using R from the cell assignments under dir /mnt/data10/shared/yujie/DCIS/ANALYSIS/IHC_communityAssignment
##### and the hull JSONs under /mnt/data10/shared/yujie/DCIS/ANALYSIS/concaveHulls


In [None]:
import json
import numpy as np
import pandas as pd

def integrate_range(r_values, val, theo, lower_bound, upper_bound):
    """Integrate (val - theo) over r for lower_bound <= r < upper_bound.

    Parameters
    ----------
    r_values, val, theo : sequence of numbers
        Sample positions and the two curves evaluated at those positions.
        All three must have the same length. ``None`` entries are converted
        to NaN, which makes the resulting integral NaN.
    lower_bound : number
        Inclusive lower limit of the r window.
    upper_bound : number
        Exclusive upper limit of the r window.

    Returns
    -------
    float
        Trapezoidal-rule integral of ``val - theo`` with respect to ``r``
        over the samples inside the window. The result is 0.0 when fewer
        than two samples fall inside the window.
    """
    # Convert each input once; dtype=float maps None to NaN instead of
    # producing an object array that fails on subtraction.
    r = np.asarray(r_values, dtype=float)
    observed = np.asarray(val, dtype=float)
    expected = np.asarray(theo, dtype=float)

    in_window = (r >= lower_bound) & (r < upper_bound)

    # np.trapz was deprecated in NumPy 2.0 in favour of np.trapezoid;
    # prefer the new name and fall back on older NumPy versions.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    return trapezoid(observed[in_window] - expected[in_window], r[in_window])

def initialize_columns():
    """Build the ordered list of output column names.

    The list starts with 'patient_cluster' and is followed by one name per
    (function, r-range) combination, in this order:

    1. G, F and L for all points.
    2. subG, subF and subL for each single stain (CA9, Glut1, LAMP2b).
    3. G_cross and L_cross for each stain pair.

    The r-ranges are the ten consecutive width-10 bins from 0 to 100.
    """
    bins = [(start, start + 10) for start in range(0, 100, 10)]

    # Identifier column always comes first.
    names = ['patient_cluster']

    # G/F/L over all points.
    names += [
        f"{fn}_{lo}<=r<{hi}"
        for fn in ('G', 'F', 'L')
        for lo, hi in bins
    ]

    # G/F/L restricted to points positive for a single marker.
    names += [
        f"{marker}_{fn}_{lo}<=r<{hi}"
        for marker in ('CA9', 'Glut1', 'LAMP2b')
        for fn in ('subG', 'subF', 'subL')
        for lo, hi in bins
    ]

    # Cross G/L for each pair of markers.
    names += [
        f"{first}&{second}_{fn}_{lo}<=r<{hi}"
        for first, second in (('CA9', 'Glut1'), ('CA9', 'LAMP2b'), ('Glut1', 'LAMP2b'))
        for fn in ('G_cross', 'L_cross')
        for lo, hi in bins
    ]

    return names

def _integrate_curve(row, column_prefix, curve, range_list):
    """Store the integral of one curve over every r-range in `row`."""
    for lower, upper in range_list:
        row[f"{column_prefix}_{lower}<=r<{upper}"] = integrate_range(
            curve['r'], curve['val'], curve['theo'], lower, upper)


def process_json_file(file_path):
    """Load a spatial-functions JSON file and integrate every curve per r-range.

    The JSON top level is read as a mapping from a patient/cluster key to a
    dict holding:

    * 'G', 'F', 'L' (required): each a dict with 'r', 'val' and 'theo' lists.
    * 'sub_GFL' (optional): a dict of stain -> {'subG', 'subF', 'subL'} curves.
    * 'cross_functions' (optional): a dict of pair name ->
      {'G_cross', 'L_cross'} curves.

    Optional sections, stains, pairs or functions that are absent or are not
    dicts are skipped and their columns stay None (NA in the DataFrame).

    Parameters
    ----------
    file_path : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        One row per top-level key, with the columns returned by
        ``initialize_columns()``.

    Raises
    ------
    KeyError
        If an entry lacks one of the required 'G', 'F' or 'L' curves.
    """
    with open(file_path, 'r') as file:
        all_data = json.load(file)

    columns = initialize_columns()
    range_list = [(lower, lower + 10) for lower in range(0, 100, 10)]
    rows = []

    for key, data in all_data.items():
        # Start with every column set to None so missing curves end up as NA.
        row = dict.fromkeys(columns)
        row['patient_cluster'] = key

        # G, F, L over all points are mandatory for every entry.
        for function in ['G', 'F', 'L']:
            _integrate_curve(row, function, data[function], range_list)

        # Per-stain sub functions; guarded the same way as the cross functions.
        sub_gfl = data.get('sub_GFL')
        if isinstance(sub_gfl, dict):
            for stain, stain_data in sub_gfl.items():
                if not isinstance(stain_data, dict):
                    continue
                for function in ['subG', 'subF', 'subL']:
                    curve = stain_data.get(function)
                    if isinstance(curve, dict):
                        _integrate_curve(row, f"{stain}_{function}", curve, range_list)

        # Cross functions for marker pairs.
        cross_functions = data.get('cross_functions')
        if isinstance(cross_functions, dict):
            for pair_name, pair_data in cross_functions.items():
                if not isinstance(pair_data, dict):
                    continue
                for function in ['G_cross', 'L_cross']:
                    curve = pair_data.get(function)
                    if isinstance(curve, dict):
                        _integrate_curve(row, f"{pair_name}_{function}", curve, range_list)

        rows.append(row)

    # Passing columns= fixes the column order; row keys that are not in
    # `columns` (e.g. an unexpected stain or pair name) are not included.
    return pd.DataFrame(rows, columns=columns)


##remove columns with too many NAs or 0s
def cleanData(df):
    """Return a copy of `df` without columns that are mostly NA or mostly zero.

    A column is dropped when more than 30% of its entries are NA, or when
    more than 50% of its entries equal 0. The input frame is not modified.
    """
    n_rows = len(df)
    na_fraction = df.isna().sum() / n_rows
    zero_fraction = df.eq(0).sum() / n_rows

    # A column is removed if it violates either threshold.
    too_sparse = (na_fraction > 0.3) | (zero_fraction > 0.5)
    return df.drop(columns=df.columns[too_sparse.to_numpy()])

In [None]:
##### CL 100 5: integrate, clean and export the exc44 and biopsy spatial functions
for file_path, csv_path in [
    # exc44
    ('/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/exc44_CL_100_5/exc44_CL_123_123_all_spatial_functions.json',
     '/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/exc44_CL_100_5/spat_integration.csv'),
    # biopsy
    ('/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/biopsy_CL_100_5/biopsy_CL_123_123_all_spatial_functions.json',
     '/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/biopsy_CL_100_5/spat_integration.csv'),
]:
    df = process_json_file(file_path)
    cleaned_df = cleanData(df)
    cleaned_df.to_csv(csv_path, index=False)
    print(cleaned_df.shape)
    print(cleaned_df.head())

In [None]:
##### C 100 5: integrate, clean and export the exc44 and biopsy spatial functions
for file_path, csv_path in [
    # exc44
    ('/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/exc44_C_100_5/all_spatial_functions.json',
     '/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/exc44_C_100_5/spat_integration.csv'),
    # biopsy
    ('/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/biopsy_C_100_5/all_spatial_functions.json',
     '/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/biopsy_C_100_5/spat_integration.csv'),
]:
    df = process_json_file(file_path)
    cleaned_df = cleanData(df)
    cleaned_df.to_csv(csv_path, index=False)
    print(cleaned_df.shape)
    print(cleaned_df.head())

In [None]:
##### L 100 5: integrate, clean and export the exc44 and biopsy spatial functions
for file_path, csv_path in [
    # exc44
    ('/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/exc44_L_100_5/all_spatial_functions.json',
     '/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/exc44_L_100_5/spat_integration.csv'),
    # biopsy
    ('/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/biopsy_L_100_5/all_spatial_functions.json',
     '/mnt/data10/shared/yujie/DCIS/ANALYSIS/R_analysis/biopsy_L_100_5/spat_integration.csv'),
]:
    df = process_json_file(file_path)
    cleaned_df = cleanData(df)
    cleaned_df.to_csv(csv_path, index=False)
    print(cleaned_df.shape)
    print(cleaned_df.head())