In [26]:
import pandas as pd
import numpy as np

df = pd.read_table('/data2/deepLN/table.kor_sfari_mssng.DNV_annotated.coding_combinations.20250226.tsv.gz')
df.rename(columns=lambda x: x.replace("is_", "") if "is_" in x else x, inplace=True)

# Define column groups
celltype_cols = ['MGE.dev', 'CGE.dev', 'AST', 'L2.3', 'L4', 'L5', 'L5.6.IT', 'L6', 'MG', 'OL', 'END', 'PER', 'SP']
asd_gene_cols = ['DDD285', 'CHD8Common', 'FMRPDarnell', 'ASD185']
varianttype_cols_coding_1 = ['PTV', 'modMIS', 'dMIS']  # coding == 1에서만 카운트
varianttype_cols_coding_0 = ['promoter', 'UTR']  # coding == 0에서만 카운트

In [27]:
# Function to generate combinations
def generate_combinations(row):
    A_matches = [col for col in celltype_cols if row[col] == 1] or ["None"]
    B_matches = [col for col in asd_gene_cols if row[col] == 1] or ["None"]
    
    if row['coding'] == 1:
        C_matches = [col for col in varianttype_cols_coding_1 if row[col] == 1] or ["None"]
    else:
        C_matches = [col for col in varianttype_cols_coding_0 if row[col] == 1] or ["None"]
    
    combinations = [f"{A}_{B}_{C}" for A in A_matches for B in B_matches for C in C_matches]
    return combinations

# Apply function
df['combinations'] = df.apply(generate_combinations, axis=1)
all_combinations = set(comb for row_combs in df['combinations'] for comb in row_combs)

In [28]:
# Create new binary columns
temp_df = pd.DataFrame(0, index=df.index, columns=list(all_combinations))
for idx, row in df.iterrows():
    for comb in row['combinations']:
        temp_df.at[idx, comb] = 1

df = pd.concat([df, temp_df], axis=1)
df = df.drop(columns=['combinations'])
columns_to_remove = celltype_cols + asd_gene_cols + varianttype_cols_coding_1 + varianttype_cols_coding_0 + ['gene_id', 'gene_name']
df = df.drop(columns=columns_to_remove)

df.to_csv("/data2/deepLN/kor_sfari_mssng.DNV_annotated.coding_combinations.agg.20250227.tsv.gz", sep="\t", index=False, compression="gzip")

In [29]:
counts_by_sample_coding_1 = df[df['coding'] == 1].drop(columns=['variant', 'coding']).groupby('SAMPLE').sum()
counts_by_sample_coding_1 = counts_by_sample_coding_1.add_prefix('cd_')
proportions_coding_1 = df[df['coding'] == 1].drop(columns=['variant', 'coding']).groupby('SAMPLE').apply(lambda x: (x == 1).mean())
proportions_coding_1 = proportions_coding_1.add_prefix('prop_cd_')

counts_by_sample_coding_0 = df[df['coding'] == 0].drop(columns=['variant', 'coding']).groupby('SAMPLE').sum()
counts_by_sample_coding_0 = counts_by_sample_coding_0.add_prefix('nc_')
proportions_coding_0 = df[df['coding'] == 0].drop(columns=['variant', 'coding']).groupby('SAMPLE').apply(lambda x: (x == 1).mean())
proportions_coding_0 = proportions_coding_0.add_prefix('prop_nc_')

counts_combined = pd.concat([counts_by_sample_coding_1, counts_by_sample_coding_0], axis=1)
counts_combined_with_proportions = pd.concat([counts_combined, proportions_coding_1, proportions_coding_0], axis=1)

# Remove unwanted columns
for col in list(counts_combined_with_proportions.columns):
    if any(f"nc_" in col and vt in col for vt in varianttype_cols_coding_1) or \
       any(f"cd_" in col and vt in col for vt in varianttype_cols_coding_0):
        counts_combined_with_proportions = counts_combined_with_proportions.drop(columns=[col])

counts_combined_with_proportions = counts_combined_with_proportions.drop(columns=[col for col in counts_combined_with_proportions.columns if 'SAMPLE' in col])
counts_combined_with_proportions = counts_combined_with_proportions.reset_index().fillna(0)

  proportions_coding_1 = df[df['coding'] == 1].drop(columns=['variant', 'coding']).groupby('SAMPLE').apply(lambda x: (x == 1).mean())
  proportions_coding_0 = df[df['coding'] == 0].drop(columns=['variant', 'coding']).groupby('SAMPLE').apply(lambda x: (x == 1).mean())


In [30]:
# PTV가 포함된 컬럼 출력
ptv_columns = [col for col in counts_combined_with_proportions.columns if "promoter" in col]
print(ptv_columns)


['nc_SP_DDD285_promoter', 'nc_END_FMRPDarnell_promoter', 'nc_None_DDD285_promoter', 'nc_MGE.dev_ASD185_promoter', 'nc_CGE.dev_DDD285_promoter', 'nc_None_None_promoter', 'nc_None_FMRPDarnell_promoter', 'nc_PER_None_promoter', 'nc_AST_ASD185_promoter', 'nc_L2.3_FMRPDarnell_promoter', 'nc_MG_None_promoter', 'nc_CGE.dev_None_promoter', 'nc_SP_CHD8Common_promoter', 'nc_MGE.dev_CHD8Common_promoter', 'nc_L4_FMRPDarnell_promoter', 'nc_OL_None_promoter', 'nc_SP_None_promoter', 'nc_AST_CHD8Common_promoter', 'nc_OL_CHD8Common_promoter', 'nc_None_ASD185_promoter', 'nc_L4_ASD185_promoter', 'nc_AST_FMRPDarnell_promoter', 'nc_END_CHD8Common_promoter', 'nc_L2.3_None_promoter', 'nc_L4_None_promoter', 'nc_AST_DDD285_promoter', 'nc_L5_FMRPDarnell_promoter', 'nc_MGE.dev_DDD285_promoter', 'nc_MGE.dev_None_promoter', 'nc_CGE.dev_FMRPDarnell_promoter', 'nc_L5.6.IT_FMRPDarnell_promoter', 'nc_MG_ASD185_promoter', 'nc_L4_CHD8Common_promoter', 'nc_AST_None_promoter', 'nc_MGE.dev_FMRPDarnell_promoter', 'nc_MG_FMR

In [31]:
counts_combined_with_proportions.to_csv("/data2/deepLN/kor_sfari_mssng.coding_combinations.agg_by_sample.20250307.tsv.gz", sep="\t", index=False, compression="gzip")