Below compares patients to control PBMCs to define gapfills that are likely variants.
It also includes known (pre-defined) variants to establish feature sets for downstream.
This is pretty specific to this experiment. 

In [1]:
import pandas as pd
import numpy as np
import os
import scipy.stats as st
import matplotlib.pyplot as plt
from gapfill_utilities import utils as gf_utils
from statsmodels.stats.proportion import proportions_ztest


In [2]:
lib = '2'


In [3]:
directory = 'unexpected_gapfill_likelihoods'

# Read all tables in the directory
i = 0
for file in os.listdir(directory):
    if file.endswith('.csv'):  # Assuming the tables are in CSV format
        table_name = os.path.splitext(file)[0].split('gapfills_')[1]
        print(table_name)
        current_table = pd.read_csv(os.path.join(directory, file))
        columns_to_rename = [col for col in current_table.columns if col not in ['gapfill', 'gapfill_from_transcriptome','likelihood','name']]
        current_table.rename(columns={col: col + '_' + table_name for col in columns_to_rename}, inplace=True)
        if i == 0:
            merged_table = current_table
        else:
            merged_table = merged_table.merge(
            current_table,
            on=['name','gapfill','gapfill_from_transcriptome','likelihood'],
            how='outer'
            )
        if len(merged_table.loc[merged_table['name'].isna()]) >0:
            break
        i += 1

## restrict to a minium number of counts to be considered
merged_table.loc[:,merged_table.columns.str.contains('count_of_this_gapfill')] = merged_table.loc[:,merged_table.columns.str.contains('count_of_this_gapfill')].fillna(0)
merged_table.loc[:,merged_table.columns.str.contains('frequency')] = merged_table.loc[:,merged_table.columns.str.contains('frequency')].fillna(0)

merged_table = merged_table.loc[:,~merged_table.columns.str.contains('probe_idx')]


BC009
BC004
BC001
BC006
BC011
BC013
BC010
BC008
BC007
BC015
BC016
BC012
BC014
BC005
BC003
BC002


In [4]:
### add patient id to probe_reads
patient_key = pd.read_csv('/data1/lareauc/projects/gapfill/analysis/20250605_MPN_16plex/MPN_patient_coverage_by_panel.csv')

# Drop rows where either '16-plex 1' or 'patient_id' is missing
mapping_df = patient_key[['16-plex ' + lib, 'patient_id']].dropna()
mapping_df.rename(columns={'16-plex ' + lib: 'BC'}, inplace=True)

# Extract integer from '16-plex 1' (e.g., 'BC003' -> 3)
mapping_df['bc_int'] = mapping_df['BC'].str.split('BC0').str[1].astype(int)

control_idx = mapping_df.loc[mapping_df['patient_id'] == 'control PBMC','bc_int'].values[0]
control_freq_column = 'frequency_BC0' + str(control_idx)
control_gapfill_count_column = 'count_of_this_gapfill_BC0' + str(control_idx)
control_probe_count_column = 'count_of_this_probe_BC0' + str(control_idx)


In [5]:
### now add probe counts for gapfills that were not present in that sample
for BC in mapping_df['BC']:
    gapfill_col = f'count_of_this_gapfill_{BC}'
    probe_col = f'count_of_this_probe_{BC}'
    if gapfill_col in merged_table.columns and probe_col in merged_table.columns:
        summed = merged_table.groupby('name')[gapfill_col].transform('sum')
        merged_table[probe_col] = merged_table[probe_col].fillna(summed)

In [6]:
for BC in mapping_df['BC']:
    col_name = f'likelihood_wt_given_observed_{BC}'
    results = []
    for idx, row in merged_table.iterrows():
        n_gapfill = row.get(f'count_of_this_gapfill_{BC}', np.nan)
        n_probe = row.get(f'count_of_this_probe_{BC}', np.nan)
        n_gapfill_control = row.get(control_gapfill_count_column, np.nan)
        n_probe_control = row.get(control_probe_count_column, np.nan)
        if pd.isna(n_probe_control) or (n_probe_control == 0) or pd.isna(n_gapfill) or (n_gapfill < 10):
            results.append(np.nan)
        else:
            # print(f"BC: {BC}, n_gapfill: {n_gapfill}, n_probe: {n_probe}, n_gapfill_control: {n_gapfill_control}, n_probe_control: {n_probe_control}")
            count = [n_gapfill, n_gapfill_control]
            nobs = [n_probe, n_probe_control]
            try:
                z_stat, p_val = proportions_ztest(count, nobs)
                results.append(p_val)
            except Exception:
                results.append(np.nan)
    merged_table[col_name] = results


  zstat = value / std


  zstat = value / std


  zstat = value / std
  zstat = value / std


  zstat = value / std
  zstat = value / std


  zstat = value / std


In [7]:
p_threshold = 1e-30 ### very stringent for now

feature_set = {}
for BC in mapping_df['BC']:
    feature_set[BC] = merged_table.loc[(merged_table['likelihood_wt_given_observed_' + BC] < p_threshold)][['name', 'gapfill', 'gapfill_from_transcriptome', 'frequency_' + BC, 'count_of_this_gapfill_' + BC, 'likelihood_wt_given_observed_' + BC]]
    feature_set[BC] = feature_set[BC].loc[feature_set[BC]['gapfill'] != feature_set[BC]['gapfill_from_transcriptome']]
    ### for now only allow 1 alt per site. take the one least likely to be by chance
    feature_set[BC] = feature_set[BC].loc[feature_set[BC].groupby('name')['likelihood_wt_given_observed_' + BC].idxmin()]
    feature_set[BC].to_csv('likelihood_tables/' + BC + '_alt_gapfill_set.csv',index=False)


In [8]:
dfs = []
for bc in feature_set:
    df_tmp = feature_set[bc].copy()
    df_tmp.columns = df_tmp.columns.str.replace('_' + bc, '')
    df_tmp['BC'] = bc
    dfs.append(df_tmp)
all_features_df = pd.concat(dfs, ignore_index=True)
all_features_df.to_csv('likelihood_tables/all_gapfill_features.csv', index=False)
all_features_df.groupby(['name','gapfill','gapfill_from_transcriptome']).size().sort_values(ascending=False)


##### save a version with just the unique gapfills - THIS NEEDS TO BE FIXED LATER
all_features_df.sort_values('likelihood_wt_given_observed').drop_duplicates(subset='name').to_csv('likelihood_tables/all_gapfill_features_filtered.csv', index=False)

In [9]:
BC = 'BC016'
feature_set[BC].sort_values('frequency_' + BC)

Unnamed: 0,name,gapfill,gapfill_from_transcriptome,frequency_BC016,count_of_this_gapfill_BC016,likelihood_wt_given_observed_BC016
103,ASXL1 c.1934dupG,CCCC,CCC,0.165103,88.0,3.6376119999999997e-38
2668,JAK2 c.1849G>T,AAC,CAC,0.964167,13884.0,0.0


In [10]:
patient_id = mapping_df.loc[mapping_df['BC'] == BC, 'patient_id'].values[0]
patient_metadata = pd.read_csv('/home/blattms1/projects/gapfill/patient_JAK2/20250605_16plex_MPN/patient_metadata.csv', header = 1)
patient_metadata['patient_id'] = patient_metadata['patient id'].astype(str)
patient_metadata.loc[patient_metadata['patient_id'] == patient_id]

Unnamed: 0,Date of Blood Sample,CD34+ Cell number,Viability,Driver mutation,Mutation 2,Mutation 3,Mutation 4,Mutation 5,Mutation 6,Mutation 7,Comment,Molecular evaluation,patient id,patient_id
10,2/18/22,5x105,>80%,c.1849G>T ; JAK2V617F 95%,ASXL1 c.1934dupG p.G646Wfs*12 42.2%,EZH2 c.2064T>A p.(N688K) 43.6%,NRAS c.35G>A p.(G12D) 1.3%,,,,Thawed,9/28/22,10,10
