In [1]:
import sys
sys.path.append("..")
from pathlib import Path
import numpy as np
from scipy import stats
import importlib
import src.assay_calibration.fit_utils.two_sample.fit
from src.assay_calibration.fit_utils.fit import Fit
importlib.reload(src.assay_calibration.fit_utils.two_sample.fit)
importlib.reload(src.assay_calibration.fit_utils.fit)
from src.assay_calibration.fit_utils.two_sample.fit import single_fit
from src.assay_calibration.fit_utils.two_sample import (density_utils,constraints, optimize)
import scipy.stats as sps
import matplotlib
matplotlib.set_loglevel("warning")
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import trange
import os
sys.path.append(str(Path(os.getcwd()).parent))
from src.assay_calibration.data_utils.dataset import (
    PillarProjectDataframe,
    Scoreset,
    BasicScoreset,
)
import json
import glob

In [2]:
def test_fit(ds, component_range, check_monotonic, init_strategy):
    """Run a non-bootstrapped ``Fit`` on ``ds`` and return the resulting fits.

    Args:
        ds: Scoreset-like object. It is handed to ``Fit`` and its ``scores``
            attribute must support ``min()`` / ``max()``.
        component_range: Forwarded unchanged to ``Fit.run(component_range=...)``.
        check_monotonic: Forwarded unchanged to ``Fit.run(check_monotonic=...)``.
        init_strategy: Forwarded unchanged to ``Fit.run(init_strategy=...)``.

    Returns:
        Tuple ``(fits, ds)``: the first element of the 3-tuple returned by
        ``Fit.run`` and the very same ``ds`` object that was passed in.
    """
    fitter = Fit(ds)

    # All remaining options are fixed for this experiment. The score window
    # passed to the fit is the observed score range padded by one unit on
    # each side.
    run_options = dict(
        core_limit=10,
        num_fits=20,
        verbose_level=20,
        component_range=component_range,
        bootstrap=False,
        verbose=False,
        max_em_iters=10000,
        check_convergence=False,
        check_monotonic=check_monotonic,
        init_strategy=init_strategy,
        score_min=ds.scores.min() - 1,
        score_max=ds.scores.max() + 1,
        init_constraint_adjustment_param="skew",
    )

    # Only the first of the three returned values is used here.
    fits, _second, _third = fitter.run(**run_options)
    return fits, ds


In [3]:
# Accumulator filled by the plotting loop below:
#   all_results[dataset_name][(component_range, monotonicity_constraint)]
#       -> list of fit results sorted by final likelihood, best first.
# Re-running this cell discards everything collected so far.
all_results = {}

In [4]:
# For every matching scoreset JSON: fit each (component count, monotonicity
# constraint) configuration, keep the sorted fits in `all_results`, and save one
# PNG per dataset with a grid of panels (rows = samples, columns =
# configurations) showing the best fit's estimated density over a histogram of
# the observed scores.
import glob
import logging
import os

import numpy as np

# Create output directory if it doesn't exist
output_dir = "/data/ross/assay_calibration/test_experimental_plots_v12"
os.makedirs(output_dir, exist_ok=True)

# Suppress matplotlib debug messages
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# Number of unconstrained best fits that nevertheless satisfy the density
# constraint.
n_non_violating_fits_without_constraint = 0

# glob returns paths in arbitrary order; sort so runs are reproducible.
dataset_fs = sorted(glob.glob("/data/ross/assay_calibration/scoresets/DDX*_missense_replace_gnomad_modified*.json"))

component_options = ["2-component", "3-component"]
constraint_options = ["constraint", "no constraint"]
n_configurations = len(component_options) * len(constraint_options)
init_strategy = "random"

for dataset_f in dataset_fs:

    # File name without directory and without the ".json" extension.
    dataset_name = os.path.splitext(os.path.basename(dataset_f))[0]

    print(f"Processing {dataset_name}")

    ds = Scoreset.from_json(dataset_f, five_sample=True)
    print(ds)

    sample_names = np.array([sample[1] for sample in ds.samples])

    # One row per sample, one column per fit configuration. squeeze=False keeps
    # `axes` two-dimensional even when the dataset has a single sample.
    fig, axes = plt.subplots(
        len(sample_names),
        n_configurations,
        figsize=(18, 5 * len(sample_names)),
        squeeze=False,
    )
    fig.suptitle(f'Dataset: {dataset_name}', fontsize=16, y=0.995)

    all_results[dataset_name] = {}

    plot_idx = 0
    for component_range in component_options:
        for monotonicity_constraint in constraint_options:

            # Run the fit
            try:
                fits, ds = test_fit(
                    ds,
                    component_range=[3] if component_range[0] == "3" else [2],
                    check_monotonic=monotonicity_constraint == "constraint",
                    init_strategy=init_strategy,
                )

                # Best fit = highest final likelihood.
                fit_results = sorted(fits, key=lambda res: res['likelihoods'][-1], reverse=True)
                times_submerged = [results['times_submerged'] for results in fit_results]
                print(dataset_name, component_range, monotonicity_constraint, times_submerged)
                best_fit = fit_results[0]
                best_init = "MoM" if best_fit['kmeans'] == "method_of_moments" else "KM"
                xlims = best_fit['xlims']
                scores = ds.scores
                sample_assignments = ds.sample_assignments

                all_results[dataset_name][(component_range, monotonicity_constraint)] = fit_results

                # Calculate densities on a regular grid spanning the observed scores
                score_range = np.linspace(scores.min(), scores.max(), 1000)
                estimatedDensities = np.array([
                    density_utils.joint_densities(
                        score_range[..., None],
                        best_fit['component_params'],
                        sample_weights
                    ).squeeze() for sample_weights in best_fit['weights']
                ])

                # Check if density constraint violated
                constraint_violated = constraints.multicomponent_density_constraint_violated(
                    best_fit['component_params'], xlims
                )
                # The counter tracks NON-violating unconstrained fits, so it
                # must only advance when the check reports no violation.
                if monotonicity_constraint == "no constraint" and not constraint_violated:
                    n_non_violating_fits_without_constraint += 1
                fit_violates_constraint = "violates" if constraint_violated else "not violates"

                # Per-sample likelihood divided by the number of scores assigned
                # to that sample (column sums of the assignment matrix).
                sample_sizes = sample_assignments.sum(axis=0)
                sample_lls = density_utils.get_sample_likelihood(
                    scores, sample_assignments, best_fit['component_params'], best_fit['weights']
                ) / sample_sizes

                likelihood = best_fit['likelihoods'][-1]

                # One panel per sample in this configuration's column
                for i in range(len(estimatedDensities)):
                    ax = axes[i, plot_idx]

                    # Plot estimated density
                    ax.plot(score_range, estimatedDensities[i].sum(0),
                            label='Estimated', color='C1', linestyle='-', linewidth=2)

                    # Plot histogram of actual data
                    if i < sample_assignments.shape[1] and sample_assignments[:, i].sum() > 0:
                        sample_data = scores[sample_assignments[:, i]]
                        counts, _, _ = ax.hist(sample_data,
                                               bins=30, density=True, alpha=0.3,
                                               color='gray', label='Data')

                        # Set ylim to 1.2x the histogram max to avoid components with 1e99 density and no scale
                        max_hist_height = counts.max()
                        ax.set_ylim(0, max_hist_height * 1.2)

                    # Set labels and title
                    if plot_idx == 0:
                        ax.set_ylabel(sample_names[i],
                                      fontsize=12, fontweight='bold')

                    if i == 0:
                        ax.set_title(f'{component_range}, {monotonicity_constraint}, {best_init}, {fit_violates_constraint}', fontsize=12)

                    if i == len(estimatedDensities) - 1:
                        ax.set_xlabel('Score', fontsize=10)

                    ax.legend(loc='upper right', fontsize=8)
                    ax.grid(True, alpha=0.2)

                    # Add likelihood value as text
                    ax.text(0.02, 0.98, f'SLL: {sample_lls[i]:.3f}, LL: {likelihood:.3f}, n={len(scores[sample_assignments[:, i]])}',
                            transform=ax.transAxes, fontsize=12,
                            verticalalignment='top')

            except Exception as e:
                # Best effort: report the failure and mark the whole column as
                # failed, then carry on with the next configuration.
                print(f"  Error with {component_range}, {monotonicity_constraint}: {e}")
                # The handler must not rely on anything assigned inside the try
                # block (it may be undefined or left over from an earlier
                # iteration), so it is driven by the sample rows only.
                for i in range(len(sample_names)):
                    ax = axes[i, plot_idx]
                    # Drop anything drawn before the failure so the panel shows
                    # only the error message.
                    ax.clear()
                    ax.text(0.5, 0.5, f'Error:\n{str(e)[:30]}...',
                            ha='center', va='center', transform=ax.transAxes)
                    ax.set_xticks([])
                    ax.set_yticks([])

                    if i == 0:
                        ax.set_title(f'{component_range}, {monotonicity_constraint}', fontsize=12)

                    if plot_idx == 0:
                        ax.set_ylabel(sample_names[i],
                                      fontsize=12, fontweight='bold')

                    if i == len(sample_names) - 1:
                        ax.set_xlabel('Score', fontsize=10)

            plot_idx += 1

    # Adjust layout and save; operate on `fig` explicitly rather than on
    # pyplot's implicit current figure, and close it to free memory.
    fig.tight_layout(rect=[0, 0, 1, 0.99])

    output_path = f"{output_dir}/{dataset_name}.png"
    fig.savefig(output_path, dpi=100, bbox_inches='tight')
    plt.close(fig)

    print(f"  Saved to {output_path}")

print("All plots saved!")

Processing DDX3X_Radford_2023_cLFC_day15
DDX3X_Radford_2023_cLFC_day15: 9080 total variants
	Pathogenic/Likely Pathogenic: 201 variants
	Benign/Likely Benign: 88 variants
	gnomAD: 524 variants
	Synonymous: 1245 variants
	All Missense SNV: 4380 variants

DDX3X_Radford_2023_cLFC_day15 2-component constraint [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
DDX3X_Radford_2023_cLFC_day15 2-component no constraint [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
DDX3X_Radford_2023_cLFC_day15 3-component constraint [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
DDX3X_Radford_2023_cLFC_day15 3-component no constraint [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
  Saved to /data/ross/assay_calibration/test_experimental_plots_5sample/DDX3X_Radford_2023_cLFC_day15.png
All plots saved!
