In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# Fix the seed so the synthetic stand-in data is identical on every run.
np.random.seed(0)
N_BEADS = 100

# Synthetic stand-in for the removed proprietary dataset: one row per bead.
df = pd.DataFrame({
    'BB1': np.random.choice(['A', 'B', 'C', 'D'], N_BEADS),
    'BB2': np.random.choice(['E', 'F', 'G', 'H'], N_BEADS),
    'BB3': np.random.choice(['I', 'J', 'K', 'L'], N_BEADS),
    'fluorescence': np.random.normal(1000, 200, N_BEADS)
})

# The pre-processing cell reads 'Bead_ID', 'FOV', 'BL' and '81nM'; without them the
# notebook cannot run from a fresh kernel. They are drawn AFTER the original columns
# so the seeded values of BB1/BB2/BB3/fluorescence are unchanged.
df['Bead_ID'] = np.random.choice(np.arange(1, 200 * 200 + 1), N_BEADS, replace=False)  # 1-based id on a 200x200 grid
df['FOV'] = np.random.randint(1, 5, N_BEADS)
df['BL'] = np.random.normal(200, 50, N_BEADS)
# NOTE(review): using 'fluorescence' as the 81nM reading is an assumption made only to
# keep the placeholder self-consistent — confirm against the real schema.
df['81nM'] = df['fluorescence']
df.head()

# Pre-Processing

- Baseline correction: I subtracted the baseline (no protein) intensity from the 81nM condition to isolate the change in signal attributed to binding.

- Spatial mapping: I converted Bead_IDs into (row, column) coordinates on a 200×200 grid, accounting for right-to-left, row-major ordering.

- Spatial background correction: I used a K-nearest neighbors approach within each FOV to estimate local background levels by computing the median intensity of nearby beads, then subtracting this value from each bead’s intensity.


In [None]:
# One identifier per bead: the three building-block labels joined with underscores.
bb_labels = df[['BB1', 'BB2', 'BB3']].astype(str)
molecule_ids = bb_labels['BB1'] + "_" + bb_labels['BB2'] + "_" + bb_labels['BB3']
df['molecule'] = molecule_ids
num_unique_molecules = molecule_ids.nunique()
print(f"Unique molecules: {num_unique_molecules}")


In [None]:
# Distinct labels used at the first two building-block positions.
bb1_set = set(df["BB1"])
bb2_set = set(df["BB2"])

# Labels that occur at both positions.
overlap = bb1_set.intersection(bb2_set)

overlap_sorted = sorted(overlap)
print(f"BB1 values also found in BB2: {overlap_sorted}")


In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Binding signal: 81nM reading minus the baseline (BL) reading.
df['intensity'] = df['81nM'].sub(df['BL'])

# Bead_ID is 1-based; shift to 0-based, then split into a row index and a
# column index on a 200-wide grid. Columns are mirrored (199 - offset) because
# beads are numbered right-to-left within each row.
zero_based_id = df['Bead_ID'] - 1
df['row'] = zero_based_id // 200
df['col'] = 199 - (zero_based_id % 200)

# smoothing function for nearest neighbors
def smooth_by_knn(group, k=8):
    """Subtract a local background from each bead's intensity within one group.

    The background of a bead is the median 'intensity' of its (up to) ``k``
    nearest neighbours in ('row', 'col') space, never counting the bead itself.
    The corrected value is clipped at zero.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group (one FOV at the call site); must contain the columns
        'row', 'col' and 'intensity'.
    k : int, default 8
        Number of neighbours used for the local median. Capped at
        ``len(group) - 1``.

    Returns
    -------
    pandas.Series
        Clipped, background-subtracted intensities indexed like ``group``.

    Notes
    -----
    * Neighbours are queried with ``kneighbors()`` without an argument, which
      makes scikit-learn exclude each indexed point from its own neighbour
      list. The previous version queried the fitted points explicitly and
      dropped position 0, which only removes the bead itself when no other
      bead shares its coordinates.
    * Groups with a single bead (or ``k < 1``) have no neighbours; their
      background is 0 and they are clipped at zero like every other bead
      (previously single-bead groups were returned unclipped).
    """
    coords = group[['row', 'col']].values
    intensities = group['intensity'].values
    n_points = len(coords)

    # Self is excluded by kneighbors(), so at most n_points - 1 neighbours exist.
    n_neighbors = min(k, n_points - 1)

    if n_neighbors < 1:
        # No neighbours available: nothing to subtract.
        local_bg = np.zeros(n_points)
    else:
        nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(coords)
        _, indices = nbrs.kneighbors()
        # indices has shape (n_points, n_neighbors); take the row-wise median.
        local_bg = np.median(intensities[indices], axis=1)

    corrected = np.maximum(intensities - local_bg, 0)  # clip negatives at zero
    return pd.Series(corrected, index=group.index)

# Run the KNN background correction separately for every field of view (FOV).
spatially_corrected = df.groupby('FOV', group_keys=False).apply(smooth_by_knn)
df['spatially_corrected'] = spatially_corrected

# final_intensity is the column all downstream hit calling works on.
df['final_intensity'] = df['spatially_corrected']
final_median = df['final_intensity'].median()
print(final_median)


In [None]:
# Number of beads (replicates) per molecule.
bead_counts = df.groupby(['BB1', 'BB2', 'BB3']).size().reset_index(name='num_beads')
print(len(bead_counts))

# Molecules below each replicate cutoff. Each cutoff has its own variable: previously
# `few1` was assigned twice (< 6, then < 5), so the "< 6" line printed the "< 5" count.
few_lt5 = bead_counts[bead_counts['num_beads'] < 5]
few1 = bead_counts[bead_counts['num_beads'] < 6]
few2 = bead_counts[bead_counts['num_beads'] < 8]
few3 = bead_counts[bead_counts['num_beads'] < 10]
few4 = bead_counts[bead_counts['num_beads'] < 15]

print(f"Number of molecules with < 5 beads: {len(few_lt5)}")
print(f"Number of molecules with < 6 beads: {len(few1)}")
print(f"Number of molecules with < 8 beads: {len(few2)}")
print(f"Number of molecules with < 10 beads: {len(few3)}")
print(f"Number of molecules with < 15 beads: {len(few4)}")


In [None]:
# Replicates (beads) observed for every unique BB1/BB2/BB3 combination.
replicates_per_molecule = df.groupby(['BB1', 'BB2', 'BB3']).size()
bead_counts = replicates_per_molecule.reset_index(name='num_beads')
total = bead_counts.shape[0]
print(total)

# For each cutoff, report how many molecules fall below it (count and share of all molecules).
thresholds = [5, 6, 8, 10, 15]
for t in thresholds:
    below_cutoff = bead_counts['num_beads'].lt(t)
    count = below_cutoff.sum()
    percent = 100 * count / total
    print(f"Number of molecules with < {t} beads: {count:,} ({percent:.2f}%)")


In [None]:
import matplotlib.pyplot as plt

# Replicate count per molecule, then a histogram of those counts.
bead_counts = df.groupby(['BB1', 'BB2', 'BB3']).size().reset_index(name='num_beads')
num_beads = bead_counts['num_beads']
print(f"Total unique molecules: {len(bead_counts)}")
print(f"Number of molecules with< 6 beads: {len(bead_counts[num_beads < 6])}")
print(f"Number of molecules with < 8 beads: {len(bead_counts[num_beads < 8])}")
print(f"Number of molecules with < 10 beads: {len(bead_counts[num_beads < 10])}")
print(f"Number of molecules with < 15 beads: {len(bead_counts[num_beads < 15])}")

max_beads = num_beads.max()

fig, ax = plt.subplots(figsize=(10, 6))
# One bin per integer bead count, from 1 up to the largest observed count.
ax.hist(num_beads, bins=range(1, max_beads + 2), edgecolor='black', alpha=0.75)
ax.axvline(x=5, color='red', linestyle='--', linewidth=2, label='5-bead cutoff')
ax.set_xticks(range(0, max_beads + 5, 5))

ax.set_title('Distribution of Bead Counts per Molecule')
ax.set_xlabel('Number of Beads')
ax.set_ylabel('Number of Molecules')
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


# Hit Calling
To identify molecular binders with statistical confidence, I restricted analysis to molecules with five or more bead replicates, ensuring sufficient power for hypothesis testing. As shown in the histogram above, the distribution of bead counts is right-skewed, with few molecules represented by more than 10 beads. Setting a five-bead cutoff provides a conservative filter that reduces the risk of false positives due to undersampling. For these qualifying molecules, I applied the Wilcoxon signed-rank test to evaluate whether their spatially corrected final_intensity values (81nM − BL) were significantly greater than the global median. This test is suited for skewed data and does not assume a normal distribution of values.

Given the scale of the dataset and the large number of statistical tests performed, I applied the Benjamini-Hochberg procedure to control the false discovery rate (FDR) and improve the reliability of detected hits. However, because standard thresholds such as FDR < 0.05 yielded no hits despite evidence of signal, I used a more permissive threshold of FDR < 0.24. This choice reflects that the screen is an exploratory, high-throughput assay, where some tolerance for false positives is acceptable in exchange for increased sensitivity. I understand that an FDR threshold this permissive is normally discouraged, and I would investigate other methods of multiple-testing correction if I had more time.

In [None]:
from scipy.stats import wilcoxon


threshold = df['final_intensity'].median()

def wilcoxon_molecule(group):
    """Summarise one molecule's beads and test them against the global threshold.

    Runs a one-sided Wilcoxon signed-rank test (alternative='greater') on
    ``final_intensity - threshold``, where ``threshold`` is the module-level
    value set before this function is applied.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one molecule; must contain 'final_intensity'.

    Returns
    -------
    pandas.Series
        'mean_signal', 'std_signal' (numpy default, ddof=0), 'num_beads',
        'pval' and 'source'. 'pval' is NaN when fewer than five differences
        are non-zero or when the test raises ValueError.
    """
    values = group['final_intensity'].values
    differences = values - threshold

    pval = np.nan
    # Zero differences carry no sign information; require at least five informative ones.
    if np.count_nonzero(differences) >= 5:
        try:
            _, pval = wilcoxon(differences, alternative='greater')
        except ValueError:
            pval = np.nan

    return pd.Series({
        'mean_signal': np.mean(values),
        'std_signal': np.std(values),
        'num_beads': len(values),
        'pval': pval,
        'source': 'wilcoxon'
    })


In [None]:
# i also tried doing a t-test, but the distribution is not normal so this was thrown out.

def t_test_molecule(group):
    """Summarise one molecule's beads and run a one-sided one-sample t-test.

    Tests whether the mean of 'final_intensity' is greater than the
    module-level ``threshold``. Kept for reference only: the surrounding notes
    state the t-test was discarded because the data are not normal.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one molecule; must contain 'final_intensity'.

    Returns
    -------
    pandas.Series
        'mean_signal', 'std_signal' (numpy default, ddof=0), 'num_beads',
        'pval' and 'source'.
    """
    # Imported here because no visible cell imports ttest_1samp; without this the
    # function raises NameError on first call.
    from scipy.stats import ttest_1samp

    values = group['final_intensity'].values
    _, pval = ttest_1samp(values, popmean=threshold, alternative='greater')
    return pd.Series({
        'mean_signal': np.mean(values),
        'std_signal': np.std(values),
        'num_beads': len(values),
        'pval': pval,
        'source': 't-test'
    })

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests
import seaborn as sns
import matplotlib.pyplot as plt

# Per-molecule replicate table (one row per BB1/BB2/BB3 combination).
rep_counts = df.groupby(['BB1', 'BB2', 'BB3']).size().reset_index(name='count')

# Attach the replicate count to every bead. transform('size') writes the column in
# place, so re-running the cell simply overwrites 'count'. The previous
# `df = df.merge(rep_counts, ...)` produced 'count_x'/'count_y' on a second run and
# then failed on df['count'].
df['count'] = df.groupby(['BB1', 'BB2', 'BB3'])['BB1'].transform('size')

# Keep only molecules with >= 5 beads.
df_filtered = df[df['count'] >= 5].copy()

# Baseline for the Wilcoxon test: median final intensity of the filtered beads.
threshold = df_filtered['final_intensity'].median()
print(f"Threshold (median final intensity): {threshold:.3f}")

def wilcoxon_molecule(group):
    """One-sided Wilcoxon signed-rank test of a molecule's beads vs. ``threshold``.

    ``threshold`` is read from module scope. The test is skipped (p-value NaN)
    when fewer than five differences from the threshold are non-zero, or when
    scipy raises ValueError.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one molecule; must contain 'final_intensity'.

    Returns
    -------
    pandas.Series
        'mean_signal', 'std_signal' (numpy default, ddof=0), 'num_beads',
        'pval', 'source'.
    """
    values = group['final_intensity'].values
    differences = values - threshold

    def _summary(pval):
        # Same record layout for tested and untested molecules.
        return pd.Series({
            'mean_signal': np.mean(values),
            'std_signal': np.std(values),
            'num_beads': len(values),
            'pval': pval,
            'source': 'wilcoxon'
        })

    informative = differences[differences != 0]
    if len(informative) < 5:
        return _summary(np.nan)

    try:
        _, pval = wilcoxon(differences, alternative='greater')
    except ValueError:
        pval = np.nan
    return _summary(pval)

# One summary row (stats + raw p-value) per molecule.
per_molecule = df_filtered.groupby(['BB1', 'BB2', 'BB3']).apply(wilcoxon_molecule)
summary = per_molecule.reset_index()

# Keep only molecules that received a p-value, then apply Benjamini-Hochberg.
has_pval = summary['pval'].notna()
ssummary = summary[has_pval].copy()
_, fdr_values, _, _ = multipletests(ssummary['pval'], method='fdr_bh')
ssummary['fdr'] = fdr_values
ssummary['is_hit'] = ssummary['fdr'] < 0.24

print(f"Number of molecules with ≥5 beads: {len(ssummary):,}")
print(f"Number of hits (FDR < 0.24): {ssummary['is_hit'].sum():,}")

# Strongest 100 hits by mean signal.
hit_rows = ssummary[ssummary['is_hit']]
top_hits_wilcoxon_hits = hit_rows.sort_values(by='mean_signal', ascending=False).head(100)
print("\nTop 100 hits:")
report_columns = ['BB1', 'BB2', 'BB3', 'mean_signal', 'std_signal', 'num_beads', 'fdr']
print(top_hits_wilcoxon_hits[report_columns])

top_hits_wilcoxon_hits.to_csv("top_hits_wilcoxon.csv", index=False)


In [None]:
import matplotlib.pyplot as plt

# Raw p-value distribution across tested molecules.
plt.hist(ssummary['pval'].dropna(), bins=50)
plt.title("Distribution of raw p-values")
plt.xlabel("p-value")
plt.ylabel("Number of molecules")
plt.show()

# `is_hit` stays tied to the 0.24 cutoff used for the exported hit table. The 0.23
# count is reported for comparison only; previously it overwrote `is_hit`, leaving
# the column inconsistent with the exported hits and the "FDR < 0.24" plot titles.
ssummary['is_hit'] = ssummary['fdr'] < 0.24
print(f"FDR < 0.24 hits: {(ssummary['fdr'] < 0.24).sum()}")
print(f"FDR < 0.23 hits: {(ssummary['fdr'] < 0.23).sum()}")


In [None]:
# top_hits = ssummary[ssummary['is_hit']].sort_values(by='mean_signal', ascending=False).head(10)
# top_hits['molecule'] = top_hits['BB1'] + '_' + top_hits['BB2'] + '_' + top_hits['BB3']
# plt.figure(figsize=(10, 6))
# sns.barplot(data=top_hits, x='mean_signal', y='molecule', palette='rocket')

# plt.xlabel('Mean Final Intensity')
# plt.ylabel('Top Molecules (BB1_BB2_BB3)')
# plt.title('Top 10 Hit Molecules by Mean Intensity (Wilcoxon FDR < 0.23)')
# plt.tight_layout()
# plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Molecule identifier for the Wilcoxon hit table.
top_hits_wilcoxon_hits['molecule'] = (
    top_hits_wilcoxon_hits['BB1'] + '_' +
    top_hits_wilcoxon_hits['BB2'] + '_' +
    top_hits_wilcoxon_hits['BB3']
)

# Total replicate beads per hit molecule ('num_beads' is the bead count of the
# molecule, not a count of individually called beads).
hit_counts = top_hits_wilcoxon_hits.groupby('molecule')['num_beads'].sum().reset_index()

# The 20 hit molecules with the most beads.
top_hits = hit_counts.sort_values(by='num_beads', ascending=False).head(20)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_hits, x='num_beads', y='molecule', palette='rocket')

# Labels corrected to match what is plotted: 20 molecules (not 10), ranked by
# replicate bead count.
plt.xlabel('Number of Beads')
plt.ylabel('Top Molecules (BB1_BB2_BB3)')
plt.title('Top 20 Hit Molecules by Bead Count (Wilcoxon FDR < 0.24)')
plt.tight_layout()
plt.show()


## Second Method: Log Fold Change To Identify Hits

Since the above results were not that satisfying, I decided to try another method. Because the intensity distribution is strongly right-skewed, I used the 95th percentile of final intensity as the control value (the denominator in the fold-change calculation) to provide a more conservative threshold. This assumes that most molecules are not hits. I again restricted the dataframe to molecules with 5 or more replicates.

I selected a log2 fold change cutoff of 4, corresponding to a 16-fold increase in intensity relative to the 95th percentile baseline, to conservatively define hits with strong enrichment. Given the right-skewed distribution of intensity values, lower thresholds such as log2 fold change > 2 (4-fold increase) resulted in a large number of hits, many of which likely reflected background variation or weak binding. By choosing a higher cutoff, I aimed to prioritize molecules with substantially elevated signal, thereby improving specificity and reducing the risk of false positives. This threshold was chosen heuristically based on observed data distribution. However, further refinement of this cutoff is needed.

In [None]:
# #only molecules that appeared more than 5 times
# print(df.head())
# rep_counts = df.groupby(['BB1', 'BB2', 'BB3']).size().reset_index(name='count')
# df = df.merge(rep_counts, on=['BB1', 'BB2', 'BB3'], how='left')

# #Filter for molecules with ≥ 5 beads ---
# df_filtered = df[df['count'] >= 5].copy()
print(df_filtered.head())

# Control value: 95th percentile of final intensity among replicate-filtered beads.
# NOTE(review): if this percentile is 0 (possible when most corrected intensities are
# clipped to zero) every positive bead gets an infinite fold change — verify on real data.
baseline = df_filtered['final_intensity'].quantile(0.95)
print(f"95th percentile baseline: {baseline}")
df_filtered['fold_change'] = df_filtered['final_intensity'] / baseline

# final_intensity is clipped at zero upstream, so fold_change is often exactly 0 and
# log2(0) would be -inf (with a RuntimeWarning), which also breaks the histogram of
# this column. Non-positive fold changes become NaN instead; NaN > 4 is False, so
# hit calls are unchanged.
positive_fold_change = df_filtered['fold_change'].where(df_filtered['fold_change'] > 0)
df_filtered['log2_fold_change'] = np.log2(positive_fold_change)


df_filtered['is_hit'] = df_filtered['log2_fold_change'] > 4  # FC > 16


top_hits_logfold_change = df_filtered[df_filtered['is_hit']].sort_values(by='log2_fold_change', ascending=False)
print(top_hits_logfold_change[['BB1', 'BB2', 'BB3', 'final_intensity', 'log2_fold_change']].head(10))
print(f"Number of hits (log2FC > 4, with ≥ 5 replicates): {len(top_hits_logfold_change):,}")
top_hits_logfold_change.to_csv("top_hits_logfold_change_filtered.csv", index=False)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Molecule identifier for every bead called as a log2FC hit.
top_hits_logfold_change['molecule'] = (
    top_hits_logfold_change['BB1'] + '_' +
    top_hits_logfold_change['BB2'] + '_' +
    top_hits_logfold_change['BB3']
)

# Number of hit beads per molecule; plot the 20 molecules with the most.
hit_counts = top_hits_logfold_change.groupby('molecule').size().reset_index(name='num_hits')
top_molecules = hit_counts.sort_values('num_hits', ascending=False).head(20)

plt.figure(figsize=(10, 5))
sns.barplot(data=top_molecules, x='num_hits', y='molecule', orient='h')
plt.xlabel('Number of Beads Called as Hits')
plt.ylabel('Top Molecules')
plt.title('Top Hit Molecules by Bead Count (Log2 Fold Change > 4, ≥5 Replicates)')
plt.tight_layout()
plt.show()

unique_molecules = df_filtered.groupby(['BB1', 'BB2', 'BB3']).ngroups
print(f"Total unique molecules (≥ 5 replicates): {unique_molecules:,}")

# Number of unique hit molecules. Stored under the name the print statement uses:
# previously the value was assigned to `hit_counts` (clobbering the table above) and
# the print raised NameError on `unique_hit_molecules`.
unique_hit_molecules = top_hits_logfold_change.groupby(['BB1', 'BB2', 'BB3']).ngroups
print(f"Unique hit molecules: {unique_hit_molecules:,}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Molecule identifier on the replicate-filtered frame.
df_filtered['molecule'] = (
    df_filtered['BB1'] + '_' + df_filtered['BB2'] + '_' + df_filtered['BB3']
)

# Per molecule: total beads, beads called as hits, and the remainder.
per_molecule_groups = df_filtered.groupby('molecule')
molecule_counts = per_molecule_groups.agg(
    total=('final_intensity', 'count'),
    hits=('is_hit', 'sum')
).reset_index()
molecule_counts['non_hits'] = molecule_counts['total'] - molecule_counts['hits']

# The 20 molecules with the most hit beads.
top_molecules = molecule_counts.sort_values(by='hits', ascending=False).head(20)

# Stacked horizontal bars: non-hits first, hits stacked to their right.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_molecules['molecule'], top_molecules['non_hits'], label='Non-Hits', color='lightgray')
ax.barh(top_molecules['molecule'], top_molecules['hits'], left=top_molecules['non_hits'], label='Hits', color='steelblue')
ax.set_xlabel("Total Beads per Molecule")
ax.set_ylabel("Top Molecules")
ax.set_title("Hits vs. Non-Hits per Molecule (Top 20)")
ax.legend()
fig.tight_layout()
fig.savefig("top20_hits_vs_nonhits_per_molecule.png", dpi=300)

plt.show()


In [None]:
# Distribution of per-bead log2 fold change with the hit cutoff marked.
sns.histplot(df_filtered['log2_fold_change'], bins=50, kde=True)
plt.axvline(4, color='red', linestyle='--', label='Hit Threshold (log2FC > 4)')
plt.title("Distribution of Log2 Fold Change")
plt.legend()  # the axvline label is only drawn when a legend is requested
plt.show()


In [None]:
# Distribution of per-bead log2 fold change with the hit cutoff marked.
sns.histplot(df_filtered['log2_fold_change'], bins=50, kde=True)
plt.axvline(4, color='red', linestyle='--', label='Hit Threshold (log2FC > 4)')
plt.title("Distribution of Log2 Fold Change")
plt.legend()  # the axvline label is only drawn when a legend is requested
plt.show()


In [None]:
# Hit beads per BB3 label, largest first; plot the top 20.
hit_beads = df_filtered[df_filtered['is_hit']]
hits_per_bb3 = hit_beads.groupby('BB3').size()
top_bb3 = hits_per_bb3.sort_values(ascending=False).head(20)
top_bb3.plot(kind='barh')
plt.title("Top BB3s Among Hits")


In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# NOTE(review): this placeholder cell re-seeds NumPy and rebuilds `df` from scratch,
# so every column that earlier cells added to `df` is gone from this point on.
np.random.seed(0)
df = pd.DataFrame({
    'BB1': np.random.choice(['A', 'B', 'C', 'D'], 100),
    'BB2': np.random.choice(['E', 'F', 'G', 'H'], 100),
    'BB3': np.random.choice(['I', 'J', 'K', 'L'], 100),
    'fluorescence': np.random.normal(1000, 200, 100)
})
df.head()

In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# NOTE(review): this placeholder cell re-seeds NumPy and rebuilds `df` from scratch,
# so every column that earlier cells added to `df` is gone from this point on.
np.random.seed(0)
df = pd.DataFrame({
    'BB1': np.random.choice(['A', 'B', 'C', 'D'], 100),
    'BB2': np.random.choice(['E', 'F', 'G', 'H'], 100),
    'BB3': np.random.choice(['I', 'J', 'K', 'L'], 100),
    'fluorescence': np.random.normal(1000, 200, 100)
})
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# How often each BB3 label occurs among the Wilcoxon hit molecules.
bb3_hit_counts = top_hits_wilcoxon_hits['BB3'].value_counts().reset_index()
bb3_hit_counts = bb3_hit_counts.set_axis(['BB3', 'num_hits'], axis=1)

# value_counts is already sorted, so the first 20 rows are the most frequent.
top_bb3s = bb3_hit_counts.iloc[:20]

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=top_bb3s, x='num_hits', y='BB3', orient='h', ax=ax)
ax.set_xlabel('Number of Hits')
ax.set_ylabel('BB3')
ax.set_title('Top BB3s Among Wilcoxon-Identified Hits')
fig.tight_layout()
plt.show()


# Prominent BB3's for method 1 wilcoxon statistical test

$$\text{Enrichment}(\text{BB3}) = \frac{\text{proportion of hits containing that BB3}}{\text{proportion of all molecules containing that BB3}}$$


In [None]:
# Prominent building blocks for method 1 (Wilcoxon test).
# Enrichment = frequency among hits / frequency in the background.

all_bb3_counts = df_filtered['BB3'].value_counts(normalize=True)
hit_bb3_freqs = top_hits_wilcoxon_hits['BB3'].value_counts(normalize=True)

bb3_enrichment = (hit_bb3_freqs / all_bb3_counts).dropna().sort_values(ascending=False)
print(bb3_enrichment.head(10))


all_bb1_counts = df['BB1'].value_counts(normalize=True)
# normalize=True was missing here, so the BB1 ratio divided raw counts by
# frequencies and was not comparable with the BB3 enrichment above.
hit_bb1_freqs = top_hits_wilcoxon_hits['BB1'].value_counts(normalize=True)

bb1_enrichment = (hit_bb1_freqs / all_bb1_counts).dropna().sort_values(ascending=False)
print(bb1_enrichment.head(10))


In [None]:
print(len(hit_bb3_freqs))  # How many distinct BB3s were found in hits
print(len(all_bb3_counts))  # How many distinct BB3s occur in the background frame

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# bb3_enrichment is sorted descending, so the first 20 entries are the most enriched.
top_bb3_enrichment = bb3_enrichment.head(20)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_bb3_enrichment.values,
    y=top_bb3_enrichment.index,
    palette='viridis',
    ax=ax
)

ax.set_xlabel('Enrichment (Hit Freq / Background Freq)')
ax.set_ylabel('BB3 Component')
ax.set_title('Top 20 Enriched BB3 Components (wilcoxon)')
fig.tight_layout()
plt.show()


In [None]:
# Show the three BB3 labels with the highest enrichment ratio.
print(top_bb3_enrichment.head(3))


## Discussion

Only three different BB3s appeared among the top hits identified by the Wilcoxon test. This is not what I expected. The enrichment values themselves are also very small, so I don't think this result is reliable.

# Prominent BB3's for method 2 log fold change 


In [None]:

# BB3: frequency among log2FC hit beads relative to frequency in the filtered frame.
all_bb3_counts = df_filtered['BB3'].value_counts(normalize=True)
hit_bb3_freqs = top_hits_logfold_change['BB3'].value_counts(normalize=True)

bb3_ratio = hit_bb3_freqs.div(all_bb3_counts)
bb3_enrichment = bb3_ratio.dropna().sort_values(ascending=False)
print(bb3_enrichment.head(10))


# BB1: same ratio, with the full `df` as background.
all_bb1_counts = df['BB1'].value_counts(normalize=True)
hit_bb1_freqs = top_hits_logfold_change['BB1'].value_counts(normalize=True)

bb1_ratio = hit_bb1_freqs.div(all_bb1_counts)
bb1_enrichment = bb1_ratio.dropna().sort_values(ascending=False)
print(bb1_enrichment.head(10))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep the 20 most enriched BB3s (bb3_enrichment is already sorted descending).
top_bb3_enrichment = bb3_enrichment.head(20)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_bb3_enrichment.values,
    y=top_bb3_enrichment.index,
    palette='viridis',
    ax=ax
)

ax.set_xlabel('Enrichment (Hit Freq / Background Freq)')
ax.set_ylabel('BB3 Component')
ax.set_title('Top Enriched BB3 Components (fold change)')
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# BB3 label of every bead in the replicate-filtered frame.
all_bb3s = df_filtered['BB3'].astype(str)

# BB3 label of every bead called as a log2FC hit.
hit_bb3s = top_hits_logfold_change['BB3'].astype(str)

# Background = beads that were NOT called as hits, selected by row index.
# The previous filter dropped every bead whose BB3 *label* occurred in any hit, so a
# hit BB3 never had a background frequency and always fell back to the 1e-6
# denominator; the score then only ranked BB3s by hit frequency.
background_bb3s = all_bb3s[~all_bb3s.index.isin(hit_bb3s.index)]

# Frequency (proportion) of each BB3 within hits and within background.
hit_freqs = hit_bb3s.value_counts(normalize=True)
bg_freqs = background_bb3s.value_counts(normalize=True)

# Union of BB3s seen in either group.
bb3_all = set(hit_freqs.index) | set(bg_freqs.index)

# log2 ratio of the two frequencies; the same pseudocount is added to numerator and
# denominator so BB3s missing from one group stay finite.
import numpy as np
PSEUDOCOUNT = 1e-6
enrichment = {
    bb3: np.log2((hit_freqs.get(bb3, 0) + PSEUDOCOUNT) / (bg_freqs.get(bb3, 0) + PSEUDOCOUNT))
    for bb3 in bb3_all
}

# Top 20 by log2 enrichment. Labels say "log2" because that is what is plotted.
enrichment_df = pd.Series(enrichment, name="Log2 Fold Enrichment (Normalized)").sort_values(ascending=False).head(20)

plt.figure(figsize=(10, 6))
sns.barplot(x=enrichment_df.values, y=enrichment_df.index, palette='viridis')
plt.xlabel("Log2 Fold Enrichment (Hit vs Background, normalized)")
plt.ylabel("BB3")
plt.title("Top BB3s Enriched in Log2FC Hits vs Background")
plt.tight_layout()
plt.show()


# Discussion
This analysis also identified the same top three BB3s as the Wilcoxon test: C_80, C_95, and C_194! The difference is that the enrichment x-axis has a more realistic scale. This result is more in line with what I expected.

## BB3 is the most important component in binding.
To determine which building block position contributed most strongly to binding activity, I analyzed the enrichment of each component in BB1, BB2, and BB3 across all molecules. For each position, I computed how frequently each building block appeared among hits compared to its overall frequency in the dataset. Enrichment was defined as the ratio of hit frequency to background frequency, where a value greater than 1 indicates that a component is overrepresented among hits. I then visualized the distributions of these enrichment scores using a violin plot on a log scale. The results revealed that BB1 had a narrow distribution centered around 1, indicating little to no contribution to binding specificity. BB2 showed a slightly broader spread, suggesting a moderate role in binding. In contrast, BB3 displayed a highly skewed distribution with several components showing extreme enrichment, with scores exceeding 100. This indicates that BB3 components are disproportionately responsible for driving binding activity, making BB3 the most informative position.

In [None]:
def calc_enrichment(df, bb_col):
    """Return hit-vs-overall frequency ratios for one building-block column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bb_col`` and a boolean 'is_hit' column.
    bb_col : str
        Name of the building-block column to analyse.

    Returns
    -------
    pandas.Series
        For every label in ``bb_col``: (frequency among rows with is_hit) /
        (frequency among all rows), sorted descending. Labels that never occur
        among hits get 0.
    """
    background = df[bb_col].value_counts(normalize=True)
    among_hits = df.loc[df['is_hit'], bb_col].value_counts(normalize=True)
    # Division aligns on label; labels absent from the hits come out NaN and become 0.
    ratio = among_hits.div(background)
    ratio = ratio.fillna(0)
    return ratio.sort_values(ascending=False)

# calc_enrichment needs an 'is_hit' column. That column is created on df_filtered
# (log2 fold-change hit calls), not on df, so df_filtered is the frame to pass;
# calling it with df raised KeyError.
bb1_enrich = calc_enrichment(df_filtered, 'BB1')
bb2_enrich = calc_enrichment(df_filtered, 'BB2')
bb3_enrich = calc_enrichment(df_filtered, 'BB3')


In [None]:
import seaborn as sns
import pandas as pd

# Combine the three per-position enrichment Series into long form
# (columns 'Position' and 'Enrichment') for plotting.
enrich_df = pd.DataFrame({
    'BB1': bb1_enrich,
    'BB2': bb2_enrich,
    'BB3': bb3_enrich
}).melt(var_name='Position', value_name='Enrichment')

# Violin plot of enrichment per position, with a reference line at 1 (no enrichment).
plt.figure(figsize=(8, 6))
sns.violinplot(data=enrich_df, x='Position', y='Enrichment', inner='quartile', cut=0)
plt.axhline(1, color='red', linestyle='--', label='No Enrichment')
# NOTE(review): calc_enrichment fills labels without hits with 0, and 0 cannot be
# drawn on a log axis — confirm how those entries should be handled.
plt.yscale('log')  
plt.title('Enrichment Distribution per BB Position')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# NOTE(review): `top_wilcoxon_hits` is not assigned in any cell visible above (the
# Wilcoxon hit table there is named `top_hits_wilcoxon_hits`) — confirm where it is defined.
print(top_wilcoxon_hits.head())

In [None]:
from matplotlib_venn import venn3

# Three-way Venn diagram of the molecule sets called by each method; saved as PNG.
# NOTE(review): log2fc_hits, wilcoxon_hits and xgboost_hits are not defined in any
# cell visible before this point — presumably sets of molecule strings; confirm.
plt.figure(figsize=(8, 6))
venn3(
    [log2fc_hits, wilcoxon_hits, xgboost_hits],
    set_labels=("Log2FC", "Wilcoxon", "XGBoost")
)
plt.title("Overlap of Top 100 Molecules by Method")
plt.tight_layout()
plt.savefig("venn_diagram.png", dpi=300)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Molecules shared with XGBoost by exactly one of the other two methods.
log2fc_xgboost = log2fc_hits.intersection(xgboost_hits).difference(wilcoxon_hits)
wilcoxon_xgboost = wilcoxon_hits.intersection(xgboost_hits).difference(log2fc_hits)

# Rows of each hit table that belong to those overlaps.
in_log2fc_overlap = top_hits_logfold_change['molecule'].isin(log2fc_xgboost)
log2fc_xgboost_df = top_hits_logfold_change[in_log2fc_overlap]

in_wilcoxon_overlap = top_wilcoxon_hits['molecule'].isin(wilcoxon_xgboost)
wilcoxon_xgboost_df = top_wilcoxon_hits[in_wilcoxon_overlap]

# One count plot of BB3 labels per overlap, side by side on a shared y axis.
panels = [
    ("BB3 Composition: Log2FC ∩ XGBoost", log2fc_xgboost_df),
    ("BB3 Composition: Wilcoxon ∩ XGBoost", wilcoxon_xgboost_df),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)
for panel_ax, (panel_title, panel_df) in zip(axes, panels):
    sns.countplot(
        y=panel_df['BB3'],
        order=panel_df['BB3'].value_counts().index,
        ax=panel_ax
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Count")

plt.tight_layout()
plt.savefig("bb3_composition_log2fc_vs_wilcoxon_xgboost.png", dpi=300)
plt.show()


In [None]:
# Pairwise overlaps with XGBoost (molecules also found by the third method are kept).
log2fc_xgboost_all = log2fc_hits & xgboost_hits
wilcoxon_xgboost_all = wilcoxon_hits & xgboost_hits
# NOTE(review): log2fc_xgboost_only is computed but not used anywhere in this cell.
log2fc_xgboost_only = (log2fc_hits & xgboost_hits) - wilcoxon_hits
# Extract BB3s
log2fc_xgboost_df = top_hits_logfold_change[top_hits_logfold_change['molecule'].isin(log2fc_xgboost_all)]
wilcoxon_xgboost_df = top_wilcoxon_hits[top_wilcoxon_hits['molecule'].isin(wilcoxon_xgboost_all)]

# Count BB3s in both sets
bb3_counts_log2fc = log2fc_xgboost_df['BB3'].value_counts()
bb3_counts_wilcoxon = wilcoxon_xgboost_df['BB3'].value_counts()

# Combine into single dataframe (BB3 labels missing from one overlap get 0)
bb3_df = pd.DataFrame({
    'Log2FC ∩ XGBoost': bb3_counts_log2fc,
    'Wilcoxon ∩ XGBoost': bb3_counts_wilcoxon
}).fillna(0)

# Convert to integers
bb3_df = bb3_df.astype(int)

# Sort by total
bb3_df = bb3_df.loc[bb3_df.sum(axis=1).sort_values(ascending=False).index]

# Plot stacked bar
bb3_df.plot(kind='barh', stacked=True, figsize=(10, 6), colormap='Set2')
plt.xlabel("BB3 Count")
plt.ylabel("BB3")
plt.title("BB3 Composition in Molecules Found by Log2FC/XGBoost and Wilcoxon/XGBoost")
plt.tight_layout()
plt.savefig("bb3_composition_stacked.png", dpi=300)
plt.show()


In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# NOTE(review): this placeholder cell re-seeds NumPy and rebuilds `df` from scratch,
# so every column that earlier cells added to `df` is gone from this point on.
np.random.seed(0)
df = pd.DataFrame({
    'BB1': np.random.choice(['A', 'B', 'C', 'D'], 100),
    'BB2': np.random.choice(['E', 'F', 'G', 'H'], 100),
    'BB3': np.random.choice(['I', 'J', 'K', 'L'], 100),
    'fluorescence': np.random.normal(1000, 200, 100)
})
df.head()

## extracts hits

In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# NOTE(review): this placeholder cell re-seeds NumPy and rebuilds `df` from scratch,
# so every column that earlier cells added to `df` is gone from this point on.
np.random.seed(0)
df = pd.DataFrame({
    'BB1': np.random.choice(['A', 'B', 'C', 'D'], 100),
    'BB2': np.random.choice(['E', 'F', 'G', 'H'], 100),
    'BB3': np.random.choice(['I', 'J', 'K', 'L'], 100),
    'fluorescence': np.random.normal(1000, 200, 100)
})
df.head()

In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# NOTE(review): this placeholder cell re-seeds NumPy and rebuilds `df` from scratch,
# so every column that earlier cells added to `df` is gone from this point on.
np.random.seed(0)
df = pd.DataFrame({
    'BB1': np.random.choice(['A', 'B', 'C', 'D'], 100),
    'BB2': np.random.choice(['E', 'F', 'G', 'H'], 100),
    'BB3': np.random.choice(['I', 'J', 'K', 'L'], 100),
    'fluorescence': np.random.normal(1000, 200, 100)
})
df.head()

In [None]:
import plotly.express as px
import pandas as pd

# Partition the hit molecules by which methods called them.
# Parentheses spell out how Python groups the operators ("-" binds tighter than "&").
log2fc_only = (log2fc_hits - wilcoxon_hits) - xgboost_hits
wilcoxon_only = (wilcoxon_hits - log2fc_hits) - xgboost_hits
xgboost_only = (xgboost_hits - log2fc_hits) - wilcoxon_hits
log2fc_wilcoxon = log2fc_hits & (wilcoxon_hits - xgboost_hits)
log2fc_xgboost = log2fc_hits & (xgboost_hits - wilcoxon_hits)
wilcoxon_xgboost = wilcoxon_hits & (xgboost_hits - log2fc_hits)
all_three = (log2fc_hits & wilcoxon_hits) & xgboost_hits

# Report the size of every category.
category_report = [
    ("Log2FC only:", log2fc_only),
    ("Wilcoxon only:", wilcoxon_only),
    ("XGBoost only:", xgboost_only),
    ("Log2FC + Wilcoxon:", log2fc_wilcoxon),
    ("Log2FC + XGBoost:", log2fc_xgboost),
    ("Wilcoxon + XGBoost:", wilcoxon_xgboost),
    ("All three:", all_three),
]
for category_label, category_members in category_report:
    print(category_label, len(category_members))


In [None]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Render plotly figures with the "notebook" renderer.
pio.renderers.default = "notebook"

# Category label -> number of molecules in that overlap category.
overlap_sizes = {
    "Log2FC only": len(log2fc_only),
    "Wilcoxon only": len(wilcoxon_only),
    "XGBoost only": len(xgboost_only),
    "Log2FC + Wilcoxon": len(log2fc_wilcoxon),
    "Log2FC + XGBoost": len(log2fc_xgboost),
    "Wilcoxon + XGBoost": len(wilcoxon_xgboost),
    "All three": len(all_three),
}

data = pd.DataFrame({
    "category": list(overlap_sizes.keys()),
    "count": list(overlap_sizes.values())
})

fig = px.bar(data, x="category", y="count", title="Molecule Overlap by Method")
fig.show(renderer="notebook")


In [None]:
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import pandas as pd

# Category label -> list of molecules in that overlap category.
group_data = dict([
    ("Log2FC only", list(log2fc_only)),
    ("Wilcoxon only", list(wilcoxon_only)),
    ("XGBoost only", list(xgboost_only)),
    ("Log2FC + Wilcoxon", list(log2fc_wilcoxon)),
    ("Log2FC + XGBoost", list(log2fc_xgboost)),
    ("Wilcoxon + XGBoost", list(wilcoxon_xgboost)),
    ("All three", list(all_three)),
])

# One row per category: its label, its size, and the molecule list itself.
data = pd.DataFrame([
    {"category": category, "count": len(members), "molecules": members}
    for category, members in group_data.items()
])

# Dash application: a heading, the overlap bar chart, and a text area that the
# click callback fills with the molecules of the clicked category.
app = Dash(__name__)

app.layout = html.Div([
    html.H3("Molecule Overlap Explorer"),
    dcc.Graph(
        id='barplot',
        figure=px.bar(data, x="category", y="count", title="Overlap by Method")
    ),
    # "pre-wrap" keeps the newline-separated molecule list on separate lines.
    html.Pre(id='molecule-list', style={"whiteSpace": "pre-wrap"})
])

@app.callback(
    Output("molecule-list", "children"),
    Input("barplot", "clickData")
)
def display_molecules(clickData):
    """Return the text shown under the bar chart for the clicked category.

    Parameters
    ----------
    clickData : dict or None
        Click payload of the 'barplot' graph; the category label is read from
        ``clickData["points"][0]["x"]``. Falsy before any bar has been clicked.

    Returns
    -------
    str
        A prompt when nothing was clicked, otherwise a header line followed by
        the molecules of the clicked category, one per line.
    """
    if not clickData:
        return "Click a bar to see the list of molecules."

    cat = clickData["points"][0]["x"]
    matching_rows = data.loc[data["category"] == cat, "molecules"]
    mols = matching_rows.values[0]
    return f"Molecules in '{cat}':\n" + "\n".join(mols)

# Start the Dash server on port 8050.
# NOTE(review): jupyter_mode="external" presumably serves the app at a URL to open in
# a separate browser tab rather than embedding it in the cell output — confirm in the Dash docs.
app.run(jupyter_mode="external", port=8050)


In [None]:
# Assume the log2fc_hits, wilcoxon_hits, and xgboost_hits sets are already defined and contain molecule strings
# such as 'CP_981_CP_304_CP_80'

# Helper function to extract BB3 (last component in the molecule string)
def extract_bb3(molecule_set):
    """Return the set of final underscore-separated tokens of the given molecule strings.

    Parameters
    ----------
    molecule_set : iterable of str
        Molecule identifiers.

    Returns
    -------
    set of str
        The text after the last underscore of each identifier.

    Notes
    -----
    NOTE(review): for an identifier such as 'CP_981_CP_304_CP_80' this yields
    '80', not 'CP_80' — confirm that is the intended BB3 label.
    """
    last_tokens = set()
    for molecule in molecule_set:
        parts = molecule.split("_")
        last_tokens.add(parts[-1])
    return last_tokens

# Compute intersections for molecules
# Each molecule lands in exactly one category: called by a single method, by exactly
# two methods (the third set is subtracted), or by all three.
log2fc_only = log2fc_hits - wilcoxon_hits - xgboost_hits
wilcoxon_only = wilcoxon_hits - log2fc_hits - xgboost_hits
xgboost_only = xgboost_hits - log2fc_hits - wilcoxon_hits
log2fc_wilcoxon = (log2fc_hits & wilcoxon_hits) - xgboost_hits
log2fc_xgboost = (log2fc_hits & xgboost_hits) - wilcoxon_hits
wilcoxon_xgboost = (wilcoxon_hits & xgboost_hits) - log2fc_hits
all_three = log2fc_hits & wilcoxon_hits & xgboost_hits
# Show the molecules called by every method.
print(all_three)
# Now extract BB3s for each group
# bb3_groups = {
#     "Log2FC only": extract_bb3(log2fc_only),
#     "Wilcoxon only": extract_bb3(wilcoxon_only),
#     "XGBoost only": extract_bb3(xgboost_only),
#     "Log2FC + Wilcoxon": extract_bb3(log2fc_wilcoxon),
#     "Log2FC + XGBoost": extract_bb3(log2fc_xgboost),
#     "Wilcoxon + XGBoost": extract_bb3(wilcoxon_xgboost),
#     "All three": extract_bb3(all_three)
# }

# # Convert to DataFrame for visualization or inspection
# bb3_df = pd.DataFrame({
#     "category": list(bb3_groups.keys()),
#     "num_bb3s": [len(bb3s) for bb3s in bb3_groups.values()],
#     "bb3s": [sorted(bb3s) for bb3s in bb3_groups.values()]
# })


#print(bb3_df.head())

In [None]:
# defaultdict and Counter are both used in this cell; the import was commented out,
# which made the first defaultdict(...) call raise NameError.
from collections import Counter, defaultdict
import pandas as pd

# --- Build mapping: molecule → BB3 (text after the last underscore)
mol_to_bb3 = {}
for mol in log2fc_hits | wilcoxon_hits | xgboost_hits:
    bb3 = mol.split("_")[-1]
    mol_to_bb3[mol] = bb3

# --- Define method membership per molecule: molecule → set of methods that called it
mol_to_methods = defaultdict(set)
for mol in log2fc_hits:
    mol_to_methods[mol].add("Log2FC")
for mol in wilcoxon_hits:
    mol_to_methods[mol].add("Wilcoxon")
for mol in xgboost_hits:
    mol_to_methods[mol].add("XGBoost")
# --- Normalize method combinations into group labels
def normalize_label(methods_set):
    """Translate a set of method names into its display group label.

    Args:
        methods_set: Set drawn from ``"Log2FC"``, ``"Wilcoxon"`` and
            ``"XGBoost"``.

    Returns:
        The label of the matching combination, or ``"Unknown"`` when
        ``methods_set`` equals none of the seven known combinations.
    """
    known_combinations = (
        ({"Log2FC"}, "Log2FC only"),
        ({"Wilcoxon"}, "Wilcoxon only"),
        ({"XGBoost"}, "XGBoost only"),
        ({"Log2FC", "Wilcoxon"}, "Log2FC+Wilcoxon"),
        ({"Log2FC", "XGBoost"}, "Log2FC+XGBoost"),
        ({"Wilcoxon", "XGBoost"}, "Wilcoxon+XGBoost"),
        ({"Log2FC", "Wilcoxon", "XGBoost"}, "All three"),
    )
    # Equality (not hashing) is used so non-set arguments fall through to "Unknown".
    for members, group_label in known_combinations:
        if methods_set == members:
            return group_label
    return "Unknown"

# --- Collapse molecule-level calls to BB3 level
# A BB3 is credited with every method that called at least one molecule carrying it.
bb3_to_methods = defaultdict(set)
for mol, methods in mol_to_methods.items():
    bb3_to_methods[mol_to_bb3[mol]] |= methods

# --- Place each BB3 in the group named by its combined method set
# bb3_to_methods holds one entry per BB3, so every BB3 lands in exactly one
# group, exactly once; the counts below are therefore 0 or 1.
final_bb3_groups = defaultdict(list)
for bb3, methods_set in bb3_to_methods.items():
    final_bb3_groups[normalize_label(methods_set)].append(bb3)

# --- Define expected order
group_labels = [
    "Log2FC only", "Wilcoxon only", "XGBoost only",
    "Log2FC+Wilcoxon", "Log2FC+XGBoost", "Wilcoxon+XGBoost",
    "All three"
]

# --- Create DataFrame with BB3 counts per group
# Indexing the defaultdict also creates an empty list for any group with no BB3s.
bb3s_by_label = [final_bb3_groups[label] for label in group_labels]
bb3_df = pd.DataFrame({
    "category": group_labels,
    "unique_bb3s": [len(set(bb3s)) for bb3s in bb3s_by_label],
    "bb3_counts": [Counter(bb3s) for bb3s in bb3s_by_label],
})

# --- Build BB3 × Group frequency matrix (rows: sorted BB3s, columns: groups)
all_unique_bb3s = sorted({bb3 for bb3s in final_bb3_groups.values() for bb3 in bb3s})
df_counts = pd.DataFrame(index=all_unique_bb3s)

for group in group_labels:
    counter = Counter(final_bb3_groups[group])
    df_counts[group] = [counter[bb3] for bb3 in all_unique_bb3s]


In [None]:
# import pandas as pd
# from collections import Counter
# import plotly.graph_objs as go
# from dash import Dash, dcc, html

# # ---- Step 1: Define molecule sets ----
# # These should already be defined earlier in your notebook:
# # log2fc_hits, wilcoxon_hits, xgboost_hits
# # (each is a set of strings like "CP_999_CP_200_CP_80")

# # ---- Step 2: Create molecule group intersections ----
# log2fc_only = log2fc_hits - wilcoxon_hits - xgboost_hits
# wilcoxon_only = wilcoxon_hits - log2fc_hits - xgboost_hits
# xgboost_only = xgboost_hits - log2fc_hits - wilcoxon_hits
# log2fc_wilcoxon = (log2fc_hits & wilcoxon_hits) - xgboost_hits
# log2fc_xgboost = (log2fc_hits & xgboost_hits) - wilcoxon_hits
# wilcoxon_xgboost = (wilcoxon_hits & xgboost_hits) - log2fc_hits
# all_three = log2fc_hits & wilcoxon_hits & xgboost_hits

# # ---- Step 3: Extract BB3s from molecules in each group ----
# def extract_bb3s(molecule_set):
#     return [mol.split("_")[-1] for mol in molecule_set]

# final_bb3_groups = {
#     "Log2FC only": extract_bb3s(log2fc_only),
#     "Wilcoxon only": extract_bb3s(wilcoxon_only),
#     "XGBoost only": extract_bb3s(xgboost_only),
#     "Log2FC+Wilcoxon": extract_bb3s(log2fc_wilcoxon),
#     "Log2FC+XGBoost": extract_bb3s(log2fc_xgboost),
#     "Wilcoxon+XGBoost": extract_bb3s(wilcoxon_xgboost),
#     "All three": extract_bb3s(all_three)
# }

# # ---- Step 4: Count BB3 frequencies and prepare heatmap matrix ----
# all_unique_bb3s = sorted(set(bb3 for bb3s in final_bb3_groups.values() for bb3 in bb3s))
# df_counts = pd.DataFrame(index=all_unique_bb3s)

# for group, bb3s in final_bb3_groups.items():
#     counter = Counter(bb3s)
#     df_counts[group] = [counter.get(bb3, 0) for bb3 in all_unique_bb3s]

# # ---- Step 5: Create Dash App ----
# app = Dash(__name__)

# app.layout = html.Div([
#     html.H3("BB3 Frequency Heatmap"),
#     dcc.Graph(
#         id='heatmap',
#         figure=go.Figure(
#             data=go.Heatmap(
#                 z=df_counts.values,
#                 x=df_counts.columns,
#                 y=df_counts.index,
#                 colorscale='Viridis',
#                 colorbar=dict(title="Frequency"),
#                 hovertemplate="<b>BB3:</b> %{y}<br><b>Group:</b> %{x}<br><b>Count:</b> %{z}<extra></extra>"
#             )
#         ).update_layout(
#             xaxis_title="Group",
#             yaxis_title="BB3",
#             title="BB3 Frequencies Across Method Intersections",
#             hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial")
#         )
#     )
# ])

# app.run(jupyter_mode="external", port=8050)


In [None]:
# from dash import Dash, dcc, html, Input, Output
# import plotly.express as px
# import pandas as pd

# # Helper: extract BB3s from molecule strings
# # Helper: extract BB3s from molecule strings (correctly)
# def extract_bb3(molecule_list):
#     return sorted(set(["_".join(mol.split("_")[-1:]) for mol in molecule_list]))

# # Convert molecule groups to BB3 sets
# bb3_group_data = {
#     category: extract_bb3(mols)
#     for category, mols in group_data.items()
# }

# # Create DataFrame with category, count, and BB3s
# bb3_data = pd.DataFrame({
#     "category": list(bb3_group_data.keys()),
#     "count": [len(bb3s) for bb3s in bb3_group_data.values()],
#     "bb3s": list(bb3_group_data.values())
# })

# app = Dash(__name__)

# app.layout = html.Div([
#     html.H3("BB3 Component Overlap Explorer"),
#     dcc.Graph(
#         id='barplot',
#         figure=px.bar(bb3_data, x="category", y="count", title="Unique BB3s by Method Overlap")
#     ),
#     html.Pre(id='bb3-list', style={"whiteSpace": "pre-wrap"})
# ])

# @app.callback(
#     Output("bb3-list", "children"),
#     Input("barplot", "clickData")
# )
# def display_bb3s(clickData):
#     if clickData:
#         cat = clickData["points"][0]["x"]
#         bb3s = bb3_data[bb3_data["category"] == cat]["bb3s"].values[0]
#         return f"BB3s in '{cat}':\n" + "\n".join(bb3s)
#     return "Click a bar to see the list of BB3s."

# app.run(jupyter_mode="external", port=8050)
# from collections import Counter

# # Extract all BB3s (with duplicates) from molecules
# def extract_all_bb3s(molecule_set):
#     return [mol.split("_")[-1] for mol in molecule_set]

# # Keep full list (including repetitions) for each group
# bb3_groups = {
#     "Log2FC only": extract_all_bb3s(log2fc_only),
#     "Wilcoxon only": extract_all_bb3s(wilcoxon_only),
#     "XGBoost only": extract_all_bb3s(xgboost_only),
#     "Log2FC + Wilcoxon": extract_all_bb3s(log2fc_wilcoxon),
#     "Log2FC + XGBoost": extract_all_bb3s(log2fc_xgboost),
#     "Wilcoxon + XGBoost": extract_all_bb3s(wilcoxon_xgboost),
#     "All three": extract_all_bb3s(all_three)
# }

# # For optional summary stats
# bb3_df = pd.DataFrame({
#     "category": list(bb3_groups.keys()),
#     "num_bb3s": [len(bb3s) for bb3s in bb3_groups.values()],
#     "unique_bb3s": [len(set(bb3s)) for bb3s in bb3_groups.values()],
#     "bb3_counts": [Counter(bb3s) for bb3s in bb3_groups.values()]
# })

# BB3 overlap explorer.
# The bar chart is drawn from bb3_df, the same frame the click callback in this
# cell reads, so every clickable category resolves to a row there. bb3_df carries
# the per-group number of distinct BB3s in its "unique_bb3s" column.
app = Dash(__name__)

app.layout = html.Div([
    html.H3("BB3 Component Overlap Explorer"),
    dcc.Graph(
        id='barplot',
        figure=px.bar(
            bb3_df,
            x="category",
            y="unique_bb3s",
            title="Unique BB3s by Method Overlap",
            labels={"unique_bb3s": "count"},
        )
    ),
    # Filled in by the click callback with the BB3s of the clicked bar.
    html.Pre(id='bb3-list', style={"whiteSpace": "pre-wrap"})
])

@app.callback(
    Output("bb3-list", "children"),
    Input("barplot", "clickData"),
)
def display_bb3s(clickData):
    """Return the BB3 frequency listing for the most recently clicked bar.

    Args:
        clickData: The bar plot's ``clickData`` payload; falsy until a bar
            has been clicked.

    Returns:
        A prompt string when nothing has been clicked, otherwise a header
        naming the clicked category followed by one ``BB3: count`` line per
        entry of that category's counter in ``bb3_df``.
    """
    if not clickData:
        return "Click a bar to see the list of BB3s."

    category = clickData["points"][0]["x"]
    category_rows = bb3_df[bb3_df["category"] == category]
    counts_for_category = category_rows["bb3_counts"].values[0]
    lines = [f"{name}: {n}" for name, n in counts_for_category.items()]
    return f"BB3 frequencies in '{category}':\n" + "\n".join(lines)

# Serve the explorer on port 8050; "external" mode shows a link that opens the
# app in a separate browser tab rather than in the cell output.
app.run(jupyter_mode="external", port=8050)

In [None]:
from collections import defaultdict, Counter
import pandas as pd

# Step 1: Build BB3 → methods mapping (not molecule-based)
# A BB3 (text after the last underscore) is tagged with a method as soon as any
# molecule in that method's hit set carries it.
bb3_to_methods = defaultdict(set)

for method_name, hits in (
    ("Log2FC", log2fc_hits),
    ("Wilcoxon", wilcoxon_hits),
    ("XGBoost", xgboost_hits),
):
    for mol in hits:
        bb3_to_methods[mol.split("_")[-1]].add(method_name)

# Step 2: Normalize method sets to group labels
def normalize_label(methods_set):
    """Translate a set of method names into its display group label.

    Args:
        methods_set: Set drawn from ``"Log2FC"``, ``"Wilcoxon"`` and
            ``"XGBoost"``.

    Returns:
        The label of the matching combination, or ``"Unknown"`` when
        ``methods_set`` equals none of the seven known combinations.
    """
    known_combinations = (
        ({"Log2FC"}, "Log2FC only"),
        ({"Wilcoxon"}, "Wilcoxon only"),
        ({"XGBoost"}, "XGBoost only"),
        ({"Log2FC", "Wilcoxon"}, "Log2FC+Wilcoxon"),
        ({"Log2FC", "XGBoost"}, "Log2FC+XGBoost"),
        ({"Wilcoxon", "XGBoost"}, "Wilcoxon+XGBoost"),
        ({"Log2FC", "Wilcoxon", "XGBoost"}, "All three"),
    )
    # Equality (not hashing) is used so non-set arguments fall through to "Unknown".
    for members, group_label in known_combinations:
        if methods_set == members:
            return group_label
    return "Unknown"

# Step 3: Assign BB3s to their group
# bb3_to_methods has one entry per BB3, so each BB3 is appended to exactly one group.
final_bb3_groups = defaultdict(list)

for bb3, methods in bb3_to_methods.items():
    final_bb3_groups[normalize_label(methods)].append(bb3)

# Step 4: Count BB3 frequencies for each group
group_labels = [
    "Log2FC only", "Wilcoxon only", "XGBoost only",
    "Log2FC+Wilcoxon", "Log2FC+XGBoost", "Wilcoxon+XGBoost", "All three"
]

# Indexing the defaultdict also creates an empty list for any group with no BB3s.
bb3s_by_label = [final_bb3_groups[label] for label in group_labels]
bb3_df = pd.DataFrame({
    "category": group_labels,
    "unique_bb3s": [len(set(bb3s)) for bb3s in bb3s_by_label],
    "bb3_counts": [Counter(bb3s) for bb3s in bb3s_by_label],
})

# Step 5: Build heatmap matrix (rows: sorted BB3s, columns: groups)
all_unique_bb3s = sorted({bb3 for bb3s in final_bb3_groups.values() for bb3 in bb3s})
df_counts = pd.DataFrame(index=all_unique_bb3s, columns=group_labels).fillna(0)

# Every column is overwritten with the per-BB3 count for that group.
for group in group_labels:
    count = Counter(final_bb3_groups.get(group, []))
    df_counts[group] = [count[bb3] for bb3 in all_unique_bb3s]


In [None]:
from dash import Dash, dcc, html
import plotly.graph_objs as go

# Step: Track all molecules associated with each BB3
bb3_to_molecules = defaultdict(set)
for mol in log2fc_hits | wilcoxon_hits | xgboost_hits:
    bb3_to_molecules[mol.split("_")[-1]].add(mol)

# Step: Create hover text with molecule list
# hover_text mirrors df_counts: one inner list per BB3 row, one string per group
# column. The molecule list depends only on the BB3, so it is built once per row.
hover_text = []
for bb3 in df_counts.index:
    mols = sorted(bb3_to_molecules.get(bb3, []))
    mol_text = "<br>".join(mols) if mols else "No associated molecules"
    hover_text.append([
        f"<b>BB3:</b> {bb3}<br><b>Group:</b> {group}<br><b>Count:</b> {df_counts.loc[bb3, group]}<br><b>Molecules:</b><br>{mol_text}"
        for group in df_counts.columns
    ])

# Step: Build the heatmap figure, using the prepared hover_text as the tooltip
heatmap_trace = go.Heatmap(
    z=df_counts.values,
    x=df_counts.columns,
    y=df_counts.index,
    text=hover_text,
    hoverinfo="text",
    colorscale="Viridis",
    colorbar=dict(title="Frequency"),
)
heatmap_figure = go.Figure(data=heatmap_trace)
heatmap_figure.update_layout(
    xaxis_title="Group",
    yaxis_title="BB3",
    title="BB3 Frequencies Across Method Intersections",
    hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
)

# Step: Initialize Dash app around the figure and serve it on port 8050
app = Dash(__name__)
app.layout = html.Div([
    html.H3("BB3 Frequency Heatmap"),
    dcc.Graph(id='heatmap', figure=heatmap_figure),
])

app.run(jupyter_mode="external", port=8050)


In [None]:
import pandas as pd
from collections import Counter
import plotly.graph_objs as go
from dash import Dash, dcc, html

# BB3 groups
# bb3_groups = {
#     "Log2FC only": ['80', '80', '194', '95'],
#     "Wilcoxon only": ['80', '1058', '95', '95'],
#     "XGBoost only": ['194', '194', '80'],
#     "Log2FC + Wilcoxon": ['80', '95'],
#     "Log2FC + XGBoost": ['80', '194', '95'],
#     "Wilcoxon + XGBoost": ['95', '194'],
#     "All three": ['80', '194']
# }

# Build the BB3 × group heatmap from df_counts; the tooltip is rendered from the
# cell's own x / y / z values via the hover template.
heatmap_trace = go.Heatmap(
    z=df_counts.values,
    x=df_counts.columns,
    y=df_counts.index,
    colorscale='Viridis',
    colorbar=dict(title="Frequency"),
    hovertemplate="<b>BB3:</b> %{y}<br><b>Group:</b> %{x}<br><b>Count:</b> %{z}<extra></extra>",
)
heatmap_figure = go.Figure(data=heatmap_trace)
heatmap_figure.update_layout(
    xaxis_title="Group",
    yaxis_title="BB3",
    title="BB3 Frequencies Across Method Intersections",
    hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
)

# Create Dash app around the figure and serve it on port 8050
app = Dash(__name__)
app.layout = html.Div([
    html.H3("BB3 Frequency Heatmap"),
    dcc.Graph(id='heatmap', figure=heatmap_figure),
])

app.run(jupyter_mode="external", port=8050)


In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# Synthetic stand-in for the removed proprietary dataset: 100 rows with three
# categorical building-block columns and a normally distributed fluorescence.
# Columns are drawn in the order BB1, BB2, BB3, fluorescence from the seeded
# global NumPy generator, so the frame is reproducible.
np.random.seed(0)
n_rows = 100
synthetic_columns = {
    column: np.random.choice(levels, n_rows)
    for column, levels in (
        ('BB1', ['A', 'B', 'C', 'D']),
        ('BB2', ['E', 'F', 'G', 'H']),
        ('BB3', ['I', 'J', 'K', 'L']),
    )
}
synthetic_columns['fluorescence'] = np.random.normal(1000, 200, n_rows)
df = pd.DataFrame(synthetic_columns)
df.head()

In [None]:
from collections import Counter

# Pairwise overlap sizes between the three hit sets
for pair_label, first_hits, second_hits in (
    ("Log2FC ∩ Wilcoxon:", log2fc_hits, wilcoxon_hits),
    ("Log2FC ∩ XGBoost:", log2fc_hits, xgboost_hits),
    ("Wilcoxon ∩ XGBoost:", wilcoxon_hits, xgboost_hits),
):
    print(pair_label, len(first_hits & second_hits))

# Combine all sets into one flat list; a molecule appears once per set containing it
combined = [*log2fc_hits, *wilcoxon_hits, *xgboost_hits]

# Number of methods that called each molecule
mol_counts = Counter(combined)

# Keep only molecules called by at least 2 of the 3 methods
high_conf_2plus = [mol for mol in mol_counts if mol_counts[mol] >= 2]

print(f"High-confidence molecules (≥2 methods): {len(high_conf_2plus)}")
print("Examples:", high_conf_2plus[:5])


In [None]:
def extract_bb3_from_set(df_source, molecule_set):
    """Count BB3 occurrences among the rows of ``df_source`` for given molecules.

    Args:
        df_source: Frame with ``molecule`` and ``BB3`` columns.
        molecule_set: Molecules to keep; rows whose ``molecule`` is not in it
            are ignored.

    Returns:
        A frame with a ``BB3`` column and a ``count`` column, one row per
        distinct BB3 among the kept rows.
    """
    selected_rows = df_source[df_source['molecule'].isin(molecule_set)]
    bb3_counts = selected_rows['BB3'].value_counts()
    counts_frame = bb3_counts.reset_index(name='count')
    # Depending on the pandas version the label column comes back as 'index';
    # the rename is a no-op when it is already called 'BB3'.
    return counts_frame.rename(columns={'index': 'BB3'})
# BB3 count tables, one per source frame / molecule selection
fc_bb3 = extract_bb3_from_set(df_source=top_hits_logfold_change, molecule_set=log2fc_hits)
wilcoxon_bb3 = extract_bb3_from_set(df_source=top_hits_wilcoxon_hits, molecule_set=wilcoxon_hits)
seen_bb3 = extract_bb3_from_set(df_source=top_100_seen, molecule_set=all_molecules)
unseen_bb3 = extract_bb3_from_set(df_source=top_100_unseen, molecule_set=unseen['molecule'])

# Derive a BB3 column on `seen` from the text after the last underscore of each molecule
seen['BB3'] = seen['molecule'].str.rsplit("_", n=1).str[-1]


def normalize_counts(df):
    """Add a ``percent`` column giving each row's share of the total count.

    The frame is modified in place and also returned.

    Args:
        df: Frame with a numeric ``count`` column.

    Returns:
        The same frame object, with ``percent`` = 100 * count / sum(count).
    """
    total_count = df['count'].sum()
    df['percent'] = df['count'].mul(100).div(total_count)
    return df

# Add the percent column to each of the four count tables
fc_bb3, wilcoxon_bb3, seen_bb3, unseen_bb3 = (
    normalize_counts(counts)
    for counts in (fc_bb3, wilcoxon_bb3, seen_bb3, unseen_bb3)
)

# Outer-join the four tables on BB3. Suffixes are only applied to clashing
# column names: the first merge yields *_log2fc / *_wilcoxon, the second adds
# plain count / percent (no clash), and the third turns those into *_seen and
# adds *_unseen. BB3s missing from a source get 0.
bb3_summary = (
    fc_bb3
    .merge(wilcoxon_bb3, on='BB3', how='outer', suffixes=('_log2fc', '_wilcoxon'))
    .merge(seen_bb3, on='BB3', how='outer')
    .merge(unseen_bb3, on='BB3', how='outer', suffixes=('_seen', '_unseen'))
    .fillna(0)
)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Reshape to long form for plotting: one row per (BB3, source) with its percent
percent_columns = ['percent_log2fc', 'percent_wilcoxon', 'percent_seen', 'percent_unseen']
melted = bb3_summary.melt(
    id_vars='BB3',
    value_vars=percent_columns,
    var_name='Source',
    value_name='Percent',
)

# Grouped bars: one cluster per BB3, one bar per source
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=melted, x='BB3', y='Percent', hue='Source')
ax.set_title("BB3 Representation Across Hit Calling and Prediction Methods")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Compute average percent across all sources.
# The percent columns of bb3_summary are named percent_log2fc, percent_wilcoxon,
# percent_seen and percent_unseen (the suffixes given to the merges that build
# it); there are no "*_predicted" columns.
bb3_summary['avg_percent'] = bb3_summary[
    ['percent_log2fc', 'percent_wilcoxon', 'percent_seen', 'percent_unseen']
].mean(axis=1)

# Keep the 10 BB3s with the highest average percent
top_bb3s = bb3_summary.sort_values(by='avg_percent', ascending=False).head(10)['BB3']

# Restrict the long-form plotting frame to those BB3s
melted_filtered = melted[melted['BB3'].isin(top_bb3s)]


In [None]:
# Grouped bars for the BB3 subset currently held in melted_filtered
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=melted_filtered, x='BB3', y='Percent', hue='Source')
ax.set_title("Top BB3s Across Methods (Filtered)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Spread of each BB3's percent across the four sources (row-wise standard deviation)
percent_columns = ['percent_log2fc', 'percent_wilcoxon', 'percent_seen', 'percent_unseen']
bb3_summary['percent_std'] = bb3_summary[percent_columns].std(axis=1)

# The 10 BB3s whose representation varies most between sources
ranked_by_spread = bb3_summary.sort_values(by='percent_std', ascending=False)
top_varied_bb3s = ranked_by_spread.head(10)['BB3']

# Restrict the long-form frame to those BB3s and draw grouped bars
melted_filtered = melted[melted['BB3'].isin(top_varied_bb3s)]
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=melted_filtered, x='BB3', y='Percent', hue='Source')
ax.set_title("Top BB3s Across Methods")
plt.xticks(rotation=45)
plt.tight_layout()

# Save before show() so the written file contains the drawn figure
plt.savefig("topBB3s_across_methods.png", dpi=300)

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# --- Filter out BB3s with negligible representation ---
# Matrix of percent values: one row per BB3, one column per source.
bb3_percent_matrix = bb3_summary[
    ['BB3', 'percent_log2fc', 'percent_wilcoxon', 'percent_seen', 'percent_unseen']
].set_index('BB3')

# Optional: Keep only BB3s with any method > 5% to reduce clutter
bb3_percent_matrix = bb3_percent_matrix[bb3_percent_matrix.max(axis=1) > 5]

# --- Standardize columns (optional, good for clustering) ---
# from sklearn.preprocessing import StandardScaler
# bb3_percent_matrix_scaled = pd.DataFrame(
#     StandardScaler().fit_transform(bb3_percent_matrix),
#     index=bb3_percent_matrix.index,
#     columns=bb3_percent_matrix.columns
# )

# --- Create clustered heatmap ---
# sns.clustermap creates its own figure (sized by `figsize`), so no plt.figure()
# call is made first; one would only leave an empty extra figure in the output.
# NOTE(review): with standard_scale=1 the annotations appear to be the rescaled
# 0-1 values rather than raw percentages — confirm; passing the raw matrix as
# `annot` would show the percentages if that is what is wanted.
sns.clustermap(
    bb3_percent_matrix,
    cmap="YlGnBu",
    linewidths=0.5,
    figsize=(10, 10),
    standard_scale=1,  # Normalize per column
    annot=True,        # Annotate each cell with its plotted value
    fmt=".1f"
)

# The clustermap figure is the current figure, so the title and the saved file
# both refer to it.
plt.suptitle("BB3 Representation Across Hit Calling and Prediction Methods", fontsize=14)
plt.savefig("bb3_clustermap.png", dpi=300)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def get_bb3_freq(molecules):
    """Return the percentage of molecules carrying each BB3.

    Here the BB3 is the last two underscore-separated components of the
    molecule string, re-joined with an underscore.

    Args:
        molecules: Iterable of molecule strings.

    Returns:
        A Series indexed by BB3 whose values are percentages of the input.
    """
    bb3_labels = pd.Series(["_".join(m.rsplit("_", 2)[-2:]) for m in molecules])
    return bb3_labels.value_counts(normalize=True) * 100  # percentage

# Per-method BB3 percentages side by side; a BB3 absent from a method gets 0.
# Note that this rebinds the name `df` to the frequency table.
hits_by_method = {
    "Log2FC": log2fc_hits,
    "Wilcoxon": wilcoxon_hits,
    "XGBoost": xgboost_hits,
}
df = (
    pd.DataFrame({method: get_bb3_freq(hits) for method, hits in hits_by_method.items()})
    .fillna(0)
    .sort_values("Log2FC", ascending=False)
    .head(15)  # limit to top BB3s for clarity
)

df.plot.bar(figsize=(10, 6))
plt.ylabel("Percent of Hits (%)")
plt.title("Top BB3s by Relative Frequency in Each Method")
plt.tight_layout()
plt.savefig("bb3_strength_barplot.png", dpi=300)
plt.show()


In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# Synthetic stand-in for the removed proprietary dataset: 100 rows with three
# categorical building-block columns and a normally distributed fluorescence.
# Columns are drawn in the order BB1, BB2, BB3, fluorescence from the seeded
# global NumPy generator, so the frame is reproducible.
np.random.seed(0)
n_rows = 100
synthetic_columns = {
    column: np.random.choice(levels, n_rows)
    for column, levels in (
        ('BB1', ['A', 'B', 'C', 'D']),
        ('BB2', ['E', 'F', 'G', 'H']),
        ('BB3', ['I', 'J', 'K', 'L']),
    )
}
synthetic_columns['fluorescence'] = np.random.normal(1000, 200, n_rows)
df = pd.DataFrame(synthetic_columns)
df.head()

In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# Synthetic stand-in for the removed proprietary dataset: 100 rows with three
# categorical building-block columns and a normally distributed fluorescence.
# Columns are drawn in the order BB1, BB2, BB3, fluorescence from the seeded
# global NumPy generator, so the frame is reproducible.
np.random.seed(0)
n_rows = 100
synthetic_columns = {
    column: np.random.choice(levels, n_rows)
    for column, levels in (
        ('BB1', ['A', 'B', 'C', 'D']),
        ('BB2', ['E', 'F', 'G', 'H']),
        ('BB3', ['I', 'J', 'K', 'L']),
    )
}
synthetic_columns['fluorescence'] = np.random.normal(1000, 200, n_rows)
df = pd.DataFrame(synthetic_columns)
df.head()

In [None]:
# Rows of `unseen` whose BB3 is CP_1002, strongest final_intensity first
is_cp_1002 = unseen['BB3'] == 'CP_1002'
unseen[is_cp_1002].sort_values(by='final_intensity', ascending=False)


In [None]:
import plotly.express as px

# Long-form rows for the most variable BB3s only
melted_filtered = melted[melted['BB3'].isin(top_varied_bb3s)]

# Interactive grouped bar chart: one cluster per BB3, one bar per source
bar_options = dict(
    x='BB3',
    y='Percent',
    color='Source',
    title='Top BB3s Across Methods (Interactive)',
    hover_data=['BB3', 'Source', 'Percent'],
    barmode='group',
    labels={'Percent': 'Percent of Hits'},
)
fig = px.bar(melted_filtered, **bar_options)

# Fixed canvas size with slanted x tick labels
fig.update_layout(xaxis_tickangle=-45, height=600, width=900)

fig.show()


In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# Synthetic stand-in for the removed proprietary dataset: 100 rows with three
# categorical building-block columns and a normally distributed fluorescence.
# Columns are drawn in the order BB1, BB2, BB3, fluorescence from the seeded
# global NumPy generator, so the frame is reproducible.
np.random.seed(0)
n_rows = 100
synthetic_columns = {
    column: np.random.choice(levels, n_rows)
    for column, levels in (
        ('BB1', ['A', 'B', 'C', 'D']),
        ('BB2', ['E', 'F', 'G', 'H']),
        ('BB3', ['I', 'J', 'K', 'L']),
    )
}
synthetic_columns['fluorescence'] = np.random.normal(1000, 200, n_rows)
df = pd.DataFrame(synthetic_columns)
df.head()

In [None]:
import numpy as np
import pandas as pd

# Step 1: BB3 labels, as strings, for the filtered library and for the Log2FC top hits
all_bb3s = df_filtered['BB3'].astype(str)
hit_bb3s = top_hits_logfold_change['BB3'].astype(str)

# Step 2: Background = library entries whose BB3 never occurs among the top hits
occurs_in_hits = all_bb3s.isin(hit_bb3s)
background_bb3s = all_bb3s[~occurs_in_hits]

# Step 3: Normalized frequency distributions (proportions summing to 1)
hit_freqs = hit_bb3s.value_counts(normalize=True)
bg_freqs = background_bb3s.value_counts(normalize=True)

# Step 4: Union of all BB3s seen in either distribution
bb3_all = set(hit_freqs.index) | set(bg_freqs.index)

# Step 5: log2 of hit proportion over background proportion, with a 1e-6 floor.
# Because step 2 removes every BB3 seen in the hits, no BB3 has both a hit and a
# background frequency: each ratio uses the 1e-6 fallback on one side.
# NOTE(review): if a per-BB3 enrichment is intended, the background probably
# needs to be defined per molecule/row rather than per BB3 — confirm.
log2_enrichment = {}
for bb3 in bb3_all:
    hit_fraction = hit_freqs.get(bb3, 0) + 1e-6
    bg_fraction = bg_freqs.get(bb3, 1e-6)
    log2_enrichment[bb3] = np.log2(hit_fraction / bg_fraction)


In [None]:
# Summary statistics of the four per-source percent columns
percent_columns = ['percent_log2fc', 'percent_wilcoxon', 'percent_seen', 'percent_unseen']
print(bb3_summary[percent_columns].describe())


In [None]:
# Sanity report on the frames feeding the BB3 plots
for description, value in (
    ("BB3 Summary shape:", bb3_summary.shape),
    ("Top varied BB3s:", top_varied_bb3s.tolist()),
    ("Melted shape:", melted.shape),
    ("Filtered shape:", melted_filtered.shape),
):
    print(description, value)


In [None]:
# Proprietary data loading removed.
# df = pd.read_csv('proprietary_dataset.csv')

import pandas as pd
import numpy as np
# Synthetic stand-in for the removed proprietary dataset: 100 rows with three
# categorical building-block columns and a normally distributed fluorescence.
# Columns are drawn in the order BB1, BB2, BB3, fluorescence from the seeded
# global NumPy generator, so the frame is reproducible.
np.random.seed(0)
n_rows = 100
synthetic_columns = {
    column: np.random.choice(levels, n_rows)
    for column, levels in (
        ('BB1', ['A', 'B', 'C', 'D']),
        ('BB2', ['E', 'F', 'G', 'H']),
        ('BB3', ['I', 'J', 'K', 'L']),
    )
}
synthetic_columns['fluorescence'] = np.random.normal(1000, 200, n_rows)
df = pd.DataFrame(synthetic_columns)
df.head()

In [None]:
# Compare the label format used in the plotting frame with the enriched-BB3 list
melted_bb3_preview = melted['BB3'].unique()[:5]
print("Melted BB3s:", melted_bb3_preview)
enriched_bb3_preview = top_enriched_bb3s[:5]
print("Top enriched BB3s:", enriched_bb3_preview)