In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import SeqIO

# Read the three input tables; the paths are relative to the working directory.
codon97_df, protein_df, rna_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/RNA_Proteomics_Filtered.csv",
        "data/RNA_Proteomics_Proteinlevel.csv",
        "data/RNA_Proteomics_RNAlevel.csv",
    )
)

# Get the length of each cds
def get_len(seq):
    """Return the length of a sequence string, ignoring all whitespace.

    Parameters
    ----------
    seq : object
        A scalar cell value; it is converted with ``str()`` before measuring.

    Returns
    -------
    int or float
        Number of non-whitespace characters, or ``np.nan`` when ``seq`` is
        missing (``pd.isna``) or contains nothing but whitespace.
    """
    if pd.isna(seq):
        return np.nan
    # str.split() with no argument splits on every whitespace character, so
    # carriage returns and tabs (e.g. from CRLF line endings inside a CSV
    # cell) no longer inflate the length the way a space/"\n"-only strip did.
    cleaned = "".join(str(seq).split())
    return len(cleaned) if cleaned else np.nan

# Add a "CDS_len" column to each table by applying get_len to its "CDS" column.
for table in (codon97_df, protein_df, rna_df):
    table["CDS_len"] = table["CDS"].apply(get_len)

# Parse FASTA for AllProt
# NOTE(review): absolute, machine-specific path — consider making it
# configurable (e.g. relative to a data directory) so the notebook is portable.
ALLPROT_FASTA = "/mnt/work_3/sijin/CAI/sequence.txt"

records = []
for rec in SeqIO.parse(ALLPROT_FASTA, "fasta"):
    # ID = last "|"-separated field of the header text that precedes "_cds".
    nm_id = rec.description.split("_cds")[0].split("|")[-1]
    seq = str(rec.seq)
    # Measure with get_len, like the CSV tables: an empty record becomes NaN
    # (removed by the dropna below) instead of 0, whose log10 is -inf and
    # would turn the AllProt mean into -inf.
    records.append({"NM_ID": nm_id, "CDS_len": get_len(seq)})

# Explicit columns keep "CDS_len" addressable even when the FASTA has no records.
allprot_df = pd.DataFrame(records, columns=["NM_ID", "CDS_len"])

# Per-group arrays of CDS lengths: missing values removed, cast to float.
codon97_len, prot_len, rna_len, all_len = (
    frame["CDS_len"].dropna().astype(float).values
    for frame in (codon97_df, protein_df, rna_df, allprot_df)
)

# log10-transform every length array.
codon97_log, prot_log, rna_log, all_log = (
    np.log10(lengths)
    for lengths in (codon97_len, prot_len, rna_len, all_len)
)


#### bootstrap test

In [None]:
import numpy as np

def bootstrap_pvalue_mean(test_vals, all_vals, n_iter=10000, seed=123):
    """Two-sided Monte Carlo test of a group's mean against random same-size draws.

    ``n_iter`` random subsets of ``all_vals`` are drawn, each of size
    ``len(test_vals)`` and sampled *without* replacement. The p-value is the
    share of subsets whose mean lies at least as far from ``mean(all_vals)``
    as ``mean(test_vals)`` does.

    Parameters
    ----------
    test_vals : 1-D array-like of float
        Values of the group under test.
    all_vals : 1-D array-like of float
        Background population the random subsets are drawn from.
    n_iter : int, default 10000
        Number of random subsets.
    seed : int, default 123
        Seed for a local random generator; the global numpy random state is
        left untouched.

    Returns
    -------
    (test_mean, all_mean, pval)
        Mean of ``test_vals``, mean of ``all_vals`` and the two-sided p-value.

    Raises
    ------
    ValueError
        If either input is empty, if ``test_vals`` is longer than ``all_vals``
        (a subset of that size cannot be drawn without replacement), or if
        ``n_iter`` is smaller than 1.
    """
    test_vals = np.asarray(test_vals, dtype=float)
    all_vals = np.asarray(all_vals, dtype=float)
    n = len(test_vals)

    if n == 0 or len(all_vals) == 0:
        raise ValueError("test_vals and all_vals must both be non-empty")
    if n > len(all_vals):
        raise ValueError(
            "test_vals is longer than all_vals; cannot sample without replacement"
        )
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    # Local generator: reproducible without reseeding numpy's global RNG, which
    # would silently change every later np.random call in the notebook.
    rng = np.random.default_rng(seed)

    test_mean = np.mean(test_vals)
    all_mean = np.mean(all_vals)

    boot_means = np.empty(n_iter, dtype=float)
    for i in range(n_iter):
        boot_means[i] = rng.choice(all_vals, size=n, replace=False).mean()

    observed_gap = np.abs(test_mean - all_mean)
    n_extreme = np.count_nonzero(np.abs(boot_means - all_mean) >= observed_gap)

    # Count the observed statistic as one of the draws so a finite number of
    # resamples can never report an impossible p-value of exactly 0.
    pval = (n_extreme + 1) / (n_iter + 1)
    return test_mean, all_mean, pval

# Test each group's log10 CDS lengths against the AllProt background.
codon97_mean, all_mean1, p_cod_all = bootstrap_pvalue_mean(
    test_vals=codon97_log, all_vals=all_log
)
prot_mean, all_mean2, p_prot_all = bootstrap_pvalue_mean(
    test_vals=prot_log, all_vals=all_log
)
rna_mean, all_mean3, p_rna_all = bootstrap_pvalue_mean(
    test_vals=rna_log, all_vals=all_log
)

# One summary row per group: label, group mean, background mean, p-value.
summary_rows = [
    ("Codon97", codon97_mean, all_mean1, p_cod_all),
    ("Proteomics_Log2FC>0", prot_mean, all_mean2, p_prot_all),
    ("RNAseq_Log2FC>0", rna_mean, all_mean3, p_rna_all),
]
res_df = pd.DataFrame(
    summary_rows,
    columns=[
        "Group",
        "Mean_log10_Group",
        "Mean_log10_AllProt",
        "p-value (two-sided)",
    ],
)

print(
    "\n=== Bootstrap mean test (log10 CDS length): each group vs AllProt ===",
    res_df,
    sep="\n",
)


#### Plot

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import matplotlib as mpl

# Group label -> log10 lengths, listed in the left-to-right plotting order.
log_lengths_by_source = {
    "Codon97": codon97_log,
    "Proteomics_Log2FC>0": prot_log,
    "RNAseq_Log2FC>0": rna_log,
    "AllProt": all_log,
}

# Long-format table: one row per sequence, labelled with its group.
all_data = pd.DataFrame({
    "log10_Length": np.concatenate(list(log_lengths_by_source.values())),
    "Source": [
        label
        for label, values in log_lengths_by_source.items()
        for _ in range(len(values))
    ],
})

order = list(log_lengths_by_source)

# Fill colour of each group's violin.
custom_palette = dict([
    ("Codon97", "#8DD3C7"),
    ("RNAseq_Log2FC>0", "#c2bed6"),
    ("Proteomics_Log2FC>0", "#f6f6bc"),
    ("AllProt", "#eab375"),
])

# Rows per group; shown in the x tick labels.
counts = all_data.groupby("Source").size()

# One figure with a single axes; every layer below is drawn on that axes.
fig, ax = plt.subplots(figsize=(10, 6))

# Layer 1: violin outlines coloured per group, trimmed to the data range (cut=0).
sns.violinplot(
    data=all_data, x="Source", y="log10_Length",
    order=order, palette=custom_palette,
    inner=None, cut=0, linewidth=1.2,
    ax=ax,
)

# Layer 2: individual observations as small, jittered, semi-transparent dots.
sns.stripplot(
    data=all_data, x="Source", y="log10_Length",
    order=order, size=1.5, jitter=0.25,
    color="black", alpha=0.2,
    ax=ax,
)

# Layer 3: narrow grey box plot without outlier markers.
sns.boxplot(
    data=all_data, x="Source", y="log10_Length",
    order=order, width=0.15,
    showcaps=True, showfliers=False,
    boxprops=dict(facecolor="grey", edgecolor="black"),
    whiskerprops=dict(color="black"),
    medianprops=dict(color="black", linewidth=1),
    ax=ax,
)

# Layer 4: group means as white dots with a black outline.
group_means = all_data.groupby("Source")["log10_Length"].mean()
for x_pos, label in enumerate(order):
    ax.scatter(
        x_pos, group_means[label],
        s=30, color="white", edgecolors="black", zorder=3,
    )

# Highest data value; the p-value brackets are stacked above it.
y_max = all_data["log10_Length"].max()

def add_bracket(x1, x2, y, h, text, fs=10):
    """Draw a square bracket between x1 and x2 via pyplot and caption it.

    The bracket rises from height ``y`` to ``y + h``. ``text`` is centred
    horizontally between ``x1`` and ``x2`` and placed 0.02 above the bracket
    top, at font size ``fs``.
    """
    top = y + h
    xs = (x1, x1, x2, x2)
    ys = (y, top, top, y)
    plt.plot(xs, ys, c="k", lw=1.2)
    plt.text(
        (x1 + x2) / 2, top + 0.02, text,
        fontsize=fs, color="black", ha="center", va="bottom",
    )

# Vertical layout of the p-value brackets, in y-axis (log10) units.
base_y = y_max + 0.10   # lowest bracket starts just above the highest data point
step_y = 0.35           # vertical distance between stacked brackets
h = 0.10                # height of each bracket's vertical ends

# NOTE(review): 0.05 is an assumed significance threshold — confirm the level
# intended for the "*" marker. The marker is derived from the computed p-value
# so the figure cannot disagree with the test results when the data change.
ALPHA = 0.05

# Every group is compared against AllProt; x positions come from `order`.
ref_x = order.index("AllProt")
comparisons = [
    ("Codon97", p_cod_all),
    ("Proteomics_Log2FC>0", p_prot_all),
    ("RNAseq_Log2FC>0", p_rna_all),
]

for level, (label, p_val) in enumerate(comparisons):
    star = "*" if p_val < ALPHA else ""
    add_bracket(
        order.index(label), ref_x,
        base_y + level * step_y, h,
        f"p = {p_val:.4f}{star}",
    )

# Leave headroom above the top bracket and its caption.
# NOTE(review): the fixed lower limit of 2 hides any value with
# log10 length < 2 — confirm that this is intended.
top_bracket_y = base_y + (len(comparisons) - 1) * step_y + h
ax.set_ylim(2, max(6, top_bracket_y + 0.25))

# Tick labels show the group name and its number of observations.
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts.get(src, 0)})" for src in order]
)

plt.title("CDS Length Distribution (log10-transformed)", fontsize=14)
plt.ylabel("log10(CDS Length)", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()

plt.show()
