In [None]:
import pandas as pd
from Bio import SeqIO
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three input tables in one pass; the order of the paths matches the
# order of the target names on the left-hand side.
codon97_df, protein_df, rna_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/mnt/work_3/sijin/CAI/RNA_Proteomics_Filtered.csv",
        "/mnt/work_3/sijin/CAI/RNA_Proteomics_Proteinlevel.csv",
        "/mnt/work_3/sijin/CAI/RNA_Proteomics_RNAlevel.csv",
    )
)

def get_len(seq):
    """Return the number of non-whitespace characters in a sequence value.

    Parameters
    ----------
    seq : object
        A sequence cell value. It is converted with ``str()`` before counting.

    Returns
    -------
    int or None
        The character count after all whitespace has been removed, or ``None``
        when ``seq`` is missing according to ``pd.isna``.
    """
    if pd.isna(seq):
        return None
    # str.split() with no argument splits on every run of whitespace (spaces,
    # tabs, "\r", "\n", ...), so tabs or Windows line endings embedded in the
    # cell are not counted as sequence characters.
    return len("".join(str(seq).split()))

# Add a CDS_len column to each table, computed from that table's CDS column.
for _frame in (codon97_df, protein_df, rna_df):
    _frame["CDS_len"] = _frame["CDS"].apply(get_len)

# Parse the FASTA file into one row per entry.
def _fasta_record(entry):
    """Turn one parsed FASTA entry into a row dict (NM_ID, CDS, CDS_len).

    The ID is the text before the first "_cds" in the description line,
    keeping only the part after the last "|". The sequence is upper-cased.
    """
    cds_text = str(entry.seq).upper()
    transcript_id = entry.description.split("_cds")[0].split("|")[-1]
    return {"NM_ID": transcript_id, "CDS": cds_text, "CDS_len": len(cds_text)}


records = [
    _fasta_record(entry)
    for entry in SeqIO.parse("/mnt/work_3/sijin/CAI/sequence.txt", "fasta")
]

allprot_df = pd.DataFrame(records)

# Long-format table: one row per CDS length, tagged with the table it came from.
# The dict order fixes the row order of the concatenated result.
_length_by_source = {
    "Codon97": codon97_df["CDS_len"],
    "RNAseq": rna_df["CDS_len"],
    "Proteomics": protein_df["CDS_len"],
    "AllProt": allprot_df["CDS_len"],
}

all_lengths = pd.DataFrame({
    "Length": pd.concat(list(_length_by_source.values()), ignore_index=True),
    # Repeat each label once per row of its table so labels line up with lengths.
    "Source": [
        source_name
        for source_name, lengths in _length_by_source.items()
        for _ in range(len(lengths))
    ],
})


In [None]:
# Sort by CDS length (longest first) and drop the top entries, which are
# treated as outliers; the other two tables are used untrimmed.

N_ALLPROT_DROPPED = 2  # longest entries removed from the FASTA-derived table
N_RNA_DROPPED = 1      # longest entries removed from the RNA-level table

allprot_trimmed = (
    allprot_df
    .sort_values("CDS_len", ascending=False)
    .iloc[N_ALLPROT_DROPPED:]
)
rna_trimmed = (
    rna_df
    .sort_values("CDS_len", ascending=False)
    .iloc[N_RNA_DROPPED:]
)

# Long-format table of the trimmed lengths; dict order fixes the row order.
_trimmed_by_source = {
    "Codon97": codon97_df["CDS_len"],
    "RNAseq_Log2FC>0": rna_trimmed["CDS_len"],
    "Proteomics_Log2FC>0": protein_df["CDS_len"],
    "All_otherprot_all_Log2FC": allprot_trimmed["CDS_len"],
}

all_lengths_trimmed = pd.DataFrame({
    "Length": pd.concat(list(_trimmed_by_source.values()), ignore_index=True),
    # Repeat each label once per row of its table so labels line up with lengths.
    "Source": [
        source_name
        for source_name, lengths in _trimmed_by_source.items()
        for _ in range(len(lengths))
    ],
})

# Fill colour for each group, keyed by its Source label.
custom_palette = {
    "Codon97": "#8DD3C7",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "RNAseq_Log2FC>0": "#c2bed6",
    "All_otherprot_all_Log2FC": "#eab375",
}

# Left-to-right order of the groups on the x axis.
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]

# Per-group sample size used in the tick labels. GroupBy.count() skips NaN
# lengths (GroupBy.size() would include them), so n reflects only rows that
# actually have a length; the result is unchanged when no length is missing.
counts = all_lengths_trimmed.groupby("Source")["Length"].count()

#### bootstrap test

In [None]:
import numpy as np
import pandas as pd
def bootstrap_pvalue(test_vals, all_vals, n_iter=1000, seed=123):
    """Monte Carlo two-sided p-value for a group mean against a reference pool.

    Draws ``n_iter`` random subsets of ``all_vals``, each of size
    ``len(test_vals)`` and *without* replacement, and reports the fraction of
    subsets whose mean lies at least as far from the mean of ``all_vals`` as
    the mean of ``test_vals`` does.

    Parameters
    ----------
    test_vals : array-like
        1-D values of the group being tested.
    all_vals : array-like
        1-D values of the reference pool the subsets are drawn from.
    n_iter : int, default 1000
        Number of random subsets.
    seed : int or None, default 123
        Seed for the private random generator used for the draws.

    Returns
    -------
    tuple
        ``(mean of test_vals, mean of all_vals, p-value)``.

    Raises
    ------
    ValueError
        If either input is empty, or if ``test_vals`` has more elements than
        ``all_vals`` (a subset that large cannot be drawn without replacement).

    Notes
    -----
    The p-value is a plain proportion, so it is exactly 0.0 when no subset is
    as extreme as the observed group; read that as "below 1 / n_iter".
    """
    test_vals = np.asarray(test_vals)
    all_vals = np.asarray(all_vals)
    n = len(test_vals)

    if n == 0 or len(all_vals) == 0:
        raise ValueError("test_vals and all_vals must both be non-empty")
    if n > len(all_vals):
        raise ValueError(
            "test_vals has more elements than all_vals; cannot sample "
            "without replacement"
        )

    # A private RandomState yields the same draws as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global NumPy RNG untouched.
    rng = np.random.RandomState(seed)

    test_mean = np.mean(test_vals)
    all_mean = np.mean(all_vals)

    boot_means = np.array([
        np.mean(rng.choice(all_vals, size=n, replace=False))
        for _ in range(n_iter)
    ])

    # Two-sided: compare absolute deviations from the pool mean.
    pval = np.mean(np.abs(boot_means - all_mean) >= np.abs(test_mean - all_mean))

    return test_mean, all_mean, pval


def bootstrap_pvalue_median(test_vals, all_vals, n_iter=1000, seed=123):
    """Monte Carlo two-sided p-value for a group median against a reference pool.

    Draws ``n_iter`` random subsets of ``all_vals``, each of size
    ``len(test_vals)`` and *without* replacement, and reports the fraction of
    subsets whose median lies at least as far from the median of ``all_vals``
    as the median of ``test_vals`` does.

    Parameters
    ----------
    test_vals : array-like
        1-D values of the group being tested.
    all_vals : array-like
        1-D values of the reference pool the subsets are drawn from.
    n_iter : int, default 1000
        Number of random subsets.
    seed : int or None, default 123
        Seed for the private random generator used for the draws.

    Returns
    -------
    tuple
        ``(median of test_vals, median of all_vals, p-value)``.

    Raises
    ------
    ValueError
        If either input is empty, or if ``test_vals`` has more elements than
        ``all_vals`` (a subset that large cannot be drawn without replacement).

    Notes
    -----
    The p-value is a plain proportion, so it is exactly 0.0 when no subset is
    as extreme as the observed group; read that as "below 1 / n_iter".
    """
    test_vals = np.asarray(test_vals)
    all_vals = np.asarray(all_vals)
    n = len(test_vals)

    if n == 0 or len(all_vals) == 0:
        raise ValueError("test_vals and all_vals must both be non-empty")
    if n > len(all_vals):
        raise ValueError(
            "test_vals has more elements than all_vals; cannot sample "
            "without replacement"
        )

    # A private RandomState yields the same draws as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global NumPy RNG untouched.
    rng = np.random.RandomState(seed)

    test_median = np.median(test_vals)
    all_median = np.median(all_vals)

    boot_medians = np.array([
        np.median(rng.choice(all_vals, size=n, replace=False))
        for _ in range(n_iter)
    ])

    # Two-sided: compare absolute deviations from the pool median.
    pval = np.mean(
        np.abs(boot_medians - all_median) >= np.abs(test_median - all_median)
    )

    return test_median, all_median, pval

# Drop the 2 longest entries of the FASTA-derived table and the single longest
# entry of the RNA-level table before testing.
allprot_trimmed = (
    allprot_df
    .sort_values("CDS_len", ascending=False)
    .iloc[2:]
)
rna_trimmed = (
    rna_df
    .sort_values("CDS_len", ascending=False)
    .iloc[1:]
)

# Plain arrays of the non-missing lengths for each group and for the pool.
all_len     = allprot_trimmed["CDS_len"].dropna().values
codon97_len = codon97_df["CDS_len"].dropna().values
rna_len     = rna_trimmed["CDS_len"].dropna().values
protein_len = protein_df["CDS_len"].dropna().values

# Groups compared against the pool, in the order they appear in the result tables.
_groups_vs_pool = {
    "Codon97": codon97_len,
    "RNAseq": rna_len,
    "Proteomics": protein_len,
}

# mean
results_mean = {
    group_name: bootstrap_pvalue(lengths, all_len, n_iter=1000)
    for group_name, lengths in _groups_vs_pool.items()
}

# Each result tuple becomes a row: group statistic, pool statistic, p-value.
res_df_mean = pd.DataFrame(
    results_mean,
    index=["Mean_Group","Mean_Allprot","p-value"],
).T
print("=== Mean comparison (trimmed) ===")
print(res_df_mean)

# median
results_median = {
    group_name: bootstrap_pvalue_median(lengths, all_len, n_iter=1000)
    for group_name, lengths in _groups_vs_pool.items()
}

res_df_median = pd.DataFrame(
    results_median,
    index=["Median_Group","Median_Allprot","p-value"],
).T
print("\n=== Median comparison (trimmed) ===")
print(res_df_median)

#### Plot

In [None]:
import math
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Violin + strip + box plot of CDS length per group on a log10 y axis,
# annotated with each group's mean and a p-value bracket against the last group.
fig, ax = plt.subplots(figsize=(10, 6))

# hue="Source" with legend=False colours each category from the palette dict
# without the deprecated "palette without hue" usage. log_scale=True puts the
# y axis on a log scale; the layers below are drawn on the same axes.
sns.violinplot(
    x="Source", y="Length", hue="Source", legend=False, log_scale=True,
    data=all_lengths_trimmed, inner=None, palette=custom_palette,
    order=order, linewidth=1.2, ax=ax,
)

sns.stripplot(
    x="Source", y="Length", data=all_lengths_trimmed,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order, ax=ax,
)

sns.boxplot(
    x="Source", y="Length", data=all_lengths_trimmed,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False, ax=ax,
)

# Mark and label the arithmetic mean of every group.
group_means = all_lengths_trimmed.groupby("Source")["Length"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]*1.05, f"mean = {group_means[src]:.0f}",
            ha='center', va='bottom', fontsize=10, color="blue")

# NOTE(review): these p-values are hard-coded rather than read from the
# resampling results; confirm they match the current data and which statistic
# (mean or median) they refer to.
p_values = {
    "Codon97": 0.659,
    "RNAseq_Log2FC>0": 0.307,
    "Proteomics_Log2FC>0": 0.000
}

y_max = all_lengths_trimmed["Length"].max()
base = math.log10(y_max)

log_step = 0.4  # vertical spacing between brackets, in log10 units

last_x = len(order) - 1          # every bracket ends at the last group
highest_annotation = y_max       # tracks the topmost label for the y limit

for i, src in enumerate(order[:-1]):
    pval = p_values.get(src, None)
    if pval is None:
        continue

    # Stack brackets above the data, one log_step higher per group.
    y = 10**(base + (i+1)*log_step)
    line_height = 10**(base + (i+1)*log_step + 0.05)

    ax.plot([i, i, last_x, last_x], [y, line_height, line_height, y],
            lw=1.2, c='k')

    # A value that rounds to 0.000 is shown as an upper bound instead of
    # an exact zero.
    p_label = "p < 0.001" if pval < 0.001 else f"p = {pval:.3f}"
    label_y = line_height*1.1
    ax.text((i+last_x)/2, label_y, p_label,
            ha='center', va='bottom', fontsize=10, color="blue")
    highest_annotation = max(highest_annotation, label_y)

ax.set_xticks(list(range(len(order))))
ax.set_xticklabels([f"{src}\n(n={counts[src]})" for src in order])

# A log axis cannot start at 0 (matplotlib ignores a non-positive limit with a
# warning), so only the upper limit is set. Keep at least the 1e6 ceiling and
# raise it when the topmost p-value label would not fit.
ax.set_ylim(top=max(1e6, highest_annotation*1.5))
ax.set_title("CDS Length Distribution (outliers removed, log10 scale)", fontsize=14)
ax.set_ylabel("CDS Length (nt, log10)", fontsize=12)
ax.set_xlabel("")
ax.grid(axis="y", linestyle="--", alpha=0.6)

fig.tight_layout()

plt.show()
