### File

In [None]:
import pandas as pd
# Load the RNA-level and protein-level tables; later cells add a CAI column
# to each and write them back to these same paths.
rna_seq_df, prot_seq_df = map(
    pd.read_csv,
    ("data/RNA_Proteomics_RNAlevel.csv", "data/RNA_Proteomics_Proteinlevel.csv"),
)

### 1. Find CAI for RNA level file

In [None]:
import pandas as pd
from Bio import SeqIO
from math import log, exp

# Kazusa codon usage table, keyed by RNA codon (written with U, not T).
# Source: https://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=9606
# NOTE(review): the numbers are presumably the per-thousand frequencies shown
# on that page -- confirm against the source before changing any value.
# All 64 codons are listed, including the three stop codons (UAA, UAG, UGA).
kazusa_freq = {
    "UUU": 17.6, "UUC": 20.3, "UUA": 7.7, "UUG": 12.9,
    "CUU": 13.2, "CUC": 19.6, "CUA": 7.2, "CUG": 39.6,
    "AUU": 16.0, "AUC": 20.8, "AUA": 7.5, "AUG": 22.0,
    "GUU": 11.0, "GUC": 14.5, "GUA": 7.1, "GUG": 28.1,
    "UCU": 15.2, "UCC": 17.7, "UCA": 12.2, "UCG": 4.4,
    "CCU": 17.5, "CCC": 19.8, "CCA": 16.9, "CCG": 6.9,
    "ACU": 13.1, "ACC": 18.9, "ACA": 15.1, "ACG": 6.1,
    "GCU": 18.4, "GCC": 27.7, "GCA": 15.8, "GCG": 7.4,
    "UAU": 12.2, "UAC": 15.3, "UAA": 1.0, "UAG": 0.8,
    "CAU": 10.9, "CAC": 15.1, "CAA": 12.3, "CAG": 34.2,
    "AAU": 17.0, "AAC": 19.1, "AAA": 24.4, "AAG": 31.9,
    "GAU": 21.8, "GAC": 25.1, "GAA": 29.0, "GAG": 39.6,
    "UGU": 10.6, "UGC": 12.6, "UGA": 1.6, "UGG": 13.2,
    "CGU": 4.5, "CGC": 10.4, "CGA": 6.2, "CGG": 11.4,
    "AGU": 12.1, "AGC": 19.5, "AGA": 12.2, "AGG": 12.0,
    "GGU": 10.8, "GGC": 22.2, "GGA": 16.5, "GGG": 16.5
}

# Amino acid (one-letter code) -> list of its synonymous RNA codons.
# The stop codons (UAA, UAG, UGA) have no entry, so the weight computation
# that iterates over this table never assigns them a weight.
# M and W each have a single codon, so their relative weight is always 1.
aa_table = {
    'F': ['UUU', 'UUC'], 
    'L': ['UUA','UUG','CUU','CUC','CUA','CUG'],
    'I': ['AUU','AUC','AUA'], 'M': ['AUG'], 
    'V': ['GUU','GUC','GUA','GUG'],
    'S': ['UCU','UCC','UCA','UCG','AGU','AGC'], 
    'P': ['CCU','CCC','CCA','CCG'],
    'T': ['ACU','ACC','ACA','ACG'], 
    'A': ['GCU','GCC','GCA','GCG'],
    'Y': ['UAU','UAC'], 
    'H': ['CAU','CAC'], 
    'Q': ['CAA','CAG'],
    'N': ['AAU','AAC'], 
    'K': ['AAA','AAG'], 
    'D': ['GAU','GAC'],
    'E': ['GAA','GAG'], 
    'C': ['UGU','UGC'], 
    'W': ['UGG'],
    'R': ['CGU','CGC','CGA','CGG','AGA','AGG'], 
    'G': ['GGU','GGC','GGA','GGG']
}

# Relative adaptiveness of every sense codon: within each synonymous family,
# a codon's usage is first expressed relative to the family mean (RSCU) and
# then scaled so that the most-used codon of the family gets weight 1.
codon_weights = {}
for family in aa_table.values():
    listed = [codon for codon in family if codon in kazusa_freq]
    family_mean = sum(kazusa_freq[codon] for codon in listed) / len(family)
    family_rscu = {codon: kazusa_freq[codon] / family_mean for codon in listed}
    best_rscu = max(family_rscu.values())
    codon_weights.update(
        {codon: value / best_rscu for codon, value in family_rscu.items()}
    )


# to compute CAI
def compute_cai(seq, weights=None):
    """Return the Codon Adaptation Index (geometric mean of codon weights) of a CDS.

    Parameters
    ----------
    seq : str or str-convertible sequence object
        Coding sequence read in frame from its first character. DNA (T) and
        RNA (U) alphabets and either letter case are accepted. A missing
        value (anything ``pd.isna`` reports as NA) yields ``None``.
    weights : dict, optional
        Mapping of RNA codon -> relative weight (must be > 0). Defaults to the
        notebook-level ``codon_weights`` table.

    Returns
    -------
    float or None
        ``exp(mean(log(w)))`` over all codons that have a weight, or ``None``
        when the input is missing or contains no scorable codon.

    Notes
    -----
    Stop codons, a trailing partial codon, and codons without a weight
    (e.g. containing N) are skipped rather than treated as errors.
    """
    if pd.isna(seq):
        return None
    if weights is None:
        weights = codon_weights

    # Remove every whitespace character before splitting into triplets: a
    # stray space or line break inside the sequence would otherwise shift the
    # reading frame for everything after it without raising any error.
    rna = "".join(str(seq).split()).upper().replace("T", "U")

    stop_codons = ("UAA", "UAG", "UGA")
    log_sum = 0.0
    n_scored = 0
    # Stepping to len - 2 drops an incomplete trailing codon.
    for start in range(0, len(rna) - 2, 3):
        codon = rna[start:start + 3]
        if codon in stop_codons:
            continue
        weight = weights.get(codon)
        if weight is None:
            continue
        log_sum += log(weight)
        n_scored += 1

    if n_scored == 0:
        return None
    return exp(log_sum / n_scored)


# Score the CDS of every RNA-level row and write the table, now carrying a
# CAI column, back over the CSV it was loaded from.
rna_seq_df["CAI"] = rna_seq_df["CDS"].map(compute_cai)
rna_seq_df.to_csv(path_or_buf="data/RNA_Proteomics_RNAlevel.csv", index=False)

### 2. CAI for the proteomics level file

In [None]:
# Same treatment for the protein-level table: add the CAI column and
# overwrite the CSV it was loaded from.
prot_seq_df["CAI"] = prot_seq_df["CDS"].map(compute_cai)
prot_seq_df.to_csv(path_or_buf="data/RNA_Proteomics_Proteinlevel.csv", index=False)

### 3. CAI for all proteomics

In [None]:
def _record_label(fasta_id):
    """Return the second '|'-separated field of a FASTA id, or the id itself if it has no '|'."""
    return fasta_id.split("|")[1] if "|" in fasta_id else fasta_id


# One row per FASTA record: its label and the CAI of its full sequence.
results = [
    {"NM_ID": _record_label(rec.id), "CAI": compute_cai(str(rec.seq))}
    for rec in SeqIO.parse("sequence.txt", "fasta")
]

cai_df = pd.DataFrame(results)
cai_df.to_csv("data/sequence_CAI.csv", index=False)

### 4. CAI for codon97

In [None]:
# Add a CAI column to the filtered table (the "codon97" set named in the
# section heading) and save it back to the same file.
codon97_path = "data/RNA_Proteomics_Filtered.csv"

rna_df = pd.read_csv(codon97_path)
rna_df["CAI"] = rna_df["CDS"].map(compute_cai)
rna_df.to_csv(codon97_path, index=False)

### 5. Distribution for log2FC>1

In [None]:
rnalevel_file = "data/RNA_Proteomics_RNAlevel.csv"
# NOTE(review): '>' is not a legal file-name character on Windows; the name is
# kept because the plotting cell reads this exact path.
out_file = "data/RNA_Proteomics_RNAlevel_LogFC>1.csv"

RNA_file = pd.read_csv(rnalevel_file)

# Keep rows with log2FoldChange above 1 AND adjusted p-value below 0.05.
# Rows where either value is missing compare as False and are dropped.
fold_change_ok = RNA_file["log2FoldChange"].gt(1)
padj_ok = RNA_file["padj"].lt(0.05)
filtered = RNA_file.loc[fold_change_ok & padj_ok]

filtered.to_csv(out_file, index=False)

In [None]:
protlevel_file = "data/RNA_Proteomics_Proteinlevel.csv"
# NOTE(review): '>' is not a legal file-name character on Windows; the name is
# kept because the plotting cell reads this exact path.
out_file = "data/RNA_Proteomics_Protlevel_LogFC>1.csv"

prot_file = pd.read_csv(protlevel_file)

# Keep rows with Log2FC above 1 AND Welch q-value below 0.05.
# Rows where either value is missing compare as False and are dropped.
fold_change_ok = prot_file["Log2FC"].gt(1)
qvalue_ok = prot_file["Welch's T-test q-value ZFC3H1 KO"].lt(0.05)
filtered = prot_file.loc[fold_change_ok & qvalue_ok]

filtered.to_csv(out_file, index=False)

### 6. Violin Plot for CAI distribution

#### Bootstrap for the mean value (compare with all_prot)

In [None]:
import numpy as np

def bootstrap_pvalue(test_vals, all_vals, n_iter=1000, seed=123):
    """Resampling test: is the mean of ``test_vals`` unusual for a random subset of ``all_vals``?

    ``n_iter`` random subsets of size ``len(test_vals)`` are drawn from
    ``all_vals`` *without replacement*; the two-sided p-value is the share of
    subsets whose mean lies at least as far from ``mean(all_vals)`` as the
    observed group mean does.

    Parameters
    ----------
    test_vals : sequence of float
        Values of the group being tested.
    all_vals : sequence of float
        Reference population to draw random subsets from.
    n_iter : int
        Number of random subsets.
    seed : int
        Seed for the private random generator (results are reproducible and
        the global ``np.random`` state is left untouched).

    Returns
    -------
    tuple
        ``(mean of test_vals, mean of all_vals, p-value)``. The p-value is
        NaN when ``test_vals`` is empty.

    Raises
    ------
    ValueError
        If ``test_vals`` is larger than ``all_vals`` (a subset of that size
        cannot be drawn without replacement).
    """
    test_vals = np.asarray(test_vals, dtype=float)
    all_vals = np.asarray(all_vals, dtype=float)
    n = test_vals.size

    all_mean = np.mean(all_vals) if all_vals.size else float("nan")
    if n == 0:
        # Nothing to test; NaN comparisons would otherwise produce p = 0.
        return float("nan"), all_mean, float("nan")
    if n > all_vals.size:
        raise ValueError(
            f"test group ({n} values) is larger than the reference population "
            f"({all_vals.size} values); cannot sample without replacement"
        )

    test_mean = np.mean(test_vals)
    observed_gap = np.abs(test_mean - all_mean)

    # A private RandomState seeded with `seed` yields the same draws as
    # np.random.seed(seed) + np.random.choice, without resetting the global
    # generator that other cells may rely on.
    rng = np.random.RandomState(seed)
    boot_means = np.array(
        [np.mean(rng.choice(all_vals, size=n, replace=False)) for _ in range(n_iter)]
    )

    n_extreme = int(np.count_nonzero(np.abs(boot_means - all_mean) >= observed_gap))
    # Count the observed statistic as one member of the null set so that a
    # finite number of resamples can never report p = 0.
    pval = (n_extreme + 1) / (n_iter + 1)

    return test_mean, all_mean, pval

# Read each group's CAI values from the CSVs written by the earlier sections.
# The group lists used to come from variables that are only created in the
# plotting cell further down, so this cell failed with NameError on a fresh
# kernel (Restart & Run All).
def _read_cai(csv_path):
    """Return the non-missing values of the CAI column of a CSV as a list."""
    return pd.read_csv(csv_path)["CAI"].dropna().tolist()


# Labels are spelled exactly as in the violin plot's `order` / `p_values`
# so the printed table can be matched to the figure.
cai_sources = {
    "Codon97": "data/RNA_Proteomics_Filtered.csv",
    "Proteomics_Log2FC>0": "data/RNA_Proteomics_Proteinlevel.csv",
    "RNAseq_Log2FC>0": "data/RNA_Proteomics_RNAlevel.csv",
    "RNAseq_Log2fc>1": "data/RNA_Proteomics_RNAlevel_LogFC>1.csv",
    "Prot_Log2fc>1": "data/RNA_Proteomics_Protlevel_LogFC>1.csv",
}

# Reference population every group is compared against.
all_cai = _read_cai("data/sequence_CAI.csv")

results = {
    label: bootstrap_pvalue(_read_cai(csv_path), all_cai, n_iter=1000)
    for label, csv_path in cai_sources.items()
}

# One row per group; columns follow the order of bootstrap_pvalue's return tuple.
res_df = pd.DataFrame(results, index=["Mean_Group","Mean_Allprot","p-value"]).T
print(res_df)


#### Violin Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Files: read the six tables in one pass (same order as the names on the left).
codon97_df, protein_df, rna_df, all_df, rna_log2FC1_df, prot_log2FC1_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/RNA_Proteomics_Filtered.csv",
        "data/RNA_Proteomics_Proteinlevel.csv",
        "data/RNA_Proteomics_RNAlevel.csv",
        "data/sequence_CAI.csv",
        "data/RNA_Proteomics_RNAlevel_LogFC>1.csv",
        "data/RNA_Proteomics_Protlevel_LogFC>1.csv",
    )
)


def _cai_values(frame):
    """Return the non-missing entries of a frame's CAI column as a plain list."""
    return frame["CAI"].dropna().tolist()


# Extract the CAI column of each source
codon97_cai = _cai_values(codon97_df)
protein_cai = _cai_values(protein_df)
rna_cai = _cai_values(rna_df)
all_cai = _cai_values(all_df)
rna_log2FC1_cai = _cai_values(rna_log2FC1_df)
prot_log2FC1_cai = _cai_values(prot_log2FC1_df)

# Long-format frame: one row per CAI value, tagged with the source it came from.
labelled_groups = [
    ("Codon97", codon97_cai),
    ("Proteomics_Log2FC>0", protein_cai),
    ("RNAseq_Log2FC>0", rna_cai),
    ("Allprot_all_Log2FC", all_cai),
    ("RNAseq_Log2fc>1", rna_log2FC1_cai),
    ("Prot_Log2fc>1", prot_log2FC1_cai),
]
all_data = pd.DataFrame({
    "CAI": [value for _, values in labelled_groups for value in values],
    "Source": [label for label, values in labelled_groups for _ in values],
})

# Left-to-right order on the x axis; the reference population goes last.
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "RNAseq_Log2fc>1", "Prot_Log2fc>1", "Allprot_all_Log2FC"]
counts = all_data.groupby("Source").size()

# Draw the figure in layers on one axes: violins, raw points, slim boxes, means.
plt.figure(figsize=(12,6))

# Arguments every seaborn layer has in common.
layer_args = dict(x="Source", y="CAI", data=all_data, order=order)

# Layer 1: violins (no inner marks; the box layer supplies them)
ax = sns.violinplot(inner=None, palette="Set3", linewidth=1.2, **layer_args)

# Layer 2: jittered individual points
sns.stripplot(color="black", alpha=0.2, jitter=0.25, size=1.5, ax=ax, **layer_args)

# Layer 3: narrow grey box plot without outlier markers
sns.boxplot(
    width=0.15,
    showcaps=True,
    showfliers=False,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    ax=ax,
    **layer_args,
)

# Layer 4: a white dot at each group's mean
group_means = all_data.groupby("Source")["CAI"].mean()
for position, src in enumerate(order):
    mean_label = "Mean" if position == 0 else ""
    ax.scatter(position, group_means[src], color="white", zorder=3, s=10, label=mean_label)

# Add p-values: one bracket from each group to the reference group, which is
# the last entry of `order`.
# NOTE(review): these numbers are hard-coded, presumably copied by hand from
# the table printed by the bootstrap cell -- update them whenever that cell
# is re-run with different data, seed or n_iter.
p_values = {
    "Codon97": 0.006,
    "Proteomics_Log2FC>0": 0.000,
    "RNAseq_Log2FC>0": 0.704,
    "RNAseq_Log2fc>1": 0.121,
    "Prot_Log2fc>1": 0.970
}
y_max = all_data["CAI"].max()
y_offset = 0.03  # vertical spacing between stacked brackets
reference_x = len(order) - 1
for i, src in enumerate(order[:-1]):
    pval = p_values.get(src, None)
    if pval is None:
        continue
    # Each bracket sits one step higher than the previous one so they do not overlap.
    y = y_max + (i+1)*y_offset
    ax.plot([i, i, reference_x, reference_x], [y, y+0.005, y+0.005, y], lw=1.2, c='k')
    # A resampled p-value that rounds to 0.000 is only an upper bound, not an
    # exact zero, so report it as an inequality.
    p_text = "p < 0.001" if pval < 0.0005 else f"p = {pval:.3f}"
    ax.text((i+reference_x)/2, y+0.01, p_text, ha='center', va='bottom', fontsize=10, color="blue")

# X ticks: group name plus the number of CAI values in that group.
tick_labels = [f"{src}\n(n={counts[src]})" for src in order]
ax.set_xticks(range(len(order)))
ax.set_xticklabels(tick_labels)

# Y range: ticks every 0.05 from 0.6 to 1.0; limits padded below the data
# and (more generously) above it to leave room for the p-value brackets.
ax.set_yticks(np.arange(0.6, 1.01, 0.05))
cai_low, cai_high = all_data["CAI"].min(), all_data["CAI"].max()
ax.set_ylim(cai_low - 0.1, cai_high + 0.2)

ax.set_title("Distribution of CAI Across Sources", fontsize=14)
ax.set_ylabel("CAI", fontsize=12)
ax.set_xlabel("")
ax.grid(axis="y", linestyle="--", alpha=0.6)

plt.tight_layout()

# Save to svg format
import matplotlib as mpl
from pathlib import Path

# svg.fonttype = 'none': per matplotlib's rcParams documentation, text is then
# written as SVG text instead of being converted to outlines.
mpl.rcParams.update({'svg.fonttype': 'none'})

# Create the output folder if needed, then export the current figure.
outdir = Path("data/figs")
outdir.mkdir(parents=True, exist_ok=True)
svg_path = outdir / "CAI_violin.svg"
plt.savefig(svg_path, format="svg", bbox_inches="tight", facecolor="white")

plt.show()
