### Extract the 5' and 3' ends of each CDS

In [None]:
import pandas as pd

# Load the three input tables (filtered set, RNA-level, protein-level).
codon97_df, rna_df, protein_df = map(
    pd.read_csv,
    (
        "data/RNA_Proteomics_Filtered.csv",
        "data/RNA_Proteomics_RNAlevel.csv",
        "data/RNA_Proteomics_Proteinlevel.csv",
    ),
)

In [None]:
def extract_subseq(seq, end="5", length=150):
    """Return up to ``length`` bases from one end of a sequence.

    The sequence is converted to ``str``, stripped of *all* whitespace
    (spaces, tabs, CR/LF) and upper-cased before it is sliced.

    Args:
        seq: The sequence. A missing value (``None`` / NaN) yields ``""``.
        end: ``"5"`` to take the first ``length`` bases, ``"3"`` to take the
            last ``length`` bases.
        length: Maximum number of bases to return. A sequence shorter than
            ``length`` is returned whole; ``0`` yields ``""``.

    Returns:
        The requested sub-sequence as an upper-case string.

    Raises:
        ValueError: If ``end`` is not ``"5"`` or ``"3"``, or ``length`` is
            negative.
    """
    # Validate arguments up front so a bad call fails even for missing rows.
    if end not in ("5", "3"):
        raise ValueError("end must be '5' or '3'")
    if length < 0:
        raise ValueError("length must be non-negative")

    if pd.isna(seq):   # CDS is missing
        return ""

    cleaned = "".join(str(seq).split()).upper()

    # ``cleaned[-0:]`` would be the whole string, so handle zero explicitly.
    if length == 0:
        return ""

    # Slicing clamps automatically when the sequence is shorter than length.
    return cleaned[:length] if end == "5" else cleaned[-length:]

In [None]:
# Annotate every input table with the first and last 150 nt of its CDS and
# save the result under data/.
def _with_cds_ends(src_path, dst_path):
    """Read ``src_path``, add ``CDS_5end``/``CDS_3end``, write ``dst_path``."""
    table = pd.read_csv(src_path)
    cds = table["CDS"]
    table["CDS_5end"] = cds.apply(lambda x: extract_subseq(x, "5", 150))
    table["CDS_3end"] = cds.apply(lambda x: extract_subseq(x, "3", 150))
    table.to_csv(dst_path, index=False)
    return table


# === Codon97 ===
codon97_df = _with_cds_ends(
    "data/RNA_Proteomics_Filtered.csv", "data/codon97_with_ends.csv"
)

# === RNA-seq targets ===
rna_df = _with_cds_ends(
    "data/RNA_Proteomics_RNAlevel.csv", "data/rna_with_ends.csv"
)

# === Proteomics targets ===
protein_df = _with_cds_ends(
    "data/RNA_Proteomics_Proteinlevel.csv", "data/protein_with_ends.csv"
)

#### Extract the 5' and 3' CDS ends for all proteins (all_prot)

In [None]:
from Bio import SeqIO
import pandas as pd

# Build one row per FASTA record: accession, full CDS and its two 150-nt ends.
records = []
for record in SeqIO.parse("data/sequence.txt", "fasta"):
    cds = str(record.seq).upper()

    # Accession = text before the first "_cds", after the last "|" in that prefix.
    accession = record.description.partition("_cds")[0].rpartition("|")[2]

    records.append(
        {
            "NM_ID": accession,
            "CDS": cds,
            "CDS_5end": extract_subseq(cds, "5", 150),
            "CDS_3end": extract_subseq(cds, "3", 150),
        }
    )

# Persist as CSV for the downstream cells.
allprot_df = pd.DataFrame(records)
allprot_df.to_csv("data/allprot_with_ends.csv", index=False)


#### Compute CAI for each file

In [None]:
import pandas as pd
from math import log, exp

# Kazusa codon usage, keyed by RNA codon.
# NOTE(review): units and organism are not stated in this file — confirm
# against the table the numbers were copied from.
kazusa_freq = {
    "UUU": 17.6, "UUC": 20.3, "UUA":  7.7, "UUG": 12.9,
    "CUU": 13.2, "CUC": 19.6, "CUA":  7.2, "CUG": 39.6,
    "AUU": 16.0, "AUC": 20.8, "AUA":  7.5, "AUG": 22.0,
    "GUU": 11.0, "GUC": 14.5, "GUA":  7.1, "GUG": 28.1,
    "UCU": 15.2, "UCC": 17.7, "UCA": 12.2, "UCG":  4.4,
    "CCU": 17.5, "CCC": 19.8, "CCA": 16.9, "CCG":  6.9,
    "ACU": 13.1, "ACC": 18.9, "ACA": 15.1, "ACG":  6.1,
    "GCU": 18.4, "GCC": 27.7, "GCA": 15.8, "GCG":  7.4,
    "UAU": 12.2, "UAC": 15.3, "UAA":  1.0, "UAG":  0.8,
    "CAU": 10.9, "CAC": 15.1, "CAA": 12.3, "CAG": 34.2,
    "AAU": 17.0, "AAC": 19.1, "AAA": 24.4, "AAG": 31.9,
    "GAU": 21.8, "GAC": 25.1, "GAA": 29.0, "GAG": 39.6,
    "UGU": 10.6, "UGC": 12.6, "UGA":  1.6, "UGG": 13.2,
    "CGU":  4.5, "CGC": 10.4, "CGA":  6.2, "CGG": 11.4,
    "AGU": 12.1, "AGC": 19.5, "AGA": 12.2, "AGG": 12.0,
    "GGU": 10.8, "GGC": 22.2, "GGA": 16.5, "GGG": 16.5,
}

# Synonymous codons per one-letter amino acid; the stop codons
# (UAA, UAG, UGA) are not listed.
aa_table = {
    "F": ["UUU", "UUC"],
    "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"],
    "I": ["AUU", "AUC", "AUA"],
    "M": ["AUG"],
    "V": ["GUU", "GUC", "GUA", "GUG"],
    "S": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
    "P": ["CCU", "CCC", "CCA", "CCG"],
    "T": ["ACU", "ACC", "ACA", "ACG"],
    "A": ["GCU", "GCC", "GCA", "GCG"],
    "Y": ["UAU", "UAC"],
    "H": ["CAU", "CAC"],
    "Q": ["CAA", "CAG"],
    "N": ["AAU", "AAC"],
    "K": ["AAA", "AAG"],
    "D": ["GAU", "GAC"],
    "E": ["GAA", "GAG"],
    "C": ["UGU", "UGC"],
    "W": ["UGG"],
    "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "G": ["GGU", "GGC", "GGA", "GGG"],
}

# Codon weights: within each amino acid, a codon's relative synonymous usage
# divided by the largest such value, so the most-used synonym gets 1.0.
codon_weights = {}
for residue, synonyms in aa_table.items():
    listed = [codon for codon in synonyms if codon in kazusa_freq]
    mean_freq = sum(kazusa_freq[codon] for codon in listed) / len(synonyms)
    rscu = {codon: kazusa_freq[codon] / mean_freq for codon in listed}
    top_rscu = max(rscu.values())
    codon_weights.update(
        (codon, value / top_rscu) for codon, value in rscu.items()
    )

# Compute CAI
def compute_cai(seq, weights=None):
    """Return the Codon Adaptation Index of ``seq``.

    The CAI is the geometric mean of the per-codon weights, read in frame
    from the first base. Stop codons (UAA/UAG/UGA), codons absent from the
    weight table (e.g. containing ``N``) and a trailing partial codon are
    ignored.

    Args:
        seq: DNA or RNA sequence; ``T`` is treated as ``U``. Case and
            embedded whitespace are ignored. Non-string values are
            converted with ``str``.
        weights: Optional mapping of RNA codon -> weight in ``(0, 1]``.
            Defaults to the module-level ``codon_weights`` table.

    Returns:
        The CAI as a float, or ``None`` when ``seq`` is missing/empty or
        contains no scorable codon.
    """
    if weights is None:
        weights = codon_weights
    if pd.isna(seq) or not seq:
        return None

    # Drop whitespace first so stray spaces/newlines cannot shift the frame.
    rna = "".join(str(seq).split()).upper().replace("T", "U")
    stop_codons = {"UAA", "UAG", "UGA"}

    # Only iterate over complete triplets.
    in_frame_len = len(rna) - len(rna) % 3
    log_weights = []
    for start in range(0, in_frame_len, 3):
        codon = rna[start:start + 3]
        if codon in stop_codons or codon not in weights:
            continue
        log_weights.append(log(weights[codon]))

    if not log_weights:
        return None
    return exp(sum(log_weights) / len(log_weights))

# Apply to each file: score both CDS ends and overwrite the CSV in place.
def _append_cai_columns(csv_path):
    """Reload ``csv_path``, add ``CAI_5end``/``CAI_3end``, and save it back."""
    table = pd.read_csv(csv_path)
    for seq_col, cai_col in (("CDS_5end", "CAI_5end"), ("CDS_3end", "CAI_3end")):
        table[cai_col] = table[seq_col].apply(compute_cai)
    table.to_csv(csv_path, index=False)
    return table


# Codon97
codon97_df = _append_cai_columns("data/codon97_with_ends.csv")

# RNA-seq
rna_df = _append_cai_columns("data/rna_with_ends.csv")

# Proteomics
protein_df = _append_cai_columns("data/protein_with_ends.csv")

# All prot (from FASTA)
allprot_df = _append_cai_columns("data/allprot_with_ends.csv")

#### Plot for 5' 

In [None]:
# Reload the four annotated tables from disk.
codon97_df, rna_df, protein_df, allprot_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/codon97_with_ends.csv",
        "data/rna_with_ends.csv",
        "data/protein_with_ends.csv",
        "data/allprot_with_ends.csv",
    )
]

#### Bootstrap function

In [None]:
import numpy as np
import pandas as pd

def bootstrap_pvalue(test_vals, all_vals, n_iter=1000, seed=123):
    """Test whether the mean of ``test_vals`` is unusual for ``all_vals``.

    Draws ``n_iter`` random subsets of ``all_vals`` (without replacement),
    each the same size as ``test_vals``, and reports the fraction of subset
    means that lie at least as far from the mean of ``all_vals`` as the mean
    of ``test_vals`` does (two-sided).

    Args:
        test_vals: 1-D values of the group under test.
        all_vals: 1-D reference values to resample from.
        n_iter: Number of random subsets to draw.
        seed: Seed for the private random generator; the same seed gives the
            same draws as seeding NumPy's global generator with it.

    Returns:
        ``(mean of test_vals, mean of all_vals, p-value)``. When
        ``test_vals`` is empty, the first and last entries are NaN.

    Raises:
        ValueError: If ``test_vals`` has more elements than ``all_vals``
            (a subset of that size cannot be drawn without replacement).

    NOTE(review): the p-value can be exactly 0; consider reporting
    ``(k + 1) / (n_iter + 1)`` if a strictly positive estimate is needed.
    """
    test_vals = np.asarray(test_vals, dtype=float)
    all_vals = np.asarray(all_vals, dtype=float)
    n = test_vals.size
    ref_mean = all_vals.mean() if all_vals.size else np.nan

    # An empty group has no mean; NaN comparisons would otherwise give p = 0.
    if n == 0:
        return np.nan, ref_mean, np.nan
    if n > all_vals.size:
        raise ValueError(
            "test_vals has more elements than all_vals; cannot sample "
            "without replacement"
        )

    # Private generator: reproducible without reseeding the global NumPy RNG.
    rng = np.random.RandomState(seed)
    test_mean = test_vals.mean()

    boot_means = np.empty(n_iter)
    for i in range(n_iter):
        boot_means[i] = rng.choice(all_vals, size=n, replace=False).mean()

    pval = np.mean(np.abs(boot_means - ref_mean) >=
                   np.abs(test_mean - ref_mean))

    return test_mean, ref_mean, pval

# 5' CAI values per table, with missing scores dropped; the all-protein
# table serves as the reference population.
codon97_cai, rna_cai, protein_cai, all_cai = (
    frame["CAI_5end"].dropna().values
    for frame in (codon97_df, rna_df, protein_df, allprot_df)
)

# Compare each target group's mean against the reference.
results_mean = {
    label: bootstrap_pvalue(values, all_cai, n_iter=1000)
    for label, values in (
        ("Codon97", codon97_cai),
        ("RNAseq", rna_cai),
        ("Proteomics", protein_cai),
    )
}

# One row per group, one column per returned statistic.
res_df_mean = pd.DataFrame(
    results_mean,
    index=["Mean_Group", "Mean_Allprot", "p-value"],
).T
print("=== Mean comparison to AllProt ===")
print(res_df_mean)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import matplotlib as mpl

# Load the dataset
codon97_df, rna_df, protein_df, allprot_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/codon97_with_ends.csv",
        "data/rna_with_ends.csv",
        "data/protein_with_ends.csv",
        "data/allprot_with_ends.csv",
    )
]

# Combine data: one long table holding every 5' CAI value and the label of
# the table it came from.
labelled_frames = [
    ("Codon97", codon97_df),
    ("RNAseq_Log2FC>0", rna_df),
    ("Proteomics_Log2FC>0", protein_df),
    ("All_otherprot_all_Log2FC", allprot_df),
]
all_data_5 = pd.DataFrame({
    "CAI_5end": pd.concat(
        [frame["CAI_5end"] for _, frame in labelled_frames],
        ignore_index=True,
    ),
    "Source": [
        label for label, frame in labelled_frames for _ in range(len(frame))
    ],
})

# Left-to-right order of the groups on the x-axis, and the fill colour of each.
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
custom_palette = dict(zip(order, ("#8DD3C7", "#f6f6bc", "#c2bed6", "#eab375")))

# Rows per group (used for the "n=" tick labels).
counts = all_data_5.groupby("Source").size()

# ===== Make violin plot =====
# Builds the 5' CAI figure: violins per group, overlaid with the raw points,
# a narrow box plot, the group mean, and a p-value bracket from each target
# group to the last group in `order`; the result is saved as SVG and shown.
plt.figure(figsize=(10,6))

# Violin outlines only (inner=None); the box plot is drawn separately below.
ax = sns.violinplot(
    x="Source", y="CAI_5end", data=all_data_5,
    inner=None, palette=custom_palette, order=order, linewidth=1.2
)

# Jittered stripplot
# (no `ax` is passed, so seaborn draws onto the current axes)
sns.stripplot(
    x="Source", y="CAI_5end", data=all_data_5,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order
)

# Boxplot inside violin
sns.boxplot(
    x="Source", y="CAI_5end", data=all_data_5,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False
)

# Add group means (white dot)
# Category i in `order` sits at x = i, so the loop index doubles as x position.
group_means = all_data_5.groupby("Source")["CAI_5end"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]+0.01, f"mean = {group_means[src]:.4f}", 
            ha='center', va='bottom', fontsize=10, color="blue")

# p-values
# NOTE(review): these are hard-coded and will not change when the data or the
# resampling settings change — presumably copied from the printed
# bootstrap_pvalue results; confirm they are still current before publishing.
p_values = {
    "Codon97": 0.982,
    "RNAseq_Log2FC>0": 0.435,
    "Proteomics_Log2FC>0": 0.001
}

# Brackets are stacked above the highest data point.
y_max = all_data_5["CAI_5end"].max()

# distance between each p-value lines
y_offset = 0.04
for i, src in enumerate(order[:-1]):  
    x1, x2 = i, len(order)-1  # compare each to AllProt
    pval = p_values.get(src, None)
    if pval is None:
        # no p-value recorded for this group: draw no bracket
        continue
    # each successive group gets its bracket one y_offset higher
    y = y_max + (i+1)*y_offset
    plt.plot([x1, x1, x2, x2], [y, y+0.005, y+0.005, y], lw=1.2, c='k')
    plt.text((x1+x2)/2, y+0.008, f"p = {pval:.3f}", ha='center', va='bottom', fontsize=10, color="blue")

# Add counts to x-axis
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)

# Y axis settings
# Ticks are fixed at 0.05 steps starting from 0.5; the limits leave headroom
# above the data for the p-value brackets.
plt.yticks(np.arange(0.5, 1.05, 0.05))
plt.ylim(all_data_5["CAI_5end"].min()-0.1, all_data_5["CAI_5end"].max()+0.2)
plt.title("Distribution of 5′ CAI Across Sources", fontsize=14)
plt.ylabel("CAI (5′ End)", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)

plt.tight_layout()

# (both names are already imported at the top of this cell; harmless repeat)
import matplotlib as mpl
from pathlib import Path

# keep text selectable in SVG (not converted to paths)
mpl.rcParams['svg.fonttype'] = 'none'

# make sure the folder exists
outdir = Path("data/figs")
outdir.mkdir(parents=True, exist_ok=True)

# save as SVG (vector)
plt.savefig(outdir / "CAI_5'_Dist.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()


#### Plot for 3'

In [None]:
# 3' CAI values per table, with missing scores dropped; the all-protein
# table serves as the reference population.
codon97_cai, rna_cai, protein_cai, all_cai = (
    frame["CAI_3end"].dropna().values
    for frame in (codon97_df, rna_df, protein_df, allprot_df)
)

# Compare each target group's mean against the reference.
results_mean = {
    label: bootstrap_pvalue(values, all_cai, n_iter=1000)
    for label, values in (
        ("Codon97", codon97_cai),
        ("RNAseq", rna_cai),
        ("Proteomics", protein_cai),
    )
}

# One row per group, one column per returned statistic.
res_df_mean = pd.DataFrame(
    results_mean,
    index=["Mean_Group", "Mean_Allprot", "p-value"],
).T
print("=== Mean comparison to AllProt ===")
print(res_df_mean)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import matplotlib as mpl

# ===== Load four datasets with CAI_3end =====
codon97_df, rna_df, protein_df, allprot_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/codon97_with_ends.csv",
        "data/rna_with_ends.csv",
        "data/protein_with_ends.csv",
        "data/allprot_with_ends.csv",
    )
]

# ===== Combine data =====
# One long table holding every 3' CAI value and the label of the table it
# came from. The variable keeps the name `all_data_5` even though it holds
# 3' data, because the plotting code below reads it under that name.
labelled_frames = [
    ("Codon97", codon97_df),
    ("RNAseq_Log2FC>0", rna_df),
    ("Proteomics_Log2FC>0", protein_df),
    ("All_otherprot_all_Log2FC", allprot_df),
]
all_data_5 = pd.DataFrame({
    "CAI_3end": pd.concat(
        [frame["CAI_3end"] for _, frame in labelled_frames],
        ignore_index=True,
    ),
    "Source": [
        label for label, frame in labelled_frames for _ in range(len(frame))
    ],
})

# Left-to-right order of the groups on the x-axis, and the fill colour of each.
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
custom_palette = dict(zip(order, ("#8DD3C7", "#f6f6bc", "#c2bed6", "#eab375")))

# Rows per group (used for the "n=" tick labels).
counts = all_data_5.groupby("Source").size()

# ===== Make violin plot =====
# Builds the 3' CAI figure: violins per group, overlaid with the raw points,
# a narrow box plot, the group mean, and a p-value bracket from each target
# group to the last group in `order`; the result is saved as SVG and shown.
# Note: `all_data_5` holds the CAI_3end values in this cell despite its name.
plt.figure(figsize=(10,6))

# Violin outlines only (inner=None); the box plot is drawn separately below.
ax = sns.violinplot(
    x="Source", y="CAI_3end", data=all_data_5,
    inner=None, palette=custom_palette, order=order, linewidth=1.2
)

# Jittered stripplot
# (no `ax` is passed, so seaborn draws onto the current axes)
sns.stripplot(
    x="Source", y="CAI_3end", data=all_data_5,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order
)

# Boxplot inside violin
sns.boxplot(
    x="Source", y="CAI_3end", data=all_data_5,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False
)

# Add group means (white dot)
# Category i in `order` sits at x = i, so the loop index doubles as x position.
group_means = all_data_5.groupby("Source")["CAI_3end"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]+0.01, f"mean = {group_means[src]:.4f}", 
            ha='center', va='bottom', fontsize=10, color="blue")

# p-values
# NOTE(review): these are hard-coded and will not change when the data or the
# resampling settings change — presumably copied from the printed
# bootstrap_pvalue results; confirm they are still current before publishing.
p_values = {
    "Codon97": 0.008,
    "RNAseq_Log2FC>0": 0.135,
    "Proteomics_Log2FC>0": 0.008
}

# Brackets are stacked above the highest data point.
y_max = all_data_5["CAI_3end"].max()

# distance between each p-value lines
y_offset = 0.04
for i, src in enumerate(order[:-1]):  
    x1, x2 = i, len(order)-1  # compare each to AllProt
    pval = p_values.get(src, None)
    if pval is None:
        # no p-value recorded for this group: draw no bracket
        continue
    # each successive group gets its bracket one y_offset higher
    y = y_max + (i+1)*y_offset
    plt.plot([x1, x1, x2, x2], [y, y+0.005, y+0.005, y], lw=1.2, c='k')
    plt.text((x1+x2)/2, y+0.008, f"p = {pval:.3f}", ha='center', va='bottom', fontsize=10, color="blue")

# Add counts to x-axis
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)

# Y axis settings
# Ticks are fixed at 0.05 steps starting from 0.5; the limits leave headroom
# above the data for the p-value brackets.
plt.yticks(np.arange(0.5, 1.05, 0.05))
plt.ylim(all_data_5["CAI_3end"].min()-0.1, all_data_5["CAI_3end"].max()+0.2)
plt.title("Distribution of 3' CAI Across Sources", fontsize=14)
plt.ylabel("CAI (3' End)", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)

plt.tight_layout()

# (both names are already imported at the top of this cell; harmless repeat)
import matplotlib as mpl
from pathlib import Path

# keep text selectable in SVG (not converted to paths)
mpl.rcParams['svg.fonttype'] = 'none'

# make sure the folder exists
outdir = Path("data/figs")
outdir.mkdir(parents=True, exist_ok=True)

# save as SVG (vector)
plt.savefig(outdir / "CAI_3'_Dist.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()
