#### GC content at 5'

In [None]:
import pandas as pd

# Load the four input tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
codon97_df, rna_df, protein_df, allprot_df = map(
    pd.read_csv,
    (
        "data/codon97_with_ends.csv",
        "data/rna_with_ends.csv",
        "data/protein_with_ends.csv",
        "data/allprot_with_ends.csv",
    ),
)

# GC-content helper
def gc_content(seq):
    """Return the GC fraction of a nucleotide sequence.

    The value is converted to ``str`` and upper-cased before counting. Only
    A, T, G and C contribute to the denominator, so any other symbol is
    ignored. Returns ``None`` when ``seq`` is missing (per ``pd.isna``) or
    when it contains no A/T/G/C at all.
    """
    if pd.isna(seq):
        return None

    bases = str(seq).upper()
    per_base = {base: bases.count(base) for base in "ATGC"}
    total = sum(per_base.values())
    if total <= 0:
        return None
    return (per_base["G"] + per_base["C"]) / total

# Annotate every table with the GC fraction of its 5' end sequence
for _frame in (codon97_df, rna_df, protein_df, allprot_df):
    _frame["GC_5end"] = _frame["CDS_5end"].apply(gc_content)

# Persist the new column by overwriting the CSVs the tables were loaded from
for _frame, _path in (
    (codon97_df, "data/codon97_with_ends.csv"),
    (rna_df, "data/rna_with_ends.csv"),
    (protein_df, "data/protein_with_ends.csv"),
    (allprot_df, "data/allprot_with_ends.csv"),
):
    _frame.to_csv(_path, index=False)

#### Bootstrap test

In [None]:
import numpy as np
import pandas as pd


def bootstrap_pvalue_mean(test_vals, all_vals, n_iter=1000, seed=123):
    """Resampling test: is the mean of ``test_vals`` unusual for ``all_vals``?

    ``n_iter`` random subsets of ``all_vals``, each the same size as
    ``test_vals``, are drawn WITHOUT replacement (so this is a subsampling
    test rather than a classical bootstrap). The two-sided p-value is the
    fraction of subsets whose mean lies at least as far from
    ``mean(all_vals)`` as the observed group mean does. It can be exactly 0
    when no subset is that extreme.

    Parameters
    ----------
    test_vals : sequence of float
        Values of the group under test.
    all_vals : sequence of float
        Reference population the random subsets are drawn from.
    n_iter : int, default 1000
        Number of random subsets.
    seed : int or None, default 123
        Seed for a private random generator; the global NumPy random state
        is left untouched.

    Returns
    -------
    tuple
        ``(mean of test_vals, mean of all_vals, p-value)``. For an empty
        ``test_vals`` the group mean and the p-value are NaN.

    Raises
    ------
    ValueError
        Raised by NumPy when ``test_vals`` is longer than ``all_vals``
        (a larger sample cannot be drawn without replacement).
    """
    all_mean = np.mean(all_vals)
    n = len(test_vals)

    # An empty group has no mean; comparing against NaN would otherwise make
    # every comparison False and report a misleading p-value of 0.
    if n == 0:
        return float("nan"), all_mean, float("nan")

    test_mean = np.mean(test_vals)

    # A private legacy generator yields the same draws as seeding the global
    # one with np.random.seed(seed), without the global side effect.
    rng = np.random.RandomState(seed)
    boot_means = np.array(
        [np.mean(rng.choice(all_vals, size=n, replace=False)) for _ in range(n_iter)]
    )

    observed_dev = np.abs(test_mean - all_mean)
    pval = np.mean(np.abs(boot_means - all_mean) >= observed_dev)
    return test_mean, all_mean, pval

# Non-missing 5' GC fractions per table; allprot_df serves as the reference pool
codon97_gc, rna_gc, protein_gc, all_gc = (
    table["GC_5end"].dropna().values
    for table in (codon97_df, rna_df, protein_df, allprot_df)
)

# Test each group against the reference pool
results_mean = {}
for label, values in zip(
    ("Codon97", "RNAseq", "Proteomics"),
    (codon97_gc, rna_gc, protein_gc),
):
    results_mean[label] = bootstrap_pvalue_mean(values, all_gc, n_iter=1000)

# One row per group, one column per returned statistic
res_df_mean = pd.DataFrame(results_mean, index=["Mean_Group","Mean_AllProt","p-value"]).T
print("GC 5'end mean")
print(res_df_mean)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Long-format table: every row of the four input tables, tagged with its group
gc_data_5 = pd.DataFrame({
    "GC_5end": pd.concat([
        codon97_df["GC_5end"],
        rna_df["GC_5end"],
        protein_df["GC_5end"],
        allprot_df["GC_5end"]
    ], ignore_index=True),
    "Source": (
        ["Codon97"] * len(codon97_df) +
        ["RNAseq_Log2FC>0"] * len(rna_df) +
        ["Proteomics_Log2FC>0"] * len(protein_df) +
        ["All_otherprot_all_Log2FC"] * len(allprot_df)
    )
})

# Left-to-right order on the x axis; the reference group goes last
order = ["Codon97","Proteomics_Log2FC>0","RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
counts = gc_data_5.groupby("Source").size()

palette = {
    "Codon97": "#8DD3C7",
    "RNAseq_Log2FC>0": "#c2bed6",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "All_otherprot_all_Log2FC": "#eab375"
}

# Read the p-values from the 5' results table (res_df_mean) instead of
# hard-coding them, so the annotations follow the data. The results table is
# keyed by the short labels "RNAseq" / "Proteomics".
p_values = {
    "Codon97": float(res_df_mean.loc["Codon97", "p-value"]),
    "RNAseq_Log2FC>0": float(res_df_mean.loc["RNAseq", "p-value"]),
    "Proteomics_Log2FC>0": float(res_df_mean.loc["Proteomics", "p-value"])
}

# Violin plot
plt.figure(figsize=(10,6))
ax = sns.violinplot(
    x="Source", y="GC_5end", data=gc_data_5,
    inner=None, palette=palette, order=order, linewidth=1.2
)

# Add jitter (drawn on the same axes as the violins)
sns.stripplot(
    x="Source", y="GC_5end", data=gc_data_5,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order, ax=ax
)

# Add boxplots
sns.boxplot(
    x="Source", y="GC_5end", data=gc_data_5,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False, ax=ax
)

# Add group means: a white marker plus the value printed just above it
group_means = gc_data_5.groupby("Source")["GC_5end"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]+0.01, f"{group_means[src]:.3f}", 
            ha='center', va='bottom', fontsize=9, color="blue")

# Add p-value brackets from each group to the last (reference) group,
# stacked upward in steps of y_step so they do not overlap
y_max = gc_data_5["GC_5end"].max()
y_step = 0.05
for i, src in enumerate(order[:-1]):
    pval = p_values.get(src, None)
    if pval is None:
        continue
    x1, x2 = i, len(order)-1
    y = y_max + (i+1)*y_step
    plt.plot([x1, x1, x2, x2], [y, y+0.01, y+0.01, y], lw=1.2, c='k')
    plt.text((x1+x2)/2, y+0.015, f"p = {pval:.3f}",
             ha='center', va='bottom', fontsize=10, color="blue")

# Counts in x labels
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)
plt.yticks(np.arange(0, 1.05, 0.1))
# Extra headroom above the data leaves space for the stacked brackets
plt.ylim(gc_data_5["GC_5end"].min()-0.05, gc_data_5["GC_5end"].max()+0.25)
plt.title("GC Content Distribution at 5′ Ends", fontsize=14)
plt.ylabel("GC fraction (5′ end)", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()

import matplotlib as mpl
from pathlib import Path
# Keep text as text in the SVG instead of converting glyphs to paths
mpl.rcParams['svg.fonttype'] = 'none'

outdir = Path("data/figs")
# savefig does not create missing directories
outdir.mkdir(parents=True, exist_ok=True)

plt.savefig(outdir / "GC_5'_Dist.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()


#### GC content at 3'

In [None]:
# Annotate every table with the GC fraction of its 3' end sequence
for _frame in (codon97_df, rna_df, protein_df, allprot_df):
    _frame["GC_3end"] = _frame["CDS_3end"].apply(gc_content)

In [None]:
# Non-missing 3' GC fractions per table; allprot_df serves as the reference pool
codon97_gc3, rna_gc3, protein_gc3, all_gc3 = (
    table["GC_3end"].dropna().values
    for table in (codon97_df, rna_df, protein_df, allprot_df)
)

# Test each group against the reference pool
results_mean_3 = {}
for label, values in zip(
    ("Codon97", "RNAseq_Log2FC>0", "Proteomics_Log2FC>0"),
    (codon97_gc3, rna_gc3, protein_gc3),
):
    results_mean_3[label] = bootstrap_pvalue_mean(values, all_gc3, n_iter=1000)

# One row per group, one column per returned statistic
res_df_mean_3 = pd.DataFrame(results_mean_3, index=["Mean_Group","Mean_AllProt","p-value"]).T
print("=== Mean comparison (GC 3′ end) ===")
print(res_df_mean_3)

In [None]:
# Long-format table: every row of the four input tables, tagged with its group
gc_data_3 = pd.DataFrame({
    "GC_3end": pd.concat([
        codon97_df["GC_3end"],
        rna_df["GC_3end"],
        protein_df["GC_3end"],
        allprot_df["GC_3end"]
    ], ignore_index=True),
    "Source": (
        ["Codon97"] * len(codon97_df) +
        ["RNAseq_Log2FC>0"] * len(rna_df) +
        ["Proteomics_Log2FC>0"] * len(protein_df) +
        ["All_otherprot_all_Log2FC"] * len(allprot_df)
    )
})

# Left-to-right order on the x axis; the reference group goes last
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
counts = gc_data_3.groupby("Source").size()

palette = {
    "Codon97": "#8DD3C7",
    "RNAseq_Log2FC>0": "#c2bed6",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "All_otherprot_all_Log2FC": "#eab375"
}

# p-values come straight from the 3' results table
p_values = {
    "Codon97": float(res_df_mean_3.loc["Codon97","p-value"]),
    "RNAseq_Log2FC>0": float(res_df_mean_3.loc["RNAseq_Log2FC>0","p-value"]),
    "Proteomics_Log2FC>0": float(res_df_mean_3.loc["Proteomics_Log2FC>0","p-value"])
}

# Violin Plot
plt.figure(figsize=(10,6))
ax = sns.violinplot(
    x="Source", y="GC_3end", data=gc_data_3,
    inner=None, palette=palette, order=order, linewidth=1.2
)

# Add jitter (drawn on the same axes as the violins)
sns.stripplot(
    x="Source", y="GC_3end", data=gc_data_3,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order, ax=ax
)

# Add boxplots
sns.boxplot(
    x="Source", y="GC_3end", data=gc_data_3,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False, ax=ax
)

# Add group means: a white marker plus the value printed just above it
group_means = gc_data_3.groupby("Source")["GC_3end"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]+0.01, f"{group_means[src]:.3f}", 
            ha='center', va='bottom', fontsize=9, color="blue")

# Add p-value lines (each vs AllProt), stacked upward in steps of y_step
y_max = gc_data_3["GC_3end"].max()
y_step = 0.05
for i, src in enumerate(order[:-1]):
    pval = p_values.get(src, None)
    if pval is None:
        continue
    x1, x2 = i, len(order)-1
    y = y_max + (i+1)*y_step
    plt.plot([x1, x1, x2, x2], [y, y+0.01, y+0.01, y], lw=1.2, c='k')
    plt.text((x1+x2)/2, y+0.015, f"p = {pval:.3f}",
             ha='center', va='bottom', fontsize=10, color="blue")

# Counts in x labels
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)

# Extra headroom above the data leaves space for the stacked brackets
plt.ylim(gc_data_3["GC_3end"].min()-0.05, gc_data_3["GC_3end"].max()+0.25)
plt.title("GC Content Distribution at 3′ Ends", fontsize=14)
plt.ylabel("GC fraction (3′ end)", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()

import matplotlib as mpl
from pathlib import Path
# Keep text as text in the SVG instead of converting glyphs to paths
mpl.rcParams['svg.fonttype'] = 'none'

outdir = Path("data/figs")
# savefig does not create missing directories
outdir.mkdir(parents=True, exist_ok=True)

plt.savefig(outdir / "GC_3'_Dist.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()


#### GC content for the full CDS

In [None]:
# Annotate every table with the GC fraction of its complete CDS
for _frame in (codon97_df, rna_df, protein_df, allprot_df):
    _frame["GC_full"] = _frame["CDS"].apply(gc_content)

In [None]:
# Non-missing full-CDS GC fractions per table; allprot_df is the reference pool
codon97_gc_full, rna_gc_full, protein_gc_full, all_gc_full = (
    table["GC_full"].dropna().values
    for table in (codon97_df, rna_df, protein_df, allprot_df)
)

# Test each group against the reference pool
results_mean_full = {}
for label, values in zip(
    ("Codon97", "RNAseq", "Proteomics"),
    (codon97_gc_full, rna_gc_full, protein_gc_full),
):
    results_mean_full[label] = bootstrap_pvalue_mean(values, all_gc_full, n_iter=1000)

# One row per group, one column per returned statistic
res_df_mean_full = pd.DataFrame(results_mean_full, index=["Mean_Group","Mean_AllProt","p-value"]).T
print("=== Mean comparison (GC full CDS) ===")
print(res_df_mean_full)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Long-format table: every row of the four input tables, tagged with its group
gc_data_full = pd.DataFrame({
    "GC_full": pd.concat([
        codon97_df["GC_full"],
        rna_df["GC_full"],
        protein_df["GC_full"],
        allprot_df["GC_full"]
    ], ignore_index=True),
    "Source": (
        ["Codon97"] * len(codon97_df) +
        ["RNAseq_Log2FC>0"] * len(rna_df) +
        ["Proteomics_Log2FC>0"] * len(protein_df) +
        ["All_otherprot_all_Log2FC"] * len(allprot_df)
    )
})

# Left-to-right order on the x axis; the reference group goes last
order = ["Codon97","Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
counts = gc_data_full.groupby("Source").size()

palette = {
    "Codon97": "#8DD3C7",
    "RNAseq_Log2FC>0": "#c2bed6",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "All_otherprot_all_Log2FC": "#eab375"
}

# The results table is keyed by the short labels "RNAseq" / "Proteomics";
# map them onto the plot's group names
p_values = {
    "Codon97": float(res_df_mean_full.loc["Codon97","p-value"]),
    "RNAseq_Log2FC>0": float(res_df_mean_full.loc["RNAseq","p-value"]),
    "Proteomics_Log2FC>0": float(res_df_mean_full.loc["Proteomics","p-value"])
}

# Violin Plot
plt.figure(figsize=(10,6))
ax = sns.violinplot(
    x="Source", y="GC_full", data=gc_data_full,
    inner=None, palette=palette, order=order, linewidth=1.2
)

# Add jitter (drawn on the same axes as the violins)
sns.stripplot(
    x="Source", y="GC_full", data=gc_data_full,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order, ax=ax
)

# Add boxplots
sns.boxplot(
    x="Source", y="GC_full", data=gc_data_full,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False, ax=ax
)

# Add group means: a white marker plus the value printed just above it
group_means = gc_data_full.groupby("Source")["GC_full"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]+0.01, f"{group_means[src]:.3f}", 
            ha='center', va='bottom', fontsize=9, color="blue")

# Add p-value lines (each vs AllProt), stacked upward in steps of y_step
y_max = gc_data_full["GC_full"].max()
y_step = 0.05
for i, src in enumerate(order[:-1]):
    pval = p_values.get(src, None)
    if pval is None:
        continue
    x1, x2 = i, len(order)-1
    y = y_max + (i+1)*y_step
    plt.plot([x1, x1, x2, x2], [y, y+0.01, y+0.01, y], lw=1.2, c='k')
    plt.text((x1+x2)/2, y+0.015, f"p = {pval:.3f}",
             ha='center', va='bottom', fontsize=10, color="blue")

# Counts in x labels
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)

# Extra headroom above the data leaves space for the stacked brackets
plt.ylim(gc_data_full["GC_full"].min()-0.05, gc_data_full["GC_full"].max()+0.25)
plt.title("GC Content Distribution (Full CDS)", fontsize=14)
plt.ylabel("GC fraction (full CDS)", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()

import matplotlib as mpl
from pathlib import Path

# Keep text as text in the SVG instead of converting glyphs to paths
mpl.rcParams['svg.fonttype'] = 'none'

outdir = Path("data/figs")
# savefig does not create missing directories
outdir.mkdir(parents=True, exist_ok=True)

plt.savefig(outdir / "GC_Dist_Full.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()
