In [None]:
import pandas as pd
from Bio import SeqIO
from collections import defaultdict

# Gene ID -> intron sequences. The gene ID is whatever precedes the first "|"
# in each FASTA record ID.
gene_intron_seq = defaultdict(list)

for record in SeqIO.parse("data/introns_9056.fa", "fasta"):
    gene_id = record.id.partition("|")[0]
    gene_intron_seq[gene_id].append(str(record.seq))

# Collapse each gene's list of introns into one string (file order, no
# separator), so every value in the mapping is a plain str from here on.
for g, introns in list(gene_intron_seq.items()):
    gene_intron_seq[g] = "".join(introns)

def extract_parts(seq, n=150):
    """Split a sequence into its 5' window, 3' window and full length.

    Parameters
    ----------
    seq : str or None
        Sequence to slice. ``None`` or an empty string means "no sequence".
    n : int, default 150
        Maximum window length taken from each end. Must be >= 0.

    Returns
    -------
    tuple
        ``(five, three, seq)``: the first ``n`` characters, the last ``n``
        characters and the untouched input. Sequences shorter than ``n`` are
        returned whole in both windows. ``(None, None, None)`` when ``seq``
        is empty or ``None``.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if not seq:
        return None, None, None
    # Slicing already clamps to the sequence length, so short sequences need
    # no special case.
    five = seq[:n]
    # seq[-0:] would be the whole sequence, so an empty window is spelled out.
    three = seq[-n:] if n > 0 else seq[:0]
    return five, three, seq

def add_intron_parts(df, ensg_col="ENSG_clean"):
    """Return a copy of ``df`` with per-gene intron sequence columns added.

    Each value in ``df[ensg_col]`` is looked up in the module-level
    ``gene_intron_seq`` mapping and the result is passed to
    ``extract_parts`` with a 150-character window.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    ensg_col : str, default "ENSG_clean"
        Name of the column holding the gene IDs used as lookup keys.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Intron_5end``, ``Intron_3end`` and
        ``Intron_full`` columns. Genes missing from the mapping get
        whatever ``extract_parts`` returns for ``None``.
    """
    annotated = df.copy()
    parts = [
        extract_parts(gene_intron_seq.get(gene_id, None), 150)
        for gene_id in annotated[ensg_col]
    ]
    annotated["Intron_5end"] = [five for five, _three, _full in parts]
    annotated["Intron_3end"] = [three for _five, three, _full in parts]
    annotated["Intron_full"] = [full for _five, _three, full in parts]
    return annotated

# Load the four gene-level tables.
codon97_df, rna_df, prot_df, all_df = (
    pd.read_csv(path)
    for path in (
        "data/RNA_Proteomics_Filtered.csv",
        "data/RNA_Proteomics_RNAlevel.csv",
        "data/RNA_Proteomics_Proteinlevel.csv",
        "data/RNA_Proteomics_Merged.csv",
    )
)

# Use the ENSG gene ID as the mapping key: strip a trailing ".<digits>"
# suffix from the ID column of each table.
for frame, id_col in (
    (rna_df, "ESGN"),
    (prot_df, "ESGN"),
    (all_df, "Unnamed: 0"),
):
    frame["ENSG_clean"] = frame[id_col].str.replace(r"\.\d+$", "", regex=True)

# NOTE(review): this table is keyed by "ESGN_clean", whereas the other three
# use the "ENSG_clean" column created above. Presumably the filtered CSV
# already contains that column - confirm it is not a typo.
codon97_with_intron = add_intron_parts(codon97_df, ensg_col="ESGN_clean")
rna_with_intron     = add_intron_parts(rna_df, ensg_col="ENSG_clean")
prot_with_intron    = add_intron_parts(prot_df, ensg_col="ENSG_clean")
all_with_intron     = add_intron_parts(all_df, ensg_col="ENSG_clean")

# Persist the annotated tables; later cells reload them from these files.
for frame, out_path in (
    (codon97_with_intron, "data/codon97_intron_seq.csv"),
    (rna_with_intron, "data/rna_intron_seq.csv"),
    (prot_with_intron, "data/prot_intron_seq.csv"),
    (all_with_intron, "data/all_intron_seq.csv"),
):
    frame.to_csv(out_path, index=False)


#### Get the GC content

In [None]:
def gc_content(seq):
    """Return the fraction of G and C characters in a sequence string.

    Counting is case-insensitive and the denominator is the full string
    length, so any other character (e.g. ``N``) counts as non-GC.

    Parameters
    ----------
    seq : object
        Sequence to measure. Only ``str`` values are measured.

    Returns
    -------
    float or None
        ``(G + C) / len(seq)``, or ``None`` when ``seq`` is not a string
        (``None``, ``NaN``, list-likes, ...) or is empty.
    """
    # Missing values (None / NaN / pd.NA) are never str, so the isinstance
    # test alone rejects them. It also avoids evaluating the truth value of
    # an array, which is what pd.isna() returns for list-like input.
    if not isinstance(seq, str) or not seq:
        return None
    upper = seq.upper()
    return (upper.count("G") + upper.count("C")) / len(upper)

def add_gc_content(df):
    """Return a copy of ``df`` with a GC-fraction column per intron column.

    ``gc_content`` is applied element-wise to ``Intron_5end``,
    ``Intron_3end`` and ``Intron_full``; the results are stored in
    ``GC_5end``, ``GC_3end`` and ``GC_full``. The input frame is left
    untouched.
    """
    with_gc = df.copy()
    gc_to_source = {
        "GC_5end": "Intron_5end",
        "GC_3end": "Intron_3end",
        "GC_full": "Intron_full",
    }
    for gc_col, seq_col in gc_to_source.items():
        with_gc[gc_col] = with_gc[seq_col].apply(gc_content)
    return with_gc

# Reload the intron-annotated tables written by the previous cell.
codon97_df, rna_df, prot_df, all_df = (
    pd.read_csv(path)
    for path in (
        "data/codon97_intron_seq.csv",
        "data/rna_intron_seq.csv",
        "data/prot_intron_seq.csv",
        "data/all_intron_seq.csv",
    )
)

# Add the GC_5end / GC_3end / GC_full columns to each table.
codon97_gc, rna_gc, prot_gc, all_gc = map(
    add_gc_content, (codon97_df, rna_df, prot_df, all_df)
)

# Write back to the same paths the tables were read from, so the files on
# disk now carry the GC columns as well.
for frame, out_path in (
    (codon97_gc, "data/codon97_intron_seq.csv"),
    (rna_gc, "data/rna_intron_seq.csv"),
    (prot_gc, "data/prot_intron_seq.csv"),
    (all_gc, "data/all_intron_seq.csv"),
):
    frame.to_csv(out_path, index=False)

#### Plot for 5' end

In [None]:
import numpy as np
import pandas as pd

def bootstrap_pvalue_mean(test_vals, all_vals, n_iter=1000, seed=123):
    """Resampling test: does a group's mean differ from the background mean?

    Draws ``n_iter`` random subsets of ``all_vals``, each the same size as
    ``test_vals``. Sampling is *without* replacement, so despite the
    function name each draw is a random subset rather than a classical
    bootstrap sample. The p-value is the fraction of subsets whose mean is
    at least as far from the background mean as the observed group mean
    (two-sided).

    Parameters
    ----------
    test_vals : array-like
        Values of the group under test.
    all_vals : array-like
        Background values to resample from.
    n_iter : int, default 1000
        Number of random subsets to draw. The p-value is a multiple of
        ``1 / n_iter``, so 0.0 means no subset was as extreme.
    seed : int, default 123
        Seed for a private random generator; identical inputs give
        identical results and the global NumPy random state is untouched.

    Returns
    -------
    tuple
        ``(test_mean, all_mean, pval)``. When ``test_vals`` is empty the
        group mean and the p-value are both ``nan``.

    Raises
    ------
    ValueError
        Raised by NumPy when ``test_vals`` is longer than ``all_vals``,
        because a larger subset cannot be drawn without replacement.
    """
    all_mean = np.mean(all_vals)
    n = len(test_vals)
    if n == 0:
        # No observations: there is nothing to compare, so report "unknown"
        # rather than a p-value of 0.
        return np.nan, all_mean, np.nan

    test_mean = np.mean(test_vals)

    # A private legacy generator yields the same draws as seeding the global
    # one, without resetting random state used elsewhere in the notebook.
    rng = np.random.RandomState(seed)
    boot_means = np.array([
        np.mean(rng.choice(all_vals, size=n, replace=False))
        for _ in range(n_iter)
    ])

    observed_gap = np.abs(test_mean - all_mean)
    pval = np.mean(np.abs(boot_means - all_mean) >= observed_gap)
    return test_mean, all_mean, pval

# Reload the per-gene tables (now including the GC columns) from disk.
codon97_df, rna_df, prot_df, all_df = (
    pd.read_csv(path)
    for path in (
        "data/codon97_intron_seq.csv",
        "data/rna_intron_seq.csv",
        "data/prot_intron_seq.csv",
        "data/all_intron_seq.csv",
    )
)

# column with GC at 5' end
col = "GC_5end"

# From here on the *_gc names hold arrays of the non-missing values of
# `col`; they no longer refer to DataFrames.
codon97_gc, rna_gc, prot_gc, all_gc = (
    frame[col].dropna().values
    for frame in (codon97_df, rna_df, prot_df, all_df)
)

# Compare each group against the background values from the merged table.
results_mean = {}
for label, values in (
    ("Codon97", codon97_gc),
    ("RNAseq", rna_gc),
    ("Proteomics", prot_gc),
):
    results_mean[label] = bootstrap_pvalue_mean(values, all_gc, n_iter=1000)

# One row per group, one column per returned statistic.
res_df_mean = pd.DataFrame(
    results_mean, index=["Mean_Group", "Mean_AllProt", "p-value"]
).transpose()
print("=== Mean comparison (GC_5end per gene) ===", res_df_mean, sep="\n")

#### Plot 

In [None]:
import math
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Reload the per-gene tables from disk so the plot starts from the saved
# files rather than from frames modified by earlier cells.
codon97_df = pd.read_csv("data/codon97_intron_seq.csv")
rna_df     = pd.read_csv("data/rna_intron_seq.csv")
prot_df    = pd.read_csv("data/prot_intron_seq.csv")
all_df     = pd.read_csv("data/all_intron_seq.csv")

col = "GC_5end"

# Label every row with the table it came from, then stack the four tables
# into one long frame for seaborn.
codon97_df["Source"] = "Codon97"
rna_df["Source"]     = "RNAseq_Log2FC>0"
prot_df["Source"]    = "Proteomics_Log2FC>0"
all_df["Source"]     = "All_otherprot_all_Log2FC"

all_data = pd.concat([
    codon97_df[["Source", col]],
    rna_df[["Source", col]],
    prot_df[["Source", col]],
    all_df[["Source", col]]
], ignore_index=True)
all_data = all_data.dropna(subset=[col])

# Left-to-right order on the x axis; the last entry is the background group.
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
counts = all_data.groupby("Source").size()

# NOTE(review): these p-values are hard-coded; presumably transcribed from
# the bootstrap table printed earlier in the notebook - confirm, and refresh
# them whenever the input data change.
p_values = {
    "Codon97": 0.020,
    "RNAseq_Log2FC>0": 0.069,
    "Proteomics_Log2FC>0": 0.873
}

custom_palette = {
    "Codon97": "#8DD3C7",
    "RNAseq_Log2FC>0":  "#c2bed6",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "All_otherprot_all_Log2FC": "#eab375"
}

plt.figure(figsize=(10,6))
ax = sns.violinplot(
    x="Source", y=col, data=all_data,
    inner=None, palette=custom_palette, order=order, linewidth=1.2
)

# jittered points
sns.stripplot(
    x="Source", y=col, data=all_data,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order, ax=ax
)

# boxplots
sns.boxplot(
    x="Source", y=col, data=all_data,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False, ax=ax
)

# group means: a white marker at the mean, with its value printed just above
group_means = all_data.groupby("Source")[col].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]*1.02, f"{group_means[src]:.3f}",
            ha='center', va='bottom', fontsize=10, color="blue")

y_max = all_data[col].max()
y_min = all_data[col].min()
y_range = y_max - y_min
y_offset = y_range * 0.15

# One bracket per test group, spanning from that group to the background
# group; brackets are stacked above the highest data point.
for i, src in enumerate(order[:-1]):
    x1, x2 = i, len(order)-1
    pval = p_values.get(src, None)
    if pval is None:
        continue

    y = y_max + (i+1)*y_offset
    line_height = y + 0.02*y_range

    plt.plot([x1, x1, x2, x2], [y, line_height, line_height, y],
             lw=1.2, c='k')
    plt.text((x1+x2)/2, line_height*1.01, f"p = {pval:.3f}",
             ha='center', va='bottom', fontsize=10, color="blue")

# Tick labels show the group name plus the number of non-missing values.
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)
plt.yticks(np.arange(0, 1.6, 0.2))
plt.title("Distribution of GC Content at 5′ End", fontsize=14)
plt.ylabel("GC content at 5′ end", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)

plt.tight_layout()

# Write text as text elements (not paths) in the SVG.
mpl.rcParams['svg.fonttype'] = 'none'

# savefig does not create missing directories, so make sure it exists.
outdir = Path("data/figs")
outdir.mkdir(parents=True, exist_ok=True)
plt.savefig(outdir / "Intron_GC_5end.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()


#### Plot for 3' end

In [None]:
col = "GC_3end"

# Background values: non-missing entries of `col` in the merged table.
# The frames used here are whatever the preceding cell left in memory.
all_vals = all_df[col].dropna().values

# Compare each group against the background values.
results_mean = {}
for label, df in (
    ("Codon97", codon97_df),
    ("RNAseq", rna_df),
    ("Proteomics", prot_df),
):
    vals = df[col].dropna().values
    results_mean[label] = bootstrap_pvalue_mean(vals, all_vals, n_iter=1000)

# One row per group, one column per returned statistic.
res_df_mean = pd.DataFrame(
    results_mean, index=["Mean_Group", "Mean_AllProt", "p-value"]
).transpose()
print("=== Mean comparison (GC content at 3′ end) ===", res_df_mean, sep="\n")

In [None]:
import math
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Reload the per-gene tables from disk so the plot starts from the saved
# files rather than from frames modified by earlier cells.
codon97_df = pd.read_csv("data/codon97_intron_seq.csv")
rna_df     = pd.read_csv("data/rna_intron_seq.csv")
prot_df    = pd.read_csv("data/prot_intron_seq.csv")
all_df     = pd.read_csv("data/all_intron_seq.csv")

col = "GC_3end"

# Label every row with the table it came from, then stack the four tables
# into one long frame for seaborn.
codon97_df["Source"] = "Codon97"
rna_df["Source"]     = "RNAseq_Log2FC>0"
prot_df["Source"]    = "Proteomics_Log2FC>0"
all_df["Source"]     = "All_otherprot_all_Log2FC"

all_data = pd.concat([
    codon97_df[["Source", col]],
    rna_df[["Source", col]],
    prot_df[["Source", col]],
    all_df[["Source", col]]
], ignore_index=True)
all_data = all_data.dropna(subset=[col])

# Left-to-right order on the x axis; the last entry is the background group.
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
counts = all_data.groupby("Source").size()

# NOTE(review): these p-values are hard-coded; presumably transcribed from
# the bootstrap table printed earlier in the notebook - confirm, and refresh
# them whenever the input data change.
p_values = {
    "Codon97": 0.044,
    "RNAseq_Log2FC>0": 0.220,
    "Proteomics_Log2FC>0": 0.000
}

custom_palette = {
    "Codon97": "#8DD3C7",
    "RNAseq_Log2FC>0":  "#c2bed6",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "All_otherprot_all_Log2FC": "#eab375"
}

plt.figure(figsize=(10,6))
ax = sns.violinplot(
    x="Source", y=col, data=all_data,
    inner=None, palette=custom_palette, order=order, linewidth=1.2
)

# jittered points
sns.stripplot(
    x="Source", y=col, data=all_data,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order, ax=ax
)

# boxplots
sns.boxplot(
    x="Source", y=col, data=all_data,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False, ax=ax
)

# group means: a white marker at the mean, with its value printed just above
group_means = all_data.groupby("Source")[col].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]*1.02, f"{group_means[src]:.3f}",
            ha='center', va='bottom', fontsize=10, color="blue")

y_max = all_data[col].max()
y_min = all_data[col].min()
y_range = y_max - y_min
y_offset = y_range * 0.12

# One bracket per test group, spanning from that group to the background
# group; brackets are stacked above the highest data point.
for i, src in enumerate(order[:-1]):
    x1, x2 = i, len(order)-1
    pval = p_values.get(src, None)
    if pval is None:
        continue

    y = y_max + (i+1)*y_offset
    line_height = y + 0.02*y_range

    plt.plot([x1, x1, x2, x2], [y, line_height, line_height, y],
             lw=1.2, c='k')
    plt.text((x1+x2)/2, line_height*1.01, f"p = {pval:.3f}",
             ha='center', va='bottom', fontsize=10, color="blue")

# Tick labels show the group name plus the number of non-missing values.
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)
plt.yticks(np.arange(0, 1.6, 0.2))
plt.title("Distribution of GC Content at 3′ End", fontsize=14)
plt.ylabel("GC content at 3′ end", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)

plt.tight_layout()

# Write text as text elements (not paths) in the SVG.
mpl.rcParams['svg.fonttype'] = 'none'

# savefig does not create missing directories, so make sure it exists.
outdir = Path("data/figs")
outdir.mkdir(parents=True, exist_ok=True)
plt.savefig(outdir / "Intron_GC_3end.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()


#### Plot for the full intron sequence

In [None]:
col = "GC_full"

# Background values: non-missing entries of `col` in the merged table.
# The frames used here are whatever the preceding cell left in memory.
all_vals = all_df[col].dropna().values

# Compare each group against the background values.
results_mean = {}
for label, df in (
    ("Codon97", codon97_df),
    ("RNAseq", rna_df),
    ("Proteomics", prot_df),
):
    vals = df[col].dropna().values
    results_mean[label] = bootstrap_pvalue_mean(vals, all_vals, n_iter=1000)

# One row per group, one column per returned statistic.
res_df_mean = pd.DataFrame(
    results_mean, index=["Mean_Group", "Mean_AllProt", "p-value"]
).transpose()
print("=== Mean comparison (GC content at full intron seq) ===", res_df_mean, sep="\n")


In [None]:
import math
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Reload the per-gene tables from disk so the plot starts from the saved
# files rather than from frames modified by earlier cells.
codon97_df = pd.read_csv("data/codon97_intron_seq.csv")
rna_df     = pd.read_csv("data/rna_intron_seq.csv")
prot_df    = pd.read_csv("data/prot_intron_seq.csv")
all_df     = pd.read_csv("data/all_intron_seq.csv")

col = "GC_full"

# Label every row with the table it came from, then stack the four tables
# into one long frame for seaborn.
codon97_df["Source"] = "Codon97"
rna_df["Source"]     = "RNAseq_Log2FC>0"
prot_df["Source"]    = "Proteomics_Log2FC>0"
all_df["Source"]     = "All_otherprot_all_Log2FC"

all_data = pd.concat([
    codon97_df[["Source", col]],
    rna_df[["Source", col]],
    prot_df[["Source", col]],
    all_df[["Source", col]]
], ignore_index=True)
all_data = all_data.dropna(subset=[col])

# Left-to-right order on the x axis; the last entry is the background group.
order = ["Codon97", "Proteomics_Log2FC>0", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]
counts = all_data.groupby("Source").size()

# NOTE(review): these p-values are hard-coded; presumably transcribed from
# the bootstrap table printed earlier in the notebook - confirm, and refresh
# them whenever the input data change.
p_values = {
    "Codon97": 0.943,
    "RNAseq_Log2FC>0": 0.297,
    "Proteomics_Log2FC>0": 0.001
}

custom_palette = {
    "Codon97": "#8DD3C7",
    "RNAseq_Log2FC>0":  "#c2bed6",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "All_otherprot_all_Log2FC": "#eab375"
}

# violin plot
plt.figure(figsize=(10,6))
ax = sns.violinplot(
    x="Source", y=col, data=all_data,
    inner=None, palette=custom_palette, order=order, linewidth=1.2
)

# jittered points
sns.stripplot(
    x="Source", y=col, data=all_data,
    color="black", alpha=0.2, jitter=0.25, size=1.5, order=order, ax=ax
)

# boxplots
sns.boxplot(
    x="Source", y=col, data=all_data,
    width=0.15, showcaps=True, order=order,
    boxprops={'facecolor':'grey', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black', 'linewidth':1},
    showfliers=False, ax=ax
)

# group means: a white marker at the mean, with its value printed just above
group_means = all_data.groupby("Source")[col].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", zorder=3, s=30, edgecolors="black")
    ax.text(i, group_means[src]*1.02, f"{group_means[src]:.3f}",
            ha='center', va='bottom', fontsize=10, color="blue")

y_max = all_data[col].max()
y_min = all_data[col].min()
y_range = y_max - y_min
y_offset = y_range * 0.1

# One bracket per test group, spanning from that group to the background
# group; brackets are stacked above the highest data point.
for i, src in enumerate(order[:-1]):
    x1, x2 = i, len(order)-1
    pval = p_values.get(src, None)
    if pval is None:
        continue

    y = y_max + (i+1)*y_offset
    line_height = y + 0.01*y_range

    plt.plot([x1, x1, x2, x2], [y, line_height, line_height, y],
             lw=1.2, c='k')
    plt.text((x1+x2)/2, line_height*1.01, f"p = {pval:.3f}",
             ha='center', va='bottom', fontsize=10, color="blue")

# Tick labels show the group name plus the number of non-missing values.
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)
plt.yticks(np.arange(0, 1.3, 0.2))
plt.title("Distribution of GC Content at full sequence", fontsize=14)
plt.ylabel("GC content at full sequence", fontsize=12)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)

plt.tight_layout()

# Write text as text elements (not paths) in the SVG.
mpl.rcParams['svg.fonttype'] = 'none'

# savefig does not create missing directories, so make sure it exists.
outdir = Path("data/figs")
outdir.mkdir(parents=True, exist_ok=True)
plt.savefig(outdir / "Intron_GC_full.svg", format="svg", bbox_inches="tight", facecolor="white")

plt.show()
