### Find avg. control group value

In [None]:
import pandas as pd
# Read the four input tables used by the steps below, in this order:
# protein-level table, RNA-level table, proteomics counts, RNA-seq counts.
prot_seq_df, rna_seq_df, prot_counts, rna_counts = map(
    pd.read_csv,
    (
        "data/RNA_Proteomics_Proteinlevel.csv",
        "data/RNA_Proteomics_RNAlevel.csv",
        "data/Proteomics Counts.csv",
        "data/RNA Seq Counts.csv",
    ),
)

In [None]:
# Columns holding the control samples (2 control guides x 3 replicates).
ctrl_cols = [
    "Ctrl guide 1 replicate 1",
    "Ctrl guide 1 replicate 2",
    "Ctrl guide 1 replicate 3",
    "Ctrl guide 2 replicate 1",
    "Ctrl guide 2 replicate 2",
    "Ctrl guide 2 replicate 3",
]

# Add a per-row mean over the control columns to each counts table.
# NOTE(review): the proteomics counts receive "average RNA expression" and the
# RNA counts receive "average protein expression". Later cells read these
# exact column names, so they are kept as-is — confirm the naming is intended.
avg_targets = (
    (prot_counts, "average RNA expression"),
    (rna_counts, "average protein expression"),
)
for counts_df, avg_col in avg_targets:
    counts_df[avg_col] = counts_df[ctrl_cols].mean(axis=1)

# Overwrite the original count files so they carry the new column.
prot_counts.to_csv("data/Proteomics Counts.csv", index=False)
rna_counts.to_csv("data/RNA Seq Counts.csv", index=False)

### Step1: RNA Expression level of Proteomics

#### Files needed: merge the 455 filtered proteomics entries with the proteomics counts, matching on protein name, to get the average RNA expression level

In [None]:
# Inner-join the protein-level table with the proteomics counts on protein
# name ("EntryName" on the left, "PG.ProteinNames" on the right); rows without
# a match in both tables are dropped.
merged_df = pd.merge(
    prot_seq_df,
    prot_counts,
    how="inner",
    left_on="EntryName",
    right_on="PG.ProteinNames",
)

# Write the joined table to disk.
out_file = "data/Proteomics_Proteinlevel_withCounts.csv"
merged_df.to_csv(out_file, index=False)

### Step2: Protein expression level of RNA seq

In [None]:
# The counts table's first column is named "Unnamed: 0"; call it "ENSG" so it
# can serve as the join key.
rna_counts = rna_counts.rename({"Unnamed: 0": "ENSG"}, axis="columns")

# Inner-join the RNA-level table with the RNA-seq counts on "ENSG"; only rows
# present in both tables are kept.
merged_df = pd.merge(
    rna_seq_df,
    rna_counts,
    how="inner",
    left_on="ENSG",
    right_on="ENSG",
)

# Write the joined table to disk.
out_file = "data/Proteomics_RNAlevel_withCounts.csv"
merged_df.to_csv(out_file, index=False)

### Step3: Expression of the 97 codons

#### Proteomics with codon 97

In [None]:
# Load the Codon97 table and inner-join it with the proteomics counts on
# protein name ("EntryName" vs "PG.ProteinNames").
codon97_df = pd.read_csv("data/RNA_Proteomics_Filtered.csv")
merged_df = pd.merge(
    codon97_df,
    prot_counts,
    how="inner",
    left_on="EntryName",
    right_on="PG.ProteinNames",
)

# Write the joined table to disk.
out_file = "data/Proteomics_Proteinlevel_withCounts_Codon97.csv"
merged_df.to_csv(out_file, index=False)

#### RNA seq with codon 97

In [None]:
# Ensure the counts table's key column is named "ENSG" (a no-op when an
# earlier cell has already renamed it).
rna_counts = rna_counts.rename({"Unnamed: 0": "ENSG"}, axis="columns")

# Inner-join the Codon97 table with the RNA-seq counts on "ENSG".
merged_df = pd.merge(
    codon97_df,
    rna_counts,
    how="inner",
    left_on="ENSG",
    right_on="ENSG",
)

# Write the joined table to disk.
out_file = "data/Proteomics_RNAlevel_withCounts_Codon97.csv"
merged_df.to_csv(out_file, index=False)

### STEP4: All other protein level

#### All genes with proteomics data (RNA expression level), excluding the 455 proteomics entries (log2FC > 0, q-value < 0.05)

In [None]:
# Reload the counts (now carrying the average column), the proteomics stats,
# and the table of proteins already written by Step 1.
prot_counts = pd.read_csv("data/Proteomics Counts.csv")
prot_stats = pd.read_csv("data/Proteomics Stats.csv")
existing = pd.read_csv("data/Proteomics_Proteinlevel_withCounts.csv")

# Inner-join counts with stats on protein name.
# NOTE(review): the stats key is "G.ProteinNames" while the counts key is
# "PG.ProteinNames" — confirm the stats file really uses that header and that
# it is not a typo.
merged_new = pd.merge(
    prot_counts,
    prot_stats,
    how="inner",
    left_on="PG.ProteinNames",
    right_on="G.ProteinNames",
)

# Keep only proteins that are NOT already in the Step 1 output.
existing_names = set(existing["PG.ProteinNames"])
already_saved = merged_new["PG.ProteinNames"].isin(existing_names)
merged_new = merged_new[~already_saved]

# Write the remaining proteins to disk.
out_file = "data/all_prot_rna_expression.csv"
merged_new.to_csv(out_file, index=False)

#### All genes with RNA counts (protein expression level), excluding the 486 entries

In [None]:
# Load the merged RNA/proteomics table, the RNA-seq counts, and the Step 2
# output whose rows must be excluded.
RNA_Proteomics_Merged_df = pd.read_csv("data/RNA_Proteomics_Merged.csv")
rna_counts = pd.read_csv("data/RNA Seq Counts.csv")
existing = pd.read_csv("data/Proteomics_RNAlevel_withCounts.csv")

# Give both tables an "ENSG" key column (renames "Unnamed: 0" where present).
ensg_rename = {"Unnamed: 0": "ENSG"}
rna_counts = rna_counts.rename(columns=ensg_rename)
RNA_Proteomics_Merged_df = RNA_Proteomics_Merged_df.rename(columns=ensg_rename)

# Inner-join on "ENSG".
merged_df = pd.merge(
    RNA_Proteomics_Merged_df,
    rna_counts,
    how="inner",
    on="ENSG",
)

# Drop rows whose ENSG already appears in the Step 2 output. If that file has
# no "ENSG" column, the exclusion is skipped and every merged row is kept.
if "ENSG" in existing.columns:
    existing_esgn = set(existing["ENSG"])
    keep_mask = ~merged_df["ENSG"].isin(existing_esgn)
    merged_df = merged_df[keep_mask]

# Write the result to disk.
out_file = "data/all_rna_prot_expression(RNAmerged).csv"
merged_df.to_csv(out_file, index=False)

### Violin plot for RNA expression level (by proteomics stats)

#### Log transformation

In [None]:
import pandas as pd
import numpy as np

# ===== Load files =====
# Load the three tables produced by the merge steps.
codon97_withcounts = pd.read_csv("/mnt/work_3/sijin/CAI/Proteomics_Proteinlevel_withCounts_Codon97.csv")
prot_withcounts = pd.read_csv("/mnt/work_3/sijin/CAI/Proteomics_Proteinlevel_withCounts.csv")
all_prot_rnaexpression = pd.read_csv("/mnt/work_3/sijin/CAI/all_prot_rna_expression.csv")

# Pull the "average RNA expression" column of each table as a NumPy array,
# discarding missing values.
rna_col = "average RNA expression"
codon97_rna = codon97_withcounts[rna_col].dropna().to_numpy()
prot_rna = prot_withcounts[rna_col].dropna().to_numpy()
all_rna = all_prot_rnaexpression[rna_col].dropna().to_numpy()

# log10 transform; 1 is added first so that zero values stay finite.
codon97_log = np.log10(1 + codon97_rna)
prot_log = np.log10(1 + prot_rna)
all_log = np.log10(1 + all_rna)


In [None]:
def summarize(series):
    """Return descriptive statistics of ``series`` as a ``pd.Series``.

    The result is indexed, in order, by: ``mean``, ``median``, ``std``
    (NumPy's default, i.e. ``ddof=0``), ``min``, the 25/50/75/90/99th
    percentiles, ``max`` and ``n`` (number of values).
    """
    pct_labels = ("25%", "50%", "75%", "90%", "99%")
    pct_values = np.percentile(series, [25, 50, 75, 90, 99])

    stats = {
        "mean": np.mean(series),
        "median": np.median(series),
        "std": np.std(series),
        "min": np.min(series),
    }
    stats.update(zip(pct_labels, pct_values))
    stats["max"] = np.max(series)
    stats["n"] = len(series)
    return pd.Series(stats)

# One column of summary statistics per group of log10-transformed values.
log10_groups = {
    "Codon97": codon97_log,
    "Proteinlevel_withCounts": prot_log,
    "All_prot_rna_expression": all_log,
}
summary_log10 = pd.DataFrame(
    {label: summarize(values) for label, values in log10_groups.items()}
)

print(summary_log10)

In [None]:
import pandas as pd
import numpy as np

def bootstrap_pvalue_mean(test_vals, all_vals, n_iter=10000, seed=123):
    """Resampling test: is the mean of ``test_vals`` unusual versus ``all_vals``?

    Draws ``n_iter`` random subsets of ``all_vals``, each of size
    ``len(test_vals)`` and sampled WITHOUT replacement, and records each
    subset's mean. The two-sided p-value is the fraction of subset means that
    lie at least as far from ``mean(all_vals)`` as ``mean(test_vals)`` does.

    Parameters
    ----------
    test_vals : array-like
        Values of the group under test.
    all_vals : array-like
        Reference values the random subsets are drawn from.
    n_iter : int
        Number of random subsets to draw.
    seed : int
        Seed for the random number generator; the same seed gives the same
        p-value.

    Returns
    -------
    tuple
        ``(test_mean, all_mean, pval)``.

    Raises
    ------
    ValueError
        Raised by NumPy when ``len(test_vals) > len(all_vals)``, because a
        sample without replacement cannot exceed the population size.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global RNG state untouched.
    rng = np.random.RandomState(seed)

    test_mean = np.mean(test_vals)
    all_mean = np.mean(all_vals)
    n = len(test_vals)

    boot_means = np.array([
        np.mean(rng.choice(all_vals, size=n, replace=False))
        for _ in range(n_iter)
    ])

    pval = np.mean(np.abs(boot_means - all_mean) >= np.abs(test_mean - all_mean))
    return test_mean, all_mean, pval


# Test each group's mean log10 expression against the "all other" group.
codon97_mean, all_mean, p_cod_all = bootstrap_pvalue_mean(codon97_log, all_log)
prot_mean, all_mean2, p_prot_all = bootstrap_pvalue_mean(prot_log, all_log)

# Collect the results, one row per tested group.
res_df = pd.DataFrame(
    [
        ("Codon97", codon97_mean, all_mean, p_cod_all),
        ("Proteinlevel_withCounts", prot_mean, all_mean2, p_prot_all),
    ],
    columns=["Group", "Mean_log10_Group", "Mean_log10_All", "p-value"],
)

print(res_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import matplotlib as mpl

# Long-format table: one row per value, tagged with the group it belongs to.
# NOTE(review): the raw (untransformed) values are drawn on a log axis; zeros,
# if the data contain any, cannot be represented there — confirm.
all_data = pd.DataFrame({
    "RNA": np.concatenate([codon97_rna, prot_rna, all_rna]),
    "Source": (["Codon97"] * len(codon97_rna) +
               ["Proteomics_Log2FC>0"] * len(prot_rna) +
               ["All_otherprot_all_Log2FC"] * len(all_rna))
})

order = ["Codon97", "Proteomics_Log2FC>0", "All_otherprot_all_Log2FC"]

custom_palette = {
    "Codon97": "#8DD3C7",
    "Proteomics_Log2FC>0": "#f6f6bc",
    "All_otherprot_all_Log2FC": "#eab375"
}

plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; assigning the x variable to
# `hue` with legend=False gives the same colouring.
ax = sns.violinplot(
    data=all_data, x="Source", y="RNA", log_scale=True,
    order=order, inner=None, linewidth=1.2,
    hue="Source", palette=custom_palette, legend=False,
)
sns.stripplot(
    data=all_data, x="Source", y="RNA",
    order=order, color="black", alpha=0.25, jitter=0.25, size=1.5, ax=ax
)
sns.boxplot(
    data=all_data, x="Source", y="RNA",
    order=order, width=0.15, showcaps=True,
    boxprops={'facecolor': 'grey', 'edgecolor': 'black'},
    whiskerprops={'color': 'black'},
    medianprops={'color': 'black', 'linewidth': 1},
    showfliers=False, ax=ax
)

# Mark and label the arithmetic mean of each group (computed on raw values).
group_means = all_data.groupby("Source")["RNA"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", edgecolor="black", zorder=3, s=20)
    ax.text(i, group_means[src] * 1.2, f"mean = {group_means[src]:.0f}",
            ha='center', va='bottom', fontsize=10, color="blue")

ax.set_ylabel("Average RNA Expression (log10 scale)")

# A log axis cannot start at 0, so only the upper limit is set; this leaves
# headroom for the significance brackets drawn below.
ax.set_ylim(top=1e9)

# Significance brackets: (left x, right x, bracket base height, p-value).
# The p-values are the ones returned by bootstrap_pvalue_mean in the cell above.
bracket_scale = 1.3  # multiplicative bracket height (the y axis is logarithmic)
for x1, x2, y, pval in (
    (0, 2, 1e8, p_cod_all),    # Codon97 vs All
    (1, 2, 1e7, p_prot_all),   # Prot vs All
):
    ax.plot([x1, x1, x2, x2],
            [y, y * bracket_scale, y * bracket_scale, y], lw=1.5, c='k')
    ax.text((x1 + x2) / 2, y * bracket_scale * 1.05, f"p = {pval:.4f}",
            ha='center', va='bottom', fontsize=11, color="blue")

# Show the group size under each tick label.
counts = all_data.groupby("Source").size()
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)

plt.title("Distribution of Average RNA Expression", fontsize=14)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()

plt.show()



### Violin plot for protein expression level (by RNA stats)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import matplotlib as mpl

# ===== Load files =====
# Load the three RNA-level tables produced by the merge steps.
codon97_withcounts = pd.read_csv("/mnt/work_3/sijin/CAI/Proteomics_RNAlevel_withCounts_Codon97.csv")
rna_withcounts = pd.read_csv("/mnt/work_3/sijin/CAI/Proteomics_RNAlevel_withCounts.csv")
all_rna_prot_expr = pd.read_csv("/mnt/work_3/sijin/CAI/all_rna_prot_expression(RNAmerged).csv")

# Pull the "average protein expression" column of each table as a float
# NumPy array, discarding missing values.
protein_col = "average protein expression"
codon97_protein = codon97_withcounts[protein_col].dropna().astype(float).to_numpy()
rna_protein = rna_withcounts[protein_col].dropna().astype(float).to_numpy()
all_protein = all_rna_prot_expr[protein_col].dropna().astype(float).to_numpy()

# log10 transform; 1 is added first so that zero values stay finite.
codon97_log = np.log10(1 + codon97_protein)
rna_log = np.log10(1 + rna_protein)
all_log = np.log10(1 + all_protein)


In [None]:
# Bootstrap test on mean value

def bootstrap_pvalue_mean(test_vals, all_vals, n_iter=10000, seed=123):
    """Resampling test: is the mean of ``test_vals`` unusual versus ``all_vals``?

    Draws ``n_iter`` random subsets of ``all_vals``, each of size
    ``len(test_vals)`` and sampled WITHOUT replacement, and records each
    subset's mean. The two-sided p-value is the fraction of subset means that
    lie at least as far from ``mean(all_vals)`` as ``mean(test_vals)`` does.

    Parameters
    ----------
    test_vals : array-like
        Values of the group under test.
    all_vals : array-like
        Reference values the random subsets are drawn from.
    n_iter : int
        Number of random subsets to draw.
    seed : int
        Seed for the random number generator; the same seed gives the same
        p-value.

    Returns
    -------
    tuple
        ``(test_mean, all_mean, pval)``.

    Raises
    ------
    ValueError
        Raised by NumPy when ``len(test_vals) > len(all_vals)``, because a
        sample without replacement cannot exceed the population size.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global RNG state untouched.
    rng = np.random.RandomState(seed)

    test_mean = np.mean(test_vals)
    all_mean = np.mean(all_vals)
    n = len(test_vals)

    boot_means = np.array([
        np.mean(rng.choice(all_vals, size=n, replace=False))
        for _ in range(n_iter)
    ])

    pval = np.mean(np.abs(boot_means - all_mean) >= np.abs(test_mean - all_mean))
    return test_mean, all_mean, pval

# Test each group's mean log10 expression against the "all other" group.
codon97_mean, all_mean, p_cod_all = bootstrap_pvalue_mean(codon97_log, all_log)
rna_mean, all_mean2, p_rna_all = bootstrap_pvalue_mean(rna_log, all_log)

# Collect the results, one row per tested group.
res_df = pd.DataFrame(
    [
        ("Codon97", codon97_mean, all_mean, p_cod_all),
        ("RNAseq_Log2FC>0", rna_mean, all_mean2, p_rna_all),
    ],
    columns=["Group", "Mean_log10_Group", "Mean_log10_All", "p-value"],
)

print(res_df)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Long-format table: one row per value, tagged with the group it belongs to.
# NOTE(review): the raw (untransformed) values are drawn on a log axis; zeros,
# if the data contain any, cannot be represented there — confirm.
all_data = pd.DataFrame({
    "Protein": np.concatenate([codon97_protein, rna_protein, all_protein]),
    "Source": (["Codon97"] * len(codon97_protein) +
               ["RNAseq_Log2FC>0"] * len(rna_protein) +
               ["All_otherprot_all_Log2FC"] * len(all_protein))
})

order = ["Codon97", "RNAseq_Log2FC>0", "All_otherprot_all_Log2FC"]

custom_palette = {
    "Codon97": "#8DD3C7",
    "RNAseq_Log2FC>0": "#c2bed6",
    "All_otherprot_all_Log2FC": "#eab375"
}

plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; assigning the x variable to
# `hue` with legend=False gives the same colouring.
ax = sns.violinplot(
    data=all_data, log_scale=True, x="Source", y="Protein",
    order=order, inner=None, linewidth=1.2,
    hue="Source", palette=custom_palette, legend=False,
)
sns.stripplot(
    data=all_data, x="Source", y="Protein",
    order=order, color="black", alpha=0.25, jitter=0.25, size=1.5, ax=ax
)
sns.boxplot(
    data=all_data, x="Source", y="Protein",
    order=order, width=0.15, showcaps=True,
    boxprops={'facecolor': 'grey', 'edgecolor': 'black'},
    whiskerprops={'color': 'black'},
    medianprops={'color': 'black', 'linewidth': 1},
    showfliers=False, ax=ax
)

# Mark and label the arithmetic mean of each group (computed on raw values).
group_means = all_data.groupby("Source")["Protein"].mean()
for i, src in enumerate(order):
    ax.scatter(i, group_means[src], color="white", edgecolor="black", zorder=3, s=20)
    ax.text(i, group_means[src] * 1.2, f"mean = {group_means[src]:.0f}",
            ha='center', va='bottom', fontsize=10, color="blue")

ax.set_ylabel("Average Protein Expression (log10 scale)")

# A log axis cannot start at 0, so only the upper limit is set; this leaves
# headroom for the significance brackets drawn below.
ax.set_ylim(top=1e9)

# Significance brackets: (left x, right x, bracket base height, p-value).
# The p-values are the ones returned by bootstrap_pvalue_mean in the cell above.
bracket_scale = 1.3  # multiplicative bracket height (the y axis is logarithmic)
for x1, x2, y, pval in (
    (0, 2, 1e8, p_cod_all),   # Codon97 vs All
    (1, 2, 1e7, p_rna_all),   # RNAseq vs All
):
    ax.plot([x1, x1, x2, x2],
            [y, y * bracket_scale, y * bracket_scale, y], lw=1.5, c='k')
    ax.text((x1 + x2) / 2, y * bracket_scale * 1.05, f"p = {pval:.4f}",
            ha='center', va='bottom', fontsize=11, color="blue")

# Show the group size under each tick label.
counts = all_data.groupby("Source").size()
plt.xticks(
    ticks=range(len(order)),
    labels=[f"{src}\n(n={counts[src]})" for src in order]
)

plt.title("Distribution of Average Protein Expression", fontsize=14)
plt.xlabel("")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()


plt.show()
