# GBA-PD Proteomics analysis (corrected for covariates)
The simple analysis of the cathepsin proteins is in "GBA_PD_proteomics_prep.ipynb".

## 1. Data prep

In [None]:
# Load patient information
import os
import pandas as pd
import numpy as np

# Both patient tables are tab-separated text files on the project mount.
pd_table_path = "/mnt/project/GBA1_PD_proteo.txt"
ctrl_table_path = "/mnt/project/GBA1_Ctrl_proteo.txt"

GBA1_PD_proteo = pd.read_csv(pd_table_path, sep="\t")
GBA1_Ctrl_proteo = pd.read_csv(ctrl_table_path, sep="\t")

# Cell output: preview of the PD table.
GBA1_PD_proteo.head()

In [None]:
# Load the protein abundance table (tab-separated; a copy is also kept in
# DNAnexus:proteomics_results).
proteomics_path = "/mnt/project/proteomics_results/complete_proteomics_df.txt"
proteomics = pd.read_csv(proteomics_path, sep="\t")

# Cell output: first five rows.
proteomics.head()

In [None]:
# Look for CTS proteins.
# Strip the "olink_instance_0." marker from the column names. The pattern is
# passed as an explicit regex with the dot escaped, so the result does not
# depend on the pandas-version-specific default of `regex` (older releases
# treat an unescaped "." as "any character").
proteomics.columns = proteomics.columns.str.replace(
    r"olink_instance_0\.", "", case=False, regex=True
)

# Columns whose name starts with "cts", compared case-insensitively.
matches2 = [col for col in proteomics.columns if col.lower().startswith("cts")]
matches2

In [None]:
# Keep only the abundance rows whose `eid` appears among the `IID` values of
# the corresponding group table.
is_pd_row = proteomics["eid"].isin(GBA1_PD_proteo["IID"])
is_ctrl_row = proteomics["eid"].isin(GBA1_Ctrl_proteo["IID"])

GBA1_PD_abund = proteomics.loc[is_pd_row]
GBA1_Ctrl_abund = proteomics.loc[is_ctrl_row]

In [None]:
# Number of abundance rows retained for the PD group.
GBA1_PD_abund.shape[0]

In [None]:
# Number of abundance rows retained for the control group.
GBA1_Ctrl_abund.shape[0]

In [None]:
# Restrict the PD abundances to the `eid` column plus the CTS columns.
pd_cts_columns = ["eid", *matches2]
GBA1_PD_abund_cts = GBA1_PD_abund.loc[:, pd_cts_columns]
GBA1_PD_abund_cts.head()

In [None]:
# Restrict the control abundances to the `eid` column plus the CTS columns.
ctrl_cts_columns = ["eid", *matches2]
GBA1_Ctrl_abund_cts = GBA1_Ctrl_abund.loc[:, ctrl_cts_columns]
GBA1_Ctrl_abund_cts.head()

In [None]:
# Load the metadata table merged in below (age at baseline and sex);
# tab-separated, also kept in DNAnexus:proteomics_results.
metadata_path = "/mnt/project/proteomics_results/age_sex_proteomics_df.txt"
metadata = pd.read_csv(metadata_path, sep="\t")

# Cell output: first five rows.
metadata.head()

In [None]:
# Attach the metadata columns to the PD cathepsin table, matching
# metadata["participant.eid"] to the table's "eid". The join is an inner join,
# so only ids present on both sides are kept.
# NOTE: the result overwrites GBA1_PD_abund_cts, so re-running this cell would
# merge the metadata a second time; re-run the column-selection cell first.
GBA1_PD_abund_cts = metadata.merge(
    GBA1_PD_abund_cts,
    how="inner",
    left_on="participant.eid",
    right_on="eid",
)
GBA1_PD_abund_cts.head()

In [None]:
# Attach the metadata columns to the control cathepsin table, matching
# metadata["participant.eid"] to the table's "eid". The join is an inner join,
# so only ids present on both sides are kept.
# NOTE: the result overwrites GBA1_Ctrl_abund_cts, so re-running this cell
# would merge the metadata a second time; re-run the column-selection cell
# first.
GBA1_Ctrl_abund_cts = metadata.merge(
    GBA1_Ctrl_abund_cts,
    how="inner",
    left_on="participant.eid",
    right_on="eid",
)
GBA1_Ctrl_abund_cts.head()

In [None]:
# Number of PD rows left after the merge with the metadata table.
GBA1_PD_abund_cts.shape[0]

In [None]:
# Number of control rows left after the merge with the metadata table.
GBA1_Ctrl_abund_cts.shape[0]

## 2. Differential expression, corrected for covariates
Multiple linear regression is fitted for the cathepsin (CTS) proteins only, adjusting for age and sex.

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

# --------------------------------------------------------
# 1. Build a unified dataframe with covariates + proteins
# --------------------------------------------------------

# Label each group table in place: 1 for the PD table, 0 for the control table.
for group_frame, group_label in ((GBA1_PD_abund_cts, 1), (GBA1_Ctrl_abund_cts, 0)):
    group_frame["case_status"] = group_label

# Stack both groups into one frame and give the covariate columns short names.
covariate_names = {"participant.p21003_i0": "age", "participant.p31": "sex"}
df = pd.concat(
    [GBA1_PD_abund_cts, GBA1_Ctrl_abund_cts], ignore_index=True
).rename(columns=covariate_names)

# Every column that is not an id, the phenotype, or a covariate is treated as
# a protein column.
# NOTE(review): this assumes the metadata table contributes only the id, age
# and sex columns; any additional metadata column would be picked up here as
# a protein -- verify against the metadata file.
non_protein_cols = ("participant.eid", "eid", "case_status", "age", "sex")
protein_cols = [c for c in df.columns if c not in non_protein_cols]

In [None]:
# --------------------------------------------------------
# 2. Run regression for each protein
# --------------------------------------------------------
# For every protein, fit OLS  protein ~ const + case_status + age + sex  on the
# rows that have no missing value in those four columns. The case_status
# coefficient and its p-value are kept together with the per-group means.

covariates = ["case_status", "age", "sex"]
n_params = len(covariates) + 1  # covariates + intercept

results_list = []

for protein in protein_cols:
    tmp = df[covariates + [protein]].dropna()

    # Skip proteins that cannot support the model:
    #  * both groups must be present -- if case_status were constant,
    #    sm.add_constant (default has_constant="skip") would add no intercept
    #    and the "case_status" coefficient would really be the intercept;
    #  * there must be more rows than fitted parameters, otherwise no residual
    #    degrees of freedom remain and the p-value cannot be computed.
    # Skipping (rather than recording NaN) keeps missing p-values out of the
    # multiple-testing correction applied to `results` afterwards.
    if tmp["case_status"].nunique() < 2 or len(tmp) <= n_params:
        print(
            f"Skipping {protein}: only one group present or too few complete "
            f"observations (n={len(tmp)})"
        )
        continue

    # y = protein
    y = tmp[protein]

    # X = const + case_status + age + sex
    X = sm.add_constant(tmp[covariates])

    # OLS
    model = sm.OLS(y, X).fit()

    # Extract results for the case_status coefficient
    beta = model.params["case_status"]
    pval = model.pvalues["case_status"]

    # Unadjusted group means over the same rows used in the fit
    mean_case = tmp.loc[tmp.case_status == 1, protein].mean()
    mean_control = tmp.loc[tmp.case_status == 0, protein].mean()

    results_list.append([protein, mean_case, mean_control, beta, pval])

# Convert to dataframe (one row per fitted protein)
results = pd.DataFrame(results_list, columns=[
    "protein_name", "mean_case", "mean_control", "beta_case_status", "p_value"
])

In [None]:
# --------------------------------------------------------
# 3. FDR correction
# --------------------------------------------------------

# Benjamini-Hochberg adjustment across all rows of `results`; element 1 of the
# tuple returned by multipletests holds the corrected p-values.
bh_output = multipletests(results["p_value"], method="fdr_bh")
results["FDR_p_value"] = bh_output[1]

In [None]:
# Order the table by raw p-value, smallest first.
results = results.sort_values(by="p_value", ascending=True)

# Cell output: the sorted results table.
results

In [None]:
# Upload final files
# Write the results table to a tab-separated file in the current working
# directory, without the index column.
results.to_csv("regression_CTS_proteins_GBA-PD_vs_GBA-Ctrl_age-sex_corrected.txt", sep="\t", index=False)
# IPython shell escape: upload that file with the `dx` command-line tool to
# proteomics_results/. The file name must match the one written above.
!dx upload regression_CTS_proteins_GBA-PD_vs_GBA-Ctrl_age-sex_corrected.txt --destination proteomics_results/