In [None]:
import pandas as pd
from pathlib import Path
import json

# Motor Neuron Bulk Transcriptomics

In [None]:
mn_path = Path("../data/GSE76220_norm_counts_TPM_GRCh38.p13_NCBI.tsv")
hgnc_path = Path("../data/hgnc_19270_genes_with_protein_product.txt")

# Keep only the two HGNC columns needed to translate IDs into symbols.
hgnc = pd.read_table(hgnc_path, sep="\t")[['symbol', 'entrez_id']]

# Expression table indexed by its first column
# (presumably Entrez gene IDs, since it is mapped through `entrez_id` below — confirm against the file).
expr = pd.read_table(mn_path, sep="\t", index_col=0)
expr = expr.drop(columns=["GSM1977034"])  # drop duplicated sample

# Build the Entrez ID -> symbol lookup.
# Index.map with a Series mapper needs a unique mapper index; rows with a
# missing entrez_id (several NaN keys) or a repeated entrez_id would make it
# raise InvalidIndexError, so they are removed before indexing.
entrez_to_symbol = (
    hgnc
    .dropna(subset=["entrez_id"])
    .drop_duplicates(subset="entrez_id", keep="first")
    .set_index("entrez_id")["symbol"]
)

# replace index
expr.index = expr.index.map(entrez_to_symbol)

# remove genes that didn't map (now NaN index)
expr = expr[~expr.index.isna()]

expr.to_csv("../data/mn_tpm.csv")
expr.head()

# iMG ALS upregulated ligands

In [None]:
def read_and_filter(path, sheet_idx, padj_threshold=0.05, lfc_threshold=0.0):
    """Return the set of gene symbols that are significantly upregulated in one sheet.

    Parameters
    ----------
    path : str or path-like
        Excel workbook to read.
    sheet_idx : int or str
        Passed to ``pd.read_excel`` as ``sheet_name``.
    padj_threshold : float, default 0.05
        A row is kept only when its ``padj`` is strictly below this value.
    lfc_threshold : float, default 0.0
        A row is kept only when its ``log2FoldChange`` is strictly above this value.

    Returns
    -------
    set
        Unique non-missing values of the ``gene_symbol`` column for the rows
        that pass both cut-offs.

    Raises
    ------
    KeyError
        If the sheet lacks a ``gene_symbol``, ``padj`` or ``log2FoldChange`` column.
    """
    df = pd.read_excel(path, sheet_name=sheet_idx)
    gene_col = "gene_symbol"
    padj_col = "padj"
    lfc_col = "log2FoldChange"

    # Non-numeric cells become NaN; NaN fails both comparisons below,
    # so such rows are excluded.
    df[padj_col] = pd.to_numeric(df[padj_col], errors="coerce")
    df[lfc_col] = pd.to_numeric(df[lfc_col], errors="coerce")

    filt = (df[padj_col] < padj_threshold) & (df[lfc_col] > lfc_threshold)

    # Drop rows with no gene symbol: a NaN/None in the result set would break
    # sorting and JSON serialisation of the gene list.
    keep = df.loc[filt, gene_col].dropna().tolist()

    return set(keep)


In [None]:
mg_path = Path("../data/all_bioinformatics_results.xlsx")

# Upregulated gene sets taken from sheet positions 2 and 3 of the workbook.
mg_g29 = read_and_filter(mg_path, sheet_idx=2)
mg_c9 = read_and_filter(mg_path, sheet_idx=3)

# Genes present in both sets, sorted so the saved list has a stable order.
mg_als = sorted(mg_g29.intersection(mg_c9))
print(f"Number of overlapping upregulated genes: {len(mg_als)}")

# Write the shared list first, then the G29 and C9 upregulated genes.
for out_path, genes in (
    ("../data/mg_als_up_gene_list.json", mg_als),
    ("../data/mg_g29_up_gene_list.json", mg_g29),
    ("../data/mg_c9_up_gene_list.json", mg_c9),
):
    with open(out_path, "w") as f:
        json.dump(sorted(genes), f, indent=2)