In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append("/mnt/lareaulab/reliscu/code")

from parse_gtf import *

In [None]:
# Exon-level PSI table. The first CSV column is loaded as the row index because
# later cells parse gene/exon IDs out of `psi.index` with the `.str` accessor,
# which is not available on the default integer RangeIndex.
# NOTE(review): assumes the first column holds the exon IDs — confirm against the file.
psi = pd.read_csv("data/hahn_2023_cortex_STAR_exon_PSI.csv", index_col=0)

# Module table; later cells read its 'Cell_type', 'ME_path' and 'Module' columns.
# FIXME: this path looks like a directory rather than a CSV file, so read_csv
# will not be able to open it — fill in the enrichment table's file name.
top_qval_mods_df = pd.read_csv("data/enrichments/")

### Add gene names to PSI data

In [5]:
# Load the annotation and expand the attribute field of its gene records
gtf_file = "/mnt/lareaulab/reliscu/data/GENCODE/GRCm39/gencode.vM35.annotation.gtf"
gtf = gtf_parse(gtf_file)

# Keep only rows whose feature is "gene"
is_gene = gtf['feature'].isin(["gene"])
gtf_subset = gtf.loc[is_gene]

# Run extract_attributes on every attribute value, then spread each result
# into its own set of columns
attrs = gtf_subset["attribute"].apply(extract_attributes)
attrs_df = attrs.apply(pd.Series)

# Swap the raw attribute column for the expanded columns
gtf_parsed = pd.concat(
    [gtf_subset.drop(columns=["attribute"]), attrs_df],
    axis=1,
)

In [10]:
# Build a common gene_id key on both tables ahead of the merge.
# GTF side: keep only the part of gene_id before the first "."
gtf_parsed['gene_id'] = gtf_parsed['gene_id'].str.split(".").str.get(0)

# PSI side: the token before the first "_" of each row label becomes gene_id,
# and the full row label is kept as exon_id
exon_labels = psi.index
psi['gene_id'] = exon_labels.str.split("_").str.get(0)
psi['exon_id'] = exon_labels.to_numpy()

In [13]:
# Attach a gene name to every exon row of the PSI table.
# The lookup is de-duplicated on gene_id first: if the same gene_id appeared
# more than once in the annotation, the right merge would silently emit
# several copies of each matching exon row (and a non-unique exon index).
# When duplicates exist, the first gene_name seen for that gene_id is kept.
gene_names = gtf_parsed[['gene_id', 'gene_name']].drop_duplicates(subset="gene_id")

# how="right" keeps every PSI row, including exons whose gene_id has no match
# in the annotation (their gene_name is NaN).
psi_anno = pd.merge(gene_names, psi, on="gene_id", how="right")
assert len(psi_anno) == len(psi), "merge changed the number of exon rows"

# Restore the exon IDs as an unnamed index and drop the merge key, leaving
# gene_name as the first column followed by the PSI columns.
psi_anno = psi_anno.set_index("exon_id").rename_axis(None)
psi_anno = psi_anno.drop(columns=["gene_id"])

### Calculate correlation between module eigengenes (MEs) and exon PSI

In [15]:
# Result table: one row per exon, a "Gene" column, then one column per entry
# in top_qval_mods_df['Cell_type']
cell_types = top_qval_mods_df['Cell_type'].tolist()
corr_df = pd.DataFrame(index=psi_anno.index, columns=["Gene", *cell_types])
corr_df['Gene'] = psi_anno['gene_name']

# Everything after the leading gene_name column is PSI data
psi_values = psi_anno.iloc[:, 1:]

module_specs = zip(
    top_qval_mods_df['Cell_type'],
    top_qval_mods_df['ME_path'],
    top_qval_mods_df['Module'],
)
for ctype, me_path, module in module_specs:
    # Read this module's column from its ME file, indexed by sample;
    # values that cannot be parsed as numbers become NaN
    eigengene = pd.read_csv(me_path).set_index("Sample")[module]
    eigengene = pd.to_numeric(eigengene, errors="coerce")

    # Row-wise correlation: each exon's PSI values against the module series
    corr_df[ctype] = psi_values.corrwith(eigengene, axis=1)

In [16]:
# Preview the first five rows of the correlation table
corr_df.head(n=5)

Unnamed: 0,Gene,SMC-Peri,Pvalb,Lamp5,L2_3_IT,L6b,L6_CT,L4_5_IT,Vip,L5_6_NP,Sst_Chodl,L6_IT,Sncg,L5_PT,L5_IT
ENSMUSG00000033845_ProteinCoding_1,Mrpl15,0.292223,-0.14482,0.020312,0.186401,0.018275,-0.19735,0.037798,-0.071198,-0.026732,-0.015332,0.093463,-0.00295,0.128743,-0.138159
ENSMUSG00000025903_ProteinCoding_1,Lypla1,0.01009,-0.047604,-0.153901,0.209876,-0.06276,-0.020802,0.043826,0.130093,0.039971,0.103549,-0.02369,-0.060934,0.106918,-0.158337
ENSMUSG00000025903_ProteinCoding_2,Lypla1,0.087887,-0.055226,0.05841,0.134998,-0.174957,-0.066532,0.017559,0.03504,0.07761,0.050214,-0.049983,-0.008342,0.159381,-0.132973
ENSMUSG00000002459_ProteinCoding_1,Rgs20,0.044754,-0.12348,-0.132085,-0.088376,0.11044,0.096169,0.000393,-0.001209,0.040877,-0.143903,0.123422,-0.076833,0.212663,0.103155
ENSMUSG00000002459_ProteinCoding_2,Rgs20,0.050632,0.104558,0.031212,0.028741,0.224542,0.020624,-0.123236,-0.071114,0.0745,0.071965,-0.051668,0.118239,-0.128358,-0.143223


In [None]:
# Save the exon-by-cell-type correlation table; the exon IDs in the index are
# written as the unnamed first column. The original path interpolated
# `psi_data`, which is never defined in this notebook (NameError), so the
# file name now mirrors the input PSI file name instead.
corr_df.to_csv("data/hahn_2023_cortex_STAR_exon_corr.csv")