In [None]:
import scanpy as sc
import anndata as ad
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.linear_model import ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the AnnData object from its .h5ad file.
adata = sc.read_h5ad("./data/hultcd34_full.h5ad")

# Read the per-cell metadata table, relabel the 'Unnamed: 0' column as 'cell',
# and use it as the index.
metadata = pd.read_csv('./data/metadata.csv')
metadata = metadata.rename(columns={'Unnamed: 0': 'cell'})
metadata = metadata.set_index('cell')

# NOTE(review): this replaces adata.obs with the CSV table as-is; no alignment
# against the cells already in `adata` is done here. Presumably the CSV rows are
# in the same order as the cells in the .h5ad file -- TODO confirm.
adata.obs = metadata

Every cell whose `cluster` value in the metadata DataFrame is **not** `Uncommitted` is a lineage progenitor.

HSCs are the cells whose `subcluster` value in the metadata DataFrame is `HSC`.

In [3]:
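# Optional normalization (total-count normalization followed by log1p).
# To use it, uncomment both lines below and run this cell before `df` is built from `adata`.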

# sc.pp.normalize_total(adata)
# sc.pp.log1p(adata)

In [7]:
# GEP usage table, indexed by its 'Cell' column.
cell_GEP_mapping = pd.read_csv(
    r'./data/Factorized matrices from human lifetime scRNA(GEP usage per cell).csv'
).set_index('Cell')

# Regression target. Double brackets keep it a one-column DataFrame so it can
# be merged like any other table.
target = cell_GEP_mapping[['GEP 15 (DNA Replication)']]

# Start from the expression matrix, then attach the target and the metadata by
# index. `merge` defaults to an inner join, so only index values present in all
# three tables survive. Column order matters downstream: expression columns
# first, then the target, then the metadata columns.
df = adata.to_df()
df = df.merge(target, left_index=True, right_index=True)
df = df.merge(metadata, left_index=True, right_index=True)

df.head()

Unnamed: 0,X5S-rRNA,X5-8S-rRNA,X7SK,A1BG,A1BG.AS1,A1CF,A2M,A2M.AS1,A2ML1,A2ML1.AS1,...,GEP 15 (DNA Replication),orig.ident,nCount_RNA,nFeature_RNA,sample,age,cluster,broad_age_range,narrow_age_range,subcluster
BM1_bcBSQM,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.111207,SeuratProject,5125,2140,BM1,25yr-1,Uncommitted,Adult,Adult,MPP-1
BM1_bcCHLB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07033,SeuratProject,5050,2116,BM1,25yr-1,Uncommitted,Adult,Adult,HSC
BM1_bcFZFC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.161894,SeuratProject,4761,2024,BM1,25yr-1,Uncommitted,Adult,Adult,MPP-1
BM1_bcGZEW,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.083941,SeuratProject,4119,1754,BM1,25yr-1,Uncommitted,Adult,Adult,HSC
BM1_bcFMRM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.225007,SeuratProject,4575,1952,BM1,25yr-1,Uncommitted,Adult,Adult,MPP-1


# HSC regression

In [11]:
def get_X_HSC():
    """Return the feature matrix for HSC cells.

    Selects the rows of the module-level ``df`` whose ``subcluster`` is
    ``'HSC'`` and drops the last 10 columns. In the ``df.head()`` preview those
    trailing columns are the GEP target followed by the nine metadata columns,
    so what remains are the expression columns.
    """
    is_hsc = df['subcluster'] == 'HSC'
    hsc_rows = df.loc[is_hsc]
    # Positional slice: relies on the target + metadata being the final 10 columns.
    return hsc_rows.iloc[:, :-10]

def get_y_HSC():
    """Return the regression target for HSC cells.

    Selects the ``'GEP 15 (DNA Replication)'`` column of the module-level
    ``df`` for the rows whose ``subcluster`` is ``'HSC'``.
    """
    is_hsc = df['subcluster'] == 'HSC'
    return df.loc[is_hsc, 'GEP 15 (DNA Replication)']

In [None]:
# Hold out part of the HSC cells for evaluation. A fixed random_state makes the
# split (and therefore the reported R^2 values) reproducible across re-runs.
X_train_HSC, X_test_HSC, y_train_HSC, y_test_HSC = train_test_split(
    get_X_HSC(), get_y_HSC(), random_state=42
)

# n_jobs is a constructor argument of LinearRegression; passing it to .fit()
# raises TypeError because fit only accepts X, y and sample_weight.
HSC_model = LinearRegression(n_jobs=-1).fit(X_train_HSC, y_train_HSC)

In [None]:
# R^2 of the fitted HSC model on the held-out split.
hsc_test_r2 = HSC_model.score(X_test_HSC, y_test_HSC)
print(f'HSC model R^2 (test): {hsc_test_r2}')

HSC model R^2: 0.3664712870371172


In [None]:
# R^2 of the fitted HSC model on the data it was trained on (compare with the
# test score to gauge overfitting).
hsc_train_r2 = HSC_model.score(X_train_HSC, y_train_HSC)
print(f'HSC model R^2 (train): {hsc_train_r2}')

In [16]:
# One row per input feature with its fitted coefficient, plus a final
# 'Intercept' row. ignore_index=True renumbers the combined rows so the
# intercept does not reuse index label 0, which would otherwise duplicate the
# first feature's label.
HSC_weights = pd.concat(
    [
        pd.DataFrame({'Feature': X_train_HSC.columns, 'Weight': HSC_model.coef_}),
        pd.DataFrame({'Feature': ['Intercept'], 'Weight': [HSC_model.intercept_]}),
    ],
    ignore_index=True,
)
HSC_weights

Unnamed: 0,Feature,Weight
0,X5S-rRNA,0.000323
1,X5-8S-rRNA,0.001333
2,X7SK,0.000513
3,A1BG,0.000572
4,A1BG.AS1,0.000335
...,...,...
41565,snoZ40,-0.000033
41566,snoZ6,-0.000037
41567,snosnR66,0.000000
41568,uc-338,-0.001148


# progenitor regression

In [19]:
def get_X_prog():
    """Return the feature matrix for lineage-progenitor cells.

    Selects the rows of the module-level ``df`` whose ``cluster`` is anything
    other than ``'Uncommitted'`` and drops the last 10 columns. In the
    ``df.head()`` preview those trailing columns are the GEP target followed by
    the nine metadata columns, so what remains are the expression columns.
    """
    is_progenitor = df['cluster'] != 'Uncommitted'
    progenitor_rows = df.loc[is_progenitor]
    # Positional slice: relies on the target + metadata being the final 10 columns.
    return progenitor_rows.iloc[:, :-10]

def get_y_prog():
    """Return the regression target for lineage-progenitor cells.

    Selects the ``'GEP 15 (DNA Replication)'`` column of the module-level
    ``df`` for the rows whose ``cluster`` is anything other than
    ``'Uncommitted'``.
    """
    is_progenitor = df['cluster'] != 'Uncommitted'
    return df.loc[is_progenitor, 'GEP 15 (DNA Replication)']

In [None]:
# Hold out part of the progenitor cells for evaluation. A fixed random_state
# makes the split (and therefore the reported R^2 values) reproducible.
X_train_prog, X_test_prog, y_train_prog, y_test_prog = train_test_split(
    get_X_prog(), get_y_prog(), random_state=42
)

# n_jobs is a constructor argument of LinearRegression; passing it to .fit()
# raises TypeError because fit only accepts X, y and sample_weight.
prog_model = LinearRegression(n_jobs=-1).fit(X_train_prog, y_train_prog)

In [None]:
# R^2 of the fitted progenitor model on the held-out split.
prog_test_r2 = prog_model.score(X_test_prog, y_test_prog)
print(f'prog model R^2 (test): {prog_test_r2}')

In [None]:
# R^2 of the fitted progenitor model on the data it was trained on (compare
# with the test score to gauge overfitting).
prog_train_r2 = prog_model.score(X_train_prog, y_train_prog)
print(f'prog model R^2 (train): {prog_train_r2}')

In [None]:
# One row per input feature with its fitted coefficient, plus a final
# 'Intercept' row. ignore_index=True renumbers the combined rows so the
# intercept does not reuse index label 0, which would otherwise duplicate the
# first feature's label.
prog_weights = pd.concat(
    [
        pd.DataFrame({'Feature': X_train_prog.columns, 'Weight': prog_model.coef_}),
        pd.DataFrame({'Feature': ['Intercept'], 'Weight': [prog_model.intercept_]}),
    ],
    ignore_index=True,
)
prog_weights