In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
import pickle
import pandas as pd
import scanpy as sc

In [None]:
# Dataset to process; every input and output path below is derived from this name.
# Valid choices: 'iGlut_post', 'iGlut_pre', 'iGABA_pre', 'iGABA_post'.
# NOTE(review): the saved output of the loading cell prints 'iGABA_post', so the
# stored outputs were produced with a different setting than the one shown here.
sample = 'iGlut_post'


In [3]:
# Load the per-cell metadata table and the matching AnnData object for `sample`.
print(sample)

meta = pd.read_csv("scanpy/"+sample+"_dr_clustered_raw_merged_meta.tsv",sep="\t",index_col=0)
# Move 'CycA' to 'M_CycA' so the column carries the same 'M_' prefix as the
# names listed in `morphogens`; pop + assign appends it as the last column.
meta['M_CycA'] = meta.pop('CycA')

# One label per cell combining both axis columns (e.g. 'ctrl_ctrl').
meta['condition'] = meta['AP_axis'] + "_" + meta['DV_axis']
print(meta['Basal_media'].value_counts())

adata = sc.read_h5ad("scanpy/"+sample+"_dr_clustered_raw_merged.h5ad")
adata.obs['BC'] = adata.obs.index  # keep the obs index as a regular column too
adata = adata[meta.index,:]  # keep only the cells listed in `meta`, in that order

# Cell x gene expression table backed by sparse columns.
# Assumes adata.X is a scipy sparse matrix (from_spmatrix requires one).
dgem = pd.DataFrame.sparse.from_spmatrix(adata.X)
dgem.index = adata.obs.index
dgem.columns = adata.var_names

# Metadata column names of the morphogen treatments.
morphogens = ['M_'+x for x in ['XAV','CHIR','RA','FGF8','BMP4','SHH','CycA']] #don't add PM



iGABA_post
mTeSR    85756
Name: Basal_media, dtype: int64


In [4]:
# Control cells are those whose combined condition label is 'ctrl_ctrl'.
is_ctrl = meta['condition'].isin(['ctrl_ctrl'])
ctrl_cells = meta.index[is_ctrl].tolist()

# "Group 1" is every cell in `meta`, so it also contains the control cells.
g1_cells = meta.index.tolist()

# Controls followed by all cells (control barcodes therefore appear twice).
sel_cells = ctrl_cells + g1_cells

In [7]:
# Read the transcription-factor list (headerless, tab-separated; the names are
# taken from the first column) and keep only those present in adata.var_names.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run elsewhere unless the file is available at this location.
tf_table = pd.read_csv(
    "/home/jjanssens/jjans/resources/geneLists/human_TFs/utoronto_human_tfs_v_1.01.txt",
    sep="\t",
    header=None,
)
human_TFs = [tf for tf in list(tf_table[0]) if tf in adata.var_names]

In [8]:
# TF-only expression matrices: group 1 is all cells, group 2 the control cells.
expression_group1, expression_group2 = (
    adata[cells, human_TFs].X for cells in (g1_cells, ctrl_cells)
)


In [9]:
# Display the matrix summary (shape, dtype, storage format) as a sanity check.
expression_group1

<85756x1622 sparse matrix of type '<class 'numpy.float32'>'
	with 14209380 stored elements in Compressed Sparse Row format>

In [10]:
from scipy.spatial import distance_matrix
from scipy.spatial.distance import cdist, pdist


In [None]:
# Cosine distance between every cell (rows) and every control cell (columns),
# computed on the TF expression matrices.
#
# The sparse inputs are densified under separate names with .toarray():
#  * expression_group1/2 are left untouched, so re-running this cell works
#    (overwriting them with dense data made a second run fail on .todense());
#  * .toarray() yields a plain ndarray rather than the np.matrix from .todense().
# NOTE(review): a cell with no TF counts at all has an undefined cosine distance
# (expect NaN in its row/column) - confirm such cells are filtered upstream.
tf_expr_all = expression_group1.toarray()
tf_expr_ctrl = expression_group2.toarray()

distances = pd.DataFrame(
    cdist(tf_expr_all, tf_expr_ctrl, metric='cosine'),
    index=g1_cells,
    columns=ctrl_cells,
)

# The dense copies are only needed for cdist; release them right away.
del tf_expr_all, tf_expr_ctrl

distances.to_csv(sample+'_cosine_distances.tsv',sep='\t')

In [None]:
# For every cell, average its distances to the 10 closest control cells.
# `distances` has one row per cell and one column per control cell, so the
# statistic is computed row-wise with NumPy instead of a per-cell Python loop.
# NOTE(review): control cells are themselves rows of `distances`, so each one
# has a zero distance to itself among its 10 nearest - confirm this is intended.
n_nearest_ctrl = 10
row_chunk = 2048  # rows sorted per step; bounds the size of the temporary copy

dist_values = distances.to_numpy()
mdist = np.empty(dist_values.shape[0], dtype=float)
for start in range(0, dist_values.shape[0], row_chunk):
    stop = start + row_chunk
    # np.sort places NaN last, so the first columns hold the smallest valid
    # distances; nanmean then ignores any NaN that remains in the selection.
    nearest = np.sort(dist_values[start:stop], axis=1)[:, :n_nearest_ctrl]
    mdist[start:stop] = np.nanmean(nearest, axis=1)

dist_summary = pd.DataFrame({'cell': list(distances.index), 'mdist': mdist})

In [None]:
# Index the summary by cell barcode so it aligns with `meta` on assignment.
dist_summary.index = dist_summary['cell']

# Default of 1 for cells without a computed value. Use a float so that
# writing the float distances below does not have to upcast an int column.
meta['mdist'] = 1.0
meta.loc[dist_summary.index,'mdist'] = dist_summary['mdist']

In [None]:
# Save the per-cell summary. The full pairwise matrix is not written again
# here: the cell that computes `distances` already saves it to
# sample+'_cosine_distances.tsv' and the frame has not changed since.
dist_summary.to_csv(sample+"_cosine_distances_summary.tsv",sep='\t')

In [None]:
# Same all-cells-vs-control comparison, now as Euclidean distance in PCA space.
pca_coords = adata.obsm['X_pca']
X_pca = pd.DataFrame(pca_coords, index=adata.obs_names)

# .loc slices by label and includes both ends, so 0:49 selects components
# 0..49 (up to 50 columns, fewer if X_pca stores fewer components).
pc_columns = slice(0, 49)
expression_group1 = X_pca.loc[g1_cells, pc_columns]
expression_group2 = X_pca.loc[ctrl_cells, pc_columns]

# Rows are all cells, columns are control cells.
distances = pd.DataFrame(
    cdist(expression_group1, expression_group2, metric='euclidean'),
    index=g1_cells,
    columns=ctrl_cells,
)
distances.to_csv(sample+'_euclid_distances_PCA_50.tsv', sep='\t')

In [None]:
# For every cell, average its distances to the 10 closest control cells.
# `distances` has one row per cell and one column per control cell, so the
# statistic is computed row-wise with NumPy instead of a per-cell Python loop.
n_nearest_ctrl = 10
row_chunk = 2048  # rows sorted per step; bounds the size of the temporary copy

dist_values = distances.to_numpy()
mdist = np.empty(dist_values.shape[0], dtype=float)
for start in range(0, dist_values.shape[0], row_chunk):
    stop = start + row_chunk
    # np.sort places NaN last, so the first columns hold the smallest valid
    # distances; nanmean then ignores any NaN that remains in the selection.
    nearest = np.sort(dist_values[start:stop], axis=1)[:, :n_nearest_ctrl]
    mdist[start:stop] = np.nanmean(nearest, axis=1)

dist_summary = pd.DataFrame({'cell': list(distances.index), 'mdist': mdist})

# Index by cell barcode; the 'cell' column is kept as well.
dist_summary.index = dist_summary['cell']

In [None]:
# Save the per-cell PCA-distance summary as a tab-separated file.
summary_path = sample+"_euclid_distances_PCA_50_summary.tsv"
dist_summary.to_csv(summary_path, sep='\t')