Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
sdomanskyi committed Nov 8, 2019
1 parent 3e914ae commit cfd9bbc
Show file tree
Hide file tree
Showing 9 changed files with 317 additions and 11 deletions.
99 changes: 96 additions & 3 deletions DigitalCellSorter/VisualizationFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def makeMarkerExpressionPlot(self):

return

def makeMarkerSubplots(self, df, X_tsne, hugo_cd_dict, NoFrameOnFigures=False, HideClusterLabels=False, outlineClusters=True):
def makeMarkerSubplots(self, df, X_tsne, hugo_cd_dict, NoFrameOnFigures=False, HideClusterLabels=False, outlineClusters=True, analyzeBy='cluster'):

'''Produce subplots on each marker and its expression on all clusters
Expand Down Expand Up @@ -175,6 +175,10 @@ def makeMarkerSubplots(self, df, X_tsne, hugo_cd_dict, NoFrameOnFigures=False, H
outlineClusters: boolean, Default True
Whether to outline the clusters with circles
analyzeBy: str, Default 'cluster'
What level of lablels to include.
Other possible option is 'label'
Returns:
None
Expand Down Expand Up @@ -263,10 +267,12 @@ def MarkerSubplot(counter, marker, df, votingResults, X_tsne, cellClusterIndexLa

print('\nSaving marker subplots:\n')

votingAnalyzeBy = 'Predicted cell type' if analyzeBy=='label' else analyzeBy

df_votingResults = pd.read_excel(os.path.join(self.saveDir, self.dataName + '_voting.xlsx'), sheet_name='z-scores')
votingResults = dict(zip(df_votingResults['cluster'].values, df_votingResults['Predicted cell type'].values))
votingResults = dict(zip(df_votingResults[votingAnalyzeBy].values, df_votingResults['Predicted cell type'].values))

cellClusterIndexLabel = df.columns.get_level_values('cluster').values
cellClusterIndexLabel = df.columns.get_level_values(analyzeBy).values

for counter,marker in enumerate(df.index.values):
MarkerSubplot(counter, marker, pd.DataFrame(data=np.reshape(np.array(df.loc[marker]), (1,len(df.loc[marker]))), columns=df.columns, index=[marker]),
Expand Down Expand Up @@ -909,3 +915,90 @@ def makePlotOfNewMarkers(self, df_marker_cell_type, df_new_marker_cell_type):

return None

def makeTtestPlot(self, df, dfp, label=None, reorder=False, p_value_cutoff=0.05):

'''Produce heatmap plot of the new markers extracted from the annotated clusters
Parameters:
df: pandas.DataFrame
t-test p-Values calculated gene-pair-wise
label: str, Default None
Lebel to include in the plot
reorder: boolean, Default True
Reorder values to group similar
Returns:
None
Usage:
DCS = DigitalCellSorter.DigitalCellSorter()
DCS.makeTtestPlot(df)
'''

if reorder:

def metricCommonEuclidean(u,v):

where_common = (~np.isnan(u)) * (~np.isnan(v))

return np.sqrt(((u[where_common] - v[where_common]) ** 2).sum())

order = scipy.cluster.hierarchy.dendrogram(scipy.cluster.hierarchy.linkage(df.values, method='average', metric=metricCommonEuclidean), no_plot=True, get_leaves=True)['leaves']

df = df[df.columns.values[order]]
dfp = dfp[dfp.columns.values[order]]
df = df.loc[df.index.values[order]]
dfp = dfp.loc[dfp.index.values[order]]

df = df[df.columns[::-1]]
dfp = dfp[dfp.columns[::-1]]

fig = plt.figure(figsize=(5,5))

ax = fig.add_axes([0.35,0.02,0.63,0.63])

cmap = plt.cm.PuOr_r #BrBG #PiYG #seismic
cmap.set_bad('grey')

ax.imshow(df.values.astype(float), cmap=cmap, interpolation='None', aspect='auto')

wh = np.where(dfp.values.T<=p_value_cutoff)
ax.plot(wh[0], wh[1], '*k')

ax.set_xticks(range(df.shape[1]))
ax.set_yticks(range(df.shape[0]))
ax.set_xticklabels(df.columns.values, rotation=90, fontsize=8)
ax.set_yticklabels(df.index.values, rotation=0, fontsize=8)
ax.set_xlim([-0.5, df.shape[1] - 0.5])
ax.set_ylim([-0.5, df.shape[0] - 0.5])

ax.xaxis.tick_top()

if not label is None:
ax.text(-0.5, 1.5, label, transform=ax.transAxes, fontsize=10, color='k', ha='left', va='top').set_path_effects(
[path_effects.Stroke(linewidth=0.5, foreground='blue'),path_effects.Normal()])

ax.set_title('Two-tailed p-Value (t-test)')

data = df.values.flatten().astype(float)
data = data[np.where(~np.isnan(data))]
dataMin = np.min(data)
dataMax = np.max(data)

axisColor = fig.add_axes([0.22,0.75,0.08,0.02])

norm=matplotlib.colors.Normalize(vmin=dataMin, vmax=dataMax)
mapp=cm.ScalarMappable(norm=norm, cmap=cmap)
mapp.set_array(data)
fig.colorbar(mapp, cax=axisColor, ticks=[dataMax,dataMin], orientation='horizontal')
axisColor.tick_params(labelsize=4)
axisColor.set_xlabel('Statistic\n*p-Value < %s'%(p_value_cutoff), fontsize=5)

axisColor.set_yticklabels([np.round(dataMax,2), np.round(dataMin,2)])

fig.savefig(os.path.join(self.saveDir, self.dataName + '_ttest_%s.png'%(label.replace('\n', '_'))), dpi=300)

return None
116 changes: 108 additions & 8 deletions DigitalCellSorter/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,56 @@ def getAnomalyScores(self, trainingSet, testingSet, printResults=False):

return scores

def getExprOfGene(self, gene, analyzeBy='cluster'):

'''Get expression of a gene.
Run this function only after function process()
Parameters:
cells: pandas.MultiIndex
Index of cells of interest
analyzeBy: str, Default 'cluster'
What level of lablels to include.
Other possible options are 'label' and 'celltype'
Returns:
pandas.DataFrame
With expression of the cells of interest
Usage:
DCS = DigitalCellSorter.DigitalCellSorter()
DCS.process()
DCS.getExprOfGene('SDC1')
'''

try:
df_markers_expr = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_expr', mode='r').xs(key=gene, axis=0, level=2).T
except:
print('Gene %s not in index'%(gene))
return

df_markers_expr.index = [gene]

labelled = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_markers_expr', mode='r').columns

columns = pd.MultiIndex.from_arrays([labelled.get_level_values('batch'), labelled.get_level_values('cell')])
df_markers_expr = df_markers_expr.reindex(columns, axis=1).fillna(0.)

if analyzeBy=='celltype':
analyzeBy = 'label'
columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index

df_markers_expr.columns = pd.MultiIndex.from_arrays([columns.get_level_values('batch'),
columns.get_level_values('cell'),
columns.get_level_values(analyzeBy).str.split(' #', expand=True).get_level_values(0)], names=['batch', 'cell','celltype'])
else:
df_markers_expr.columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index

return df_markers_expr

def getExprOfCells(self, cells):

'''Get expression of a set of cells.
Expand All @@ -1178,7 +1228,7 @@ def getExprOfCells(self, cells):
DCS.process()
DCS.getDfExprOfSelectedCells(cells)
DCS.getExprOfCells(cells)
'''

df_expr = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_expr', mode='r')
Expand Down Expand Up @@ -1231,6 +1281,51 @@ def getAnomalyScoresPlot(self, cells='All'):

return None

def getIndividualGeneTtestPlot(self, gene, analyzeBy='label'):

'''Produce individual gene t-test plot of the two-tailed p-value.
Parameters:
gene: str
Name of gene of interest
analyzeBy: str, Default 'cluster'
What level of lablels to include.
Other possible options are 'label' and 'celltype'
Returns:
None
Usage:
DCS = DigitalCellSorter.DigitalCellSorter()
DCS.getIndividualGeneTtestPlot('SDC1')
'''

df_markers_expr = self.getExprOfGene(gene, analyzeBy=analyzeBy)

if df_markers_expr is None:

return None

groups = np.unique(df_markers_expr.columns.get_level_values(analyzeBy).values)

ttestStatistic = pd.DataFrame(index=groups, columns=groups)
ttestpValue = pd.DataFrame(index=groups, columns=groups)
for groupA in groups:
for groupB in groups:
A = df_markers_expr.xs(key=groupA, level=analyzeBy, axis=1).values.squeeze()
B = df_markers_expr.xs(key=groupB, level=analyzeBy, axis=1).values.squeeze()
ttestStatistic.loc[groupA, groupB], ttestpValue.loc[groupA, groupB] = scipy.stats.ttest_ind(A[np.where(A!=0)], B[np.where(B!=0)])

alt = self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]
alt = [alt] if type(alt) is str else alt
hugo_cd_dict = '%s\n(%s)'%(gene, ('\n').join(list(alt)))

self.makeTtestPlot(ttestStatistic, ttestpValue, label=hugo_cd_dict)

return None

def getIndividualGeneExpressionPlot(self, gene, hideClusterLabels=False, outlineClusters=True):

'''Produce individual gene expression plot on a tSNE layout
Expand All @@ -1254,17 +1349,22 @@ def getIndividualGeneExpressionPlot(self, gene, hideClusterLabels=False, outline
DCS.getIndividualGeneExpressionPlot('CD4')
'''

df_markers_expr = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_expr', mode='r').xs(key=gene, axis=0, level=2).T
df_markers_expr.index = [gene]
df_markers_expr = self.getExprOfGene(gene, analyzeBy='cluster')

df_tsne = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_tsne', mode='r')
df_markers_expr = df_markers_expr.reindex(df_tsne.columns, axis=1).fillna(0.)
if df_markers_expr is None:

labelled = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_markers_expr', mode='r').columns
df_markers_expr.columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])['cluster'].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', 'cluster']).index
return None

df_tsne = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_tsne', mode='r')

hugo_cd_dict = {gene: self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]}
self.makeMarkerSubplots(df_markers_expr, df_tsne.values, hugo_cd_dict, self.dataName, self.saveDir, NoFrameOnFigures=True, HideClusterLabels=hideClusterLabels, outlineClusters=outlineClusters)

self.makeMarkerSubplots(df_markers_expr,
df_tsne.values,
hugo_cd_dict,
NoFrameOnFigures=True,
HideClusterLabels=hideClusterLabels,
outlineClusters=outlineClusters)

return None

Expand Down
Binary file modified DigitalCellSorter/geneLists/CIBERSORT.xlsx
Binary file not shown.
Binary file modified DigitalCellSorter/geneLists/markersDCS.xlsx
Binary file not shown.
Binary file modified data.dat
Binary file not shown.
111 changes: 111 additions & 0 deletions docs/source/data.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
**Input Data Format**
=====================

Gene Expression Data Format


The input gene expression data is expected in one of the following formats:

1. Spreadsheet of comma-separated values ``csv`` containing condensed matrix in a form ``('cell', 'gene', 'expr')``.
If there are batches in the data the matrix has to be of the form ``('batch', 'cell', 'gene', 'expr')``. Columns order can be arbitrary.


+------+------+------+
| cell | gene | expr |
+======+======+======+
| C1 | G1 | 3 |
+------+------+------+
| C1 | G2 | 2 |
+------+------+------+
| C1 | G3 | 1 |
+------+------+------+
| C2 | G1 | 1 |
+------+------+------+
| C2 | G4 | 5 |
+------+------+------+
| ... | ... | ... |
+------+------+------+

or:

+--------+------+------+------+
| batch | cell | gene | expr |
+========+======+======+======+
| batch0 | C1 | G1 | 3 |
+--------+------+------+------+
| batch0 | C1 | G2 | 2 |
+--------+------+------+------+
| batch0 | C1 | G3 | 1 |
+--------+------+------+------+
| batch1 | C2 | G1 | 1 |
+--------+------+------+------+
| batch1 | C2 | G4 | 5 |
+--------+------+------+------+
| ... | ... | ... | ... |
+--------+------+------+------+




2. Spreadsheet of comma-separated values ``csv`` where rows are genes, columns are cells with gene expression counts.
If there are batches in the data the spreadsheet the first row should be ``'batch'`` and the second ``'cell'``.


+-------+--------+--------+--------+--------+
| cell | C1 | C2 | C3 | C4 |
+=======+========+========+========+========+
| G1 | | 3 | 1 | 7 |
+-------+--------+--------+--------+--------+
| G2 | 2 | 2 | | 2 |
+-------+--------+--------+--------+--------+
| G3 | 3 | 1 | | 5 |
+-------+--------+--------+--------+--------+
| G4 | 10 | | 5 | 4 |
+-------+--------+--------+--------+--------+
| ... | ... | ... | ... | ... |
+-------+--------+--------+--------+--------+

or:

+-------+--------+--------+--------+--------+
| batch | batch0 | batch0 | batch1 | batch1 |
+=======+========+========+========+========+
| cell | C1 | C2 | C3 | C4 |
+-------+--------+--------+--------+--------+
| G1 | | 3 | 1 | 7 |
+-------+--------+--------+--------+--------+
| G2 | 2 | 2 | | 2 |
+-------+--------+--------+--------+--------+
| G3 | 3 | 1 | | 5 |
+-------+--------+--------+--------+--------+
| G4 | 10 | | 5 | 4 |
+-------+--------+--------+--------+--------+
| ... | ... | ... | ... | ... |
+-------+--------+--------+--------+--------+

3. ``Pandas DataFrame`` where ``axis 0`` is genes and ``axis 1`` are cells.
If the are batched in the data then the index of ``axis 1`` should have two levels, e.g. ``('batch', 'cell')``,
with the first level indicating patient, batch or expreriment where that cell was sequenced, and the
second level containing cell barcodes for identification.

.. code:: python
df = pd.DataFrame(data=[[2,np.nan],[3,8],[3,5],[np.nan,1]],
index=['G1','G2','G3','G4'],
columns=pd.MultiIndex.from_arrays([['batch0','batch1'],['C1','C2']], names=['batch', 'cell']))
4. ``Pandas Series`` where index should have two levels, e.g. ``('cell', 'gene')``. If there are batched in the data
the first level should be indicating patient, batch or expreriment where that cell was sequenced, the second level cell barcodes for
identification and the third level gene names.

.. code:: python
se = pd.Series(data=[1,8,3,5,5],
index=pd.MultiIndex.from_arrays([['batch0','batch0','batch1','batch1','batch1'],
['C1','C1','C1','C2','C2'],
['G1','G2','G3','G1','G4']], names=['batch', 'cell', 'gene']))
Any of the data types outlined above need to be prepared/validated with a function ``prepare()``.

1 change: 1 addition & 0 deletions docs/source/examples.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
**Demo**
============


.. literalinclude:: ../examples/demo.py


Loading

0 comments on commit cfd9bbc

Please sign in to comment.