update

sdomanskyi · Nov 8, 2019 · cfd9bbc · cfd9bbc
1 parent 3e914ae
commit cfd9bbc
Show file tree

Hide file tree

Showing 9 changed files with 317 additions and 11 deletions.
diff --git a/DigitalCellSorter/VisualizationFunctions.py b/DigitalCellSorter/VisualizationFunctions.py
@@ -146,7 +146,7 @@ def makeMarkerExpressionPlot(self):
 
         return
 
-    def makeMarkerSubplots(self, df, X_tsne, hugo_cd_dict, NoFrameOnFigures=False, HideClusterLabels=False, outlineClusters=True):
+    def makeMarkerSubplots(self, df, X_tsne, hugo_cd_dict, NoFrameOnFigures=False, HideClusterLabels=False, outlineClusters=True, analyzeBy='cluster'):
 
         '''Produce subplots on each marker and its expression on all clusters
 
@@ -175,6 +175,10 @@ def makeMarkerSubplots(self, df, X_tsne, hugo_cd_dict, NoFrameOnFigures=False, H
             outlineClusters: boolean, Default True
                 Whether to outline the clusters with circles
 
+            analyzeBy: str, Default 'cluster'
+                What level of lablels to include.
+                Other possible option is 'label'
+
         Returns:
             None
         
@@ -263,10 +267,12 @@ def MarkerSubplot(counter, marker, df, votingResults, X_tsne, cellClusterIndexLa
 
         print('\nSaving marker subplots:\n')
 
+        votingAnalyzeBy = 'Predicted cell type' if analyzeBy=='label' else analyzeBy
+
         df_votingResults = pd.read_excel(os.path.join(self.saveDir, self.dataName + '_voting.xlsx'), sheet_name='z-scores')
-        votingResults = dict(zip(df_votingResults['cluster'].values, df_votingResults['Predicted cell type'].values))
+        votingResults = dict(zip(df_votingResults[votingAnalyzeBy].values, df_votingResults['Predicted cell type'].values))
 
-        cellClusterIndexLabel = df.columns.get_level_values('cluster').values
+        cellClusterIndexLabel = df.columns.get_level_values(analyzeBy).values
 
         for counter,marker in enumerate(df.index.values):
             MarkerSubplot(counter, marker, pd.DataFrame(data=np.reshape(np.array(df.loc[marker]), (1,len(df.loc[marker]))), columns=df.columns, index=[marker]), 
@@ -909,3 +915,90 @@ def makePlotOfNewMarkers(self, df_marker_cell_type, df_new_marker_cell_type):
 
         return None
 
+    def makeTtestPlot(self, df, dfp, label=None, reorder=False, p_value_cutoff=0.05):
+
+        '''Produce heatmap plot of the new markers extracted from the annotated clusters
+
+        Parameters:
+            df: pandas.DataFrame
+                t-test p-Values calculated gene-pair-wise
+
+            label: str, Default None
+                Lebel to include in the plot
+                
+            reorder: boolean, Default True
+                Reorder values to group similar
+
+        Returns:
+            None
+        
+        Usage:
+            DCS = DigitalCellSorter.DigitalCellSorter()
+
+            DCS.makeTtestPlot(df)
+        '''
+
+        if reorder:
+
+            def metricCommonEuclidean(u,v):
+
+                where_common = (~np.isnan(u)) * (~np.isnan(v))
+
+                return np.sqrt(((u[where_common] - v[where_common]) ** 2).sum())
+
+            order = scipy.cluster.hierarchy.dendrogram(scipy.cluster.hierarchy.linkage(df.values, method='average', metric=metricCommonEuclidean), no_plot=True, get_leaves=True)['leaves']
+
+            df = df[df.columns.values[order]]
+            dfp = dfp[dfp.columns.values[order]]
+            df = df.loc[df.index.values[order]]
+            dfp = dfp.loc[dfp.index.values[order]]
+
+        df = df[df.columns[::-1]]
+        dfp = dfp[dfp.columns[::-1]]
+
+        fig = plt.figure(figsize=(5,5))
+
+        ax = fig.add_axes([0.35,0.02,0.63,0.63])
+
+        cmap = plt.cm.PuOr_r #BrBG #PiYG #seismic
+        cmap.set_bad('grey')
+
+        ax.imshow(df.values.astype(float), cmap=cmap, interpolation='None', aspect='auto')
+
+        wh = np.where(dfp.values.T<=p_value_cutoff)
+        ax.plot(wh[0], wh[1], '*k')
+
+        ax.set_xticks(range(df.shape[1]))
+        ax.set_yticks(range(df.shape[0]))
+        ax.set_xticklabels(df.columns.values, rotation=90, fontsize=8)
+        ax.set_yticklabels(df.index.values, rotation=0, fontsize=8)
+        ax.set_xlim([-0.5, df.shape[1] - 0.5])
+        ax.set_ylim([-0.5, df.shape[0] - 0.5])
+
+        ax.xaxis.tick_top()
+
+        if not label is None:
+            ax.text(-0.5, 1.5, label, transform=ax.transAxes, fontsize=10, color='k', ha='left', va='top').set_path_effects(
+                [path_effects.Stroke(linewidth=0.5, foreground='blue'),path_effects.Normal()])
+
+        ax.set_title('Two-tailed p-Value (t-test)')
+
+        data = df.values.flatten().astype(float)
+        data = data[np.where(~np.isnan(data))]
+        dataMin = np.min(data)
+        dataMax = np.max(data)
+
+        axisColor = fig.add_axes([0.22,0.75,0.08,0.02])
+
+        norm=matplotlib.colors.Normalize(vmin=dataMin, vmax=dataMax)
+        mapp=cm.ScalarMappable(norm=norm, cmap=cmap)
+        mapp.set_array(data)
+        fig.colorbar(mapp, cax=axisColor, ticks=[dataMax,dataMin], orientation='horizontal')
+        axisColor.tick_params(labelsize=4)
+        axisColor.set_xlabel('Statistic\n*p-Value < %s'%(p_value_cutoff), fontsize=5)
+
+        axisColor.set_yticklabels([np.round(dataMax,2), np.round(dataMin,2)])
+
+        fig.savefig(os.path.join(self.saveDir, self.dataName + '_ttest_%s.png'%(label.replace('\n', '_'))), dpi=300)
+
+        return None
diff --git a/DigitalCellSorter/core.py b/DigitalCellSorter/core.py
@@ -1160,6 +1160,56 @@ def getAnomalyScores(self, trainingSet, testingSet, printResults=False):
 
         return scores
 
+    def getExprOfGene(self, gene, analyzeBy='cluster'):
+
+        '''Get expression of a gene.
+        Run this function only after function process()
+
+        Parameters:
+            cells: pandas.MultiIndex
+                Index of cells of interest
+
+            analyzeBy: str, Default 'cluster'
+                What level of lablels to include.
+                Other possible options are 'label' and 'celltype'
+
+        Returns:
+            pandas.DataFrame
+                With expression of the cells of interest
+
+        Usage:
+            DCS = DigitalCellSorter.DigitalCellSorter()
+
+            DCS.process()
+
+            DCS.getExprOfGene('SDC1')
+        '''
+
+        try:
+            df_markers_expr = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_expr', mode='r').xs(key=gene, axis=0, level=2).T
+        except:
+            print('Gene %s not in index'%(gene))
+            return
+
+        df_markers_expr.index = [gene]
+
+        labelled = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_markers_expr', mode='r').columns
+
+        columns = pd.MultiIndex.from_arrays([labelled.get_level_values('batch'), labelled.get_level_values('cell')])
+        df_markers_expr = df_markers_expr.reindex(columns, axis=1).fillna(0.)
+
+        if analyzeBy=='celltype':
+            analyzeBy = 'label'
+            columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index
+
+            df_markers_expr.columns = pd.MultiIndex.from_arrays([columns.get_level_values('batch'), 
+                                                                 columns.get_level_values('cell'), 
+                                                                 columns.get_level_values(analyzeBy).str.split(' #', expand=True).get_level_values(0)], names=['batch', 'cell','celltype'])
+        else:
+            df_markers_expr.columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index
+
+        return df_markers_expr
+
     def getExprOfCells(self, cells):
 
         '''Get expression of a set of cells.
@@ -1178,7 +1228,7 @@ def getExprOfCells(self, cells):
 
             DCS.process()
 
-            DCS.getDfExprOfSelectedCells(cells)
+            DCS.getExprOfCells(cells)
         '''
 
         df_expr = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_expr', mode='r')
@@ -1231,6 +1281,51 @@ def getAnomalyScoresPlot(self, cells='All'):
 
         return None
 
+    def getIndividualGeneTtestPlot(self, gene, analyzeBy='label'):
+
+        '''Produce individual gene t-test plot of the two-tailed p-value.
+
+        Parameters:
+            gene: str
+                Name of gene of interest
+
+            analyzeBy: str, Default 'cluster'
+                What level of lablels to include.
+                Other possible options are 'label' and 'celltype'
+
+        Returns:
+            None
+        
+        Usage:
+            DCS = DigitalCellSorter.DigitalCellSorter()
+
+            DCS.getIndividualGeneTtestPlot('SDC1')
+        '''
+
+        df_markers_expr = self.getExprOfGene(gene, analyzeBy=analyzeBy)
+
+        if df_markers_expr is None:
+
+            return None
+
+        groups = np.unique(df_markers_expr.columns.get_level_values(analyzeBy).values)
+
+        ttestStatistic = pd.DataFrame(index=groups, columns=groups)
+        ttestpValue = pd.DataFrame(index=groups, columns=groups)
+        for groupA in groups:
+            for groupB in groups:
+                A = df_markers_expr.xs(key=groupA, level=analyzeBy, axis=1).values.squeeze()
+                B = df_markers_expr.xs(key=groupB, level=analyzeBy, axis=1).values.squeeze()
+                ttestStatistic.loc[groupA, groupB], ttestpValue.loc[groupA, groupB] = scipy.stats.ttest_ind(A[np.where(A!=0)], B[np.where(B!=0)])
+
+        alt = self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]
+        alt = [alt] if type(alt) is str else alt
+        hugo_cd_dict = '%s\n(%s)'%(gene, ('\n').join(list(alt)))
+
+        self.makeTtestPlot(ttestStatistic, ttestpValue, label=hugo_cd_dict)
+
+        return None    
+
     def getIndividualGeneExpressionPlot(self, gene, hideClusterLabels=False, outlineClusters=True):
 
         '''Produce individual gene expression plot on a tSNE layout
@@ -1254,17 +1349,22 @@ def getIndividualGeneExpressionPlot(self, gene, hideClusterLabels=False, outline
             DCS.getIndividualGeneExpressionPlot('CD4')
         '''
 
-        df_markers_expr = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_expr', mode='r').xs(key=gene, axis=0, level=2).T
-        df_markers_expr.index = [gene]
+        df_markers_expr = self.getExprOfGene(gene, analyzeBy='cluster')
 
-        df_tsne = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_tsne', mode='r')
-        df_markers_expr = df_markers_expr.reindex(df_tsne.columns, axis=1).fillna(0.)
+        if df_markers_expr is None:
 
-        labelled = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_markers_expr', mode='r').columns
-        df_markers_expr.columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])['cluster'].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', 'cluster']).index
+            return None
 
+        df_tsne = pd.read_hdf(os.path.join(self.saveDir, self.dataName + '_processed.h5'), key='df_tsne', mode='r')
+
         hugo_cd_dict = {gene: self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]}
-        self.makeMarkerSubplots(df_markers_expr, df_tsne.values, hugo_cd_dict, self.dataName, self.saveDir, NoFrameOnFigures=True, HideClusterLabels=hideClusterLabels, outlineClusters=outlineClusters)
+
+        self.makeMarkerSubplots(df_markers_expr, 
+                                df_tsne.values, 
+                                hugo_cd_dict, 
+                                NoFrameOnFigures=True, 
+                                HideClusterLabels=hideClusterLabels, 
+                                outlineClusters=outlineClusters)
 
         return None
 

diff --git a/DigitalCellSorter/geneLists/CIBERSORT.xlsx b/DigitalCellSorter/geneLists/CIBERSORT.xlsx
diff --git a/DigitalCellSorter/geneLists/markersDCS.xlsx b/DigitalCellSorter/geneLists/markersDCS.xlsx
diff --git a/data.dat b/data.dat
diff --git a/docs/source/data.rst b/docs/source/data.rst
@@ -0,0 +1,111 @@
+**Input Data Format**
+=====================
+
+Gene Expression Data Format
+
+
+The input gene expression data is expected in one of the following formats:
+
+1. Spreadsheet of comma-separated values ``csv`` containing condensed matrix in a form ``('cell', 'gene', 'expr')``. 
+If there are batches in the data the matrix has to be of the form ``('batch', 'cell', 'gene', 'expr')``. Columns order can be arbitrary.
+
+
++------+------+------+
+| cell | gene | expr |
++======+======+======+
+| C1   | G1   | 3    |
++------+------+------+
+| C1   | G2   | 2    |
++------+------+------+
+| C1   | G3   | 1    |
++------+------+------+
+| C2   | G1   | 1    |
++------+------+------+
+| C2   | G4   | 5    |
++------+------+------+
+| ...  | ...  | ...  |
++------+------+------+
+
+or:
+
++--------+------+------+------+
+| batch  | cell | gene | expr |
++========+======+======+======+
+| batch0 | C1   | G1   | 3    |
++--------+------+------+------+
+| batch0 | C1   | G2   | 2    |
++--------+------+------+------+
+| batch0 | C1   | G3   | 1    |
++--------+------+------+------+
+| batch1 | C2   | G1   | 1    |
++--------+------+------+------+
+| batch1 | C2   | G4   | 5    |
++--------+------+------+------+
+| ...    | ...  | ...  | ...  |
++--------+------+------+------+
+
+
+
+
+2. Spreadsheet of comma-separated values ``csv`` where rows are genes, columns are cells with gene expression counts.
+If there are batches in the data the spreadsheet the first row should be ``'batch'`` and the second ``'cell'``.
+
+
++-------+--------+--------+--------+--------+
+| cell  | C1     | C2     | C3     | C4     |
++=======+========+========+========+========+
+| G1    |        | 3      | 1      | 7      |
++-------+--------+--------+--------+--------+
+| G2    | 2      | 2      |        | 2      |
++-------+--------+--------+--------+--------+
+| G3    | 3      | 1      |        | 5      |
++-------+--------+--------+--------+--------+
+| G4    | 10     |        | 5      | 4      |
++-------+--------+--------+--------+--------+
+| ...   | ...    | ...    | ...    | ...    |
++-------+--------+--------+--------+--------+
+
+or:
+
++-------+--------+--------+--------+--------+
+| batch | batch0 | batch0 | batch1 | batch1 |
++=======+========+========+========+========+
+| cell  | C1     | C2     | C3     | C4     |
++-------+--------+--------+--------+--------+
+| G1    |        | 3      | 1      | 7      |
++-------+--------+--------+--------+--------+
+| G2    | 2      | 2      |        | 2      |
++-------+--------+--------+--------+--------+
+| G3    | 3      | 1      |        | 5      |
++-------+--------+--------+--------+--------+
+| G4    | 10     |        | 5      | 4      |
++-------+--------+--------+--------+--------+
+| ...   | ...    | ...    | ...    | ...    |
++-------+--------+--------+--------+--------+
+
+3. ``Pandas DataFrame`` where ``axis 0`` is genes and ``axis 1`` are cells.
+If the are batched in the data then the index of ``axis 1`` should have two levels, e.g. ``('batch', 'cell')``, 
+with the first level indicating patient, batch or expreriment where that cell was sequenced, and the
+second level containing cell barcodes for identification.
+
+.. code:: python
+
+    df = pd.DataFrame(data=[[2,np.nan],[3,8],[3,5],[np.nan,1]], 
+                      index=['G1','G2','G3','G4'], 
+                      columns=pd.MultiIndex.from_arrays([['batch0','batch1'],['C1','C2']], names=['batch', 'cell']))    
+
+
+4. ``Pandas Series`` where index should have two levels, e.g. ``('cell', 'gene')``. If there are batched in the data
+the first level should be indicating patient, batch or expreriment where that cell was sequenced, the second level cell barcodes for 
+identification and the third level gene names.
+
+.. code:: python
+
+    se = pd.Series(data=[1,8,3,5,5], 
+                   index=pd.MultiIndex.from_arrays([['batch0','batch0','batch1','batch1','batch1'],
+                                                    ['C1','C1','C1','C2','C2'],
+                                                    ['G1','G2','G3','G1','G4']], names=['batch', 'cell', 'gene']))
+
+
+Any of the data types outlined above need to be prepared/validated with a function ``prepare()``. 
+
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -1,6 +1,7 @@
 **Demo**
 ============
 
+
 .. literalinclude:: ../examples/demo.py