In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys, gzip

import numpy as np
import matplotlib.pyplot as plt
import umap

import mybiotools as mbt

# 2018-09-18 Cleaning data
Now I have the expression matrices, and I know that certain columns correspond to the control conditions while others correspond to infected cells that are either GFP positive or GFP negative. I want to properly select the cells in each condition, which means eliminating the dead cells. I will identify the dead cells as the ones that have very few reads (this is already quite evident from visually inspecting the matrices).

In [None]:
# Project layout: root directory under $HOME, its data directory, and the
# directory that holds the expression matrices
sc_hiv_rootdir = '%s/work/CRG/projects/sc_hiv' % os.getenv('HOME')
datadir = '%s/data' % sc_hiv_rootdir
matrices_dir = '%s/matrices' % datadir

In [None]:
class SsHIVExperiment :
    """
    Record for one sample.

    Stores the sample name and the path of its gzipped expression matrix,
    which is built from the module-level `matrices_dir`. Other cells of the
    notebook attach further attributes to instances after construction.
    """
    def __init__(self, sample_name) :
        matrix_path = '%s/%s.tsv.gz'%(matrices_dir, sample_name)
        self.sample_name, self.matrix_fname = sample_name, matrix_path

In [None]:
# Well names in row-major order: A1, A2, ..., A12, B1, ..., H12
rows = list('ABCDEFGH')
columns = ['%d' % i for i in range(1, 13)]
names = [row_label + column_label for row_label in rows for column_label in columns]

def parse_expression_matrix(matrix_fname) :
    """
    Parse a gzip-compressed, tab-separated expression matrix.

    The first line of the file is the header: its first field is ignored and
    every following field is taken as a cell name. Every other non-blank line
    holds a gene name followed by one numeric value per cell.

    Parameters
    ----------
    matrix_fname : str
        Path to the gzipped TSV file.

    Returns
    -------
    gene_to_idx : dict
        Gene name -> row index in `expression`.
    idx_to_gene : dict
        Row index in `expression` -> gene name.
    cell_to_idx : dict
        Cell name (from the header) -> column index in `expression`.
    idx_to_cell : dict
        Column index in `expression` -> cell name.
    expression : numpy.ndarray
        Float array with one row per gene and one column per cell.

    Raises
    ------
    ValueError
        If a value cannot be converted to float, or if the rows do not all
        have the same number of fields.
    """
    gene_to_idx = {}
    idx_to_gene = {}
    cell_to_idx = {}
    idx_to_cell = {}
    values = []
    # Text mode is required: in binary mode gzip yields bytes on Python 3,
    # which cannot be stripped or split with str separators
    with gzip.open(matrix_fname, 'rt') as f :
        for lineno, line in enumerate(f) :
            curatedline = line.rstrip('\r\n').split('\t')
            if lineno==0 :
                # Take the cell names from the header itself, so that the
                # mapping always covers exactly the columns of the matrix
                for idx, cell in enumerate(curatedline[1:]) :
                    cell_to_idx[cell] = idx
                    idx_to_cell[idx] = cell
            elif line.strip() :
                # Index genes by the number of rows stored so far, so that
                # skipped blank lines do not leave gaps in the row indices
                gene_idx = len(values)
                idx_to_gene[gene_idx] = curatedline[0]
                gene_to_idx[curatedline[0]] = gene_idx
                values.append(curatedline[1:])
    if values :
        expression = np.array(values, dtype=float)
    else :
        # No gene rows: keep a 2-D shape so that axis-wise operations still work
        expression = np.empty((0, len(idx_to_cell)), dtype=float)
    return gene_to_idx, idx_to_gene, cell_to_idx, idx_to_cell, expression

In [None]:
# Load every expression matrix found in the matrices directory
matrix_suffix = '.tsv.gz'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# position of each experiment in `experiments` matters further down
matrix_fnames = sorted(os.listdir(matrices_dir))
experiments = []
for matrix_fname in matrix_fnames :
    if matrix_fname.endswith(matrix_suffix) :
        # Slice the suffix off. str.strip('.tsv.gz') would instead remove any
        # of the characters '.', 't', 's', 'v', 'g', 'z' from both ends of the
        # file name and mangle sample names that start or end with them
        sample_name = matrix_fname[:-len(matrix_suffix)]
        experiment = SsHIVExperiment(sample_name)
        experiment.gene_to_idx,\
        experiment.idx_to_gene,\
        experiment.cell_to_idx,\
        experiment.idx_to_cell,\
        experiment.expression =\
                     parse_expression_matrix(experiment.matrix_fname)
        experiments.append(experiment)

Now we have a data structure that allows us to ask: which cells were dead? We will ask then which cells have a very low total level of expression of all the genes put together.

In [None]:
# One panel per experiment: log10 of the total expression of every well
nwells = len(names)
for experiment in experiments :
    # Sum over the gene axis (rows) to get one total per cell (column)
    experiment.total_expression = experiment.expression.sum(axis=0)
    fig, ax = plt.subplots(1, 1, figsize=(15,2))
    mbt.line_plot(ax, range(nwells), np.log10(experiment.total_expression), color='b', show_xaxis=True)
    ax.set_title(experiment.sample_name, fontsize=24)
    ax.set_xticks(range(nwells))
    ax.set_xticklabels(names, fontsize=8, rotation=90)
    # The curve is the summed expression of all genes, not of a single gene
    ax.set_ylabel('Log10 total expression')

Okay, so putting a threshold at about 10^5 should be fine to discriminate alive and dead cells.

In [None]:
# Scan a range of candidate thresholds and, for every experiment, count the
# cells whose total expression lies strictly below each of them
thresholds = np.arange(0, 100000, 10000)
for experiment in experiments :
    below_counts = [(experiment.total_expression<cutoff).sum() for cutoff in thresholds]
    experiment.ndead = np.array(below_counts, dtype=thresholds.dtype)

In [None]:
# Number of cells below each scanned threshold, one curve per experiment
fig, ax = plt.subplots()
for experiment in experiments :
    ax.plot(thresholds, experiment.ndead, label=experiment.sample_name)
ax.set_xlabel('Threshold')
ax.set_ylabel('Number of dead cells')
ax.legend()
plt.show()

In [None]:
# Split the cells of every experiment into dead (total expression below the
# threshold) and alive (all the others).
# NOTE(review): 900000 lies outside the range of thresholds scanned above and
# differs from the 10^5 mentioned in the text -- confirm the intended value.
threshold = 900000
for experiment in experiments :
    is_dead = experiment.total_expression<threshold
    experiment.dead_cells = np.where(is_dead)[0]
    # Complement of the dead cells of *this* experiment, as a list of column indices
    experiment.alive_cells = np.where(~is_dead)[0].tolist()

Next, I want to find out whether there is good correlation or not between the GFP expression state as characterized by the single-cell RNA-seq data and the fluorescence intensity. Let's load the data.

In [None]:
# For every experiment, read the table stored next to its matrix file (same
# path with a '.csv' extension) as records with a 'name' and an 'expression' field.
# NOTE(review): no delimiter is passed, so fields are split on whitespace --
# confirm that the .csv files are not comma-separated.
gfp_dtype = np.dtype([('name','S4'), ('expression','f')])
for experiment in experiments :
    gfp_fname = experiment.matrix_fname.replace('.tsv.gz', '.csv')
    experiment.GFP = np.genfromtxt(gfp_fname, dtype=gfp_dtype)

In [None]:
# Compare, cell by cell, the RNA-seq level of the 'FILIONG01' row with the
# measured GFP value (presumably 'FILIONG01' is the GFP reporter -- confirm).
# This assumes that the rows of the GFP table are in the same order as the
# columns of the expression matrix -- TODO confirm.
fig, ax = plt.subplots()
for experiment in experiments :
    reporter_rna = experiment.expression[experiment.gene_to_idx['FILIONG01']]
    # +1 keeps cells with zero reads finite on the log scale
    ax.scatter(np.log(reporter_rna+1), np.log(experiment.GFP['expression']),
               label=experiment.sample_name)
ax.set_xlabel('Log RNA-seq expression')
# The y values are log-transformed too, so say so on the axis
ax.set_ylabel('Log GFP intensity')
ax.legend(loc='upper right')
plt.show()

Not the most spectacular correlation I've ever seen, we'll have to admit.

## Classification using UMAP

Now I want to use UMAP to perform classification of the data points.

In [None]:
# Embed the cells of a single experiment in two dimensions.
# The experiment is selected explicitly instead of relying on the variable
# left over from the last `for experiment in experiments` loop.
# NOTE(review): the last experiment is what the leftover variable pointed to;
# confirm that this is the sample the plots below are meant to describe.
experiment = experiments[-1]
# UMAP expects one sample per row, hence the transpose (cells x genes).
# A fixed random_state makes the embedding reproducible across runs.
embedding = umap.UMAP(random_state=42).fit_transform(experiment.expression.T)

In [None]:
# One scatter call per condition; the rows of `embedding` are selected by position
for well_slice, colour, condition in ((slice(0, 6), 'b', 'Jurkat'),
                                      (slice(6, 36), 'g', 'J-Lat+DMSO'),
                                      (slice(36, 96), 'k', 'J-Lat+SAHA')) :
    plt.scatter(embedding[well_slice,0], embedding[well_slice,1], color=colour, label=condition)
plt.legend(loc='lower left')
plt.xlabel('UMAP component 1')
plt.ylabel('UMAP component 2')
plt.show()

Let's try plotting the dead versus the alive cells.

In [None]:
# Overlay the alive/dead split on the condition colours. The index sets are
# attributes of the embedded experiment; no bare `alive_cells` / `dead_cells`
# names exist in the notebook.
# NOTE(review): both index sets span every well of the plate, not only wells
# 36-95, so these points are drawn on top of the Jurkat and J-Lat+DMSO ones --
# confirm whether they should be restricted to the SAHA wells.
plt.scatter(embedding[:6,0], embedding[:6,1], color='b', label='Jurkat')
plt.scatter(embedding[6:36,0], embedding[6:36,1], color='g', label='J-Lat+DMSO')
plt.scatter(embedding[experiment.alive_cells,0], embedding[experiment.alive_cells,1], color='r', label='J-Lat+SAHA alive')
plt.scatter(embedding[experiment.dead_cells,0], embedding[experiment.dead_cells,1], color='k', label='J-Lat+SAHA dead')
plt.legend(loc='lower left')
plt.xlabel('UMAP component 1')
plt.ylabel('UMAP component 2')
plt.show()

This gives the interesting result that the Jurkat and the latent untreated cells are in the same group. The dead cells are all in the same group. Now let's do better: I'll first get the dead cells out of the way, and then perform the classification.

In [None]:
# Stack the alive cells of the first two experiments into one matrix with one
# row per cell and one column per gene.
# Cells are the columns of `expression`, so the alive indices select along
# axis 1; the transpose then puts one cell per row before stacking.
# This assumes that both experiments list the same genes in the same order --
# TODO confirm.
alive = np.concatenate((experiments[0].expression[:, experiments[0].alive_cells].T,
                        experiments[1].expression[:, experiments[1].alive_cells].T), axis=0)

Build labels so that we can then plot the data nicely.