In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os, sys, gzip
import pandas as pd
import umap

# 2018-09-19 Classification of cells

Building upon previous results, I want to do a proper classification of the cells based on their expression profiles. To do so, I want to generate a data structure that will allow me to plot and classify things without going nuts.

It's time to go to Pandas.

In [None]:
# Define the directory layout of the project.
# The project root lives under the user's home directory. expanduser('~') is
# used rather than os.getenv('HOME') because getenv returns None when HOME is
# not set, which would silently produce a path starting with "None/".
sc_hiv_rootdir = '%s/work/CRG/projects/sc_hiv'%(os.path.expanduser('~'))
# data directory of the project
datadir = '%s/data'%(sc_hiv_rootdir)
# directory of the per-sample expression matrices (<sample_name>.tsv.gz)
matrices_dir = '%s/matrices'%(datadir)

In [None]:
class SsHIVExperiment :
    """Expression matrix of one sample, loaded from a gzipped TSV file.

    Parameters
    ----------
    sample_name : str
        Name of the sample. The matrix is read from
        '<matrix_dir>/<sample_name>.tsv.gz'.
    matrix_dir : str, optional
        Directory that contains the matrix file. When omitted (the default),
        the notebook-level variable `matrices_dir` is used, looked up at call
        time, so existing calls `SsHIVExperiment(name)` behave as before.

    Attributes
    ----------
    sample_name : str
        The name passed at construction.
    matrix_fname : str
        Full path of the file the matrix was read from.
    matrix : pandas.DataFrame
        Transposed content of the file: one row per data column of the file
        (the cells, in the rest of this notebook) and one column per value of
        the file's 'gene' column.
    """
    def __init__(self, sample_name, matrix_dir=None) :

        # fall back on the notebook-level directory when none is given
        if matrix_dir is None :
            matrix_dir = matrices_dir

        # sample name
        self.sample_name = sample_name

        # expression matrix
        self.matrix_fname = '%s/%s.tsv.gz'%(matrix_dir, sample_name)
        # The file has one row per gene, identified by its 'gene' column.
        # Using that column as index and transposing makes the gene names the
        # column labels of the resulting DataFrame, so that a gene can be
        # selected with self.matrix[gene_name].
        self.matrix = pd.read_csv(self.matrix_fname,
                                  delimiter='\t',
                                  index_col='gene').transpose()

In [None]:
# Load the expression matrix of every sample and attach a condition label to
# each of its rows (cells).
sample_names = ['P2449', 'P2458']

# Labels are assigned by row position: the first 6 rows are 'Jurkat', the next
# 30 'J-Lat+DMSO' and the last 60 'J-Lat+SAHA'.
# NOTE(review): this assumes that every matrix holds exactly 96 cells in that
# order -- TODO confirm against the plate layout.
condition_labels = ['Jurkat']*6 + ['J-Lat+DMSO']*30 + ['J-Lat+SAHA']*60

experiments = []
for sample_name in sample_names :
    experiment = SsHIVExperiment(sample_name)
    # index the labels like the matrix rows, so that both can be filtered
    # with the same boolean mask
    experiment.labels = pd.Series(condition_labels, index=experiment.matrix.index)
    experiments.append(experiment)

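Next, I'll filter out the dead cells, that is, the cells whose total expression (summed over all genes) does not exceed a fixed threshold.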

In [None]:
# A cell is kept as "alive" when its total expression, summed over all genes,
# is strictly greater than `threshold`.
threshold = 100000.0
for experiment in experiments :
    # boolean Series, one entry per cell
    experiment.alive_mask = experiment.matrix.sum(axis='columns').gt(threshold)
    # rows of the matrix and labels restricted to the alive cells
    experiment.alive = experiment.matrix[experiment.alive_mask]
    experiment.alive_labels = experiment.labels.loc[experiment.alive_mask]

Now that the dead cells have been eliminated, we can embed the remaining cells in two dimensions with UMAP to look for clusters.

In [None]:
# Compute the UMAP embedding of the alive cells of every experiment.
# UMAP is stochastic: without a fixed random_state each run of this cell gives
# a different embedding, so the figures could not be reproduced. The seed
# value itself is arbitrary.
umap_seed = 42
for experiment in experiments :
    # the fitted reducer is kept next to the embedding it produced
    experiment.reducer = umap.UMAP(random_state=umap_seed)
    experiment.embedding = experiment.reducer.fit_transform(experiment.alive)

And let's look at the results.

In [None]:
def scatter_with_label(xy, labels, label, color, ax=None) :
    """Scatter-plot the rows of `xy` whose label equals `label`.

    Parameters
    ----------
    xy : array
        Point coordinates, indexed as xy[mask, column]; column 0 is plotted
        on the x axis and column 1 on the y axis.
    labels : array-like
        One label per row of `xy`; compared element-wise with `label`.
    label :
        Label value that selects the points; also used as legend entry.
    color :
        Passed to `scatter` as its `c` argument.
    ax : optional
        Object with a `scatter` method (for example a matplotlib Axes) to draw
        on. When omitted, `plt.scatter` is used, as before.
    """
    # boolean mask of the rows that carry the requested label
    mask = labels==label
    # draw on the given axes if any, otherwise through the pyplot interface
    target = plt if ax is None else ax
    target.scatter(xy[mask,0], xy[mask,1], c=color, label=label)

In [None]:
# One figure per experiment: the embedding of the alive cells, colored by
# condition label.
colors = {'Jurkat' : 'r',
          'J-Lat+DMSO' : 'b',
          'J-Lat+SAHA' : 'g'}
for experiment in experiments :
    fig = plt.figure()
    # explicit tuple rather than the dictionary keys, so that the conditions
    # are always drawn (and listed in the legend) in this order
    for condition in ('Jurkat', 'J-Lat+DMSO', 'J-Lat+SAHA') :
        scatter_with_label(experiment.embedding, experiment.alive_labels,
                           condition, colors[condition])
    plt.title(experiment.sample_name, fontsize=32)
    plt.legend()
    # write the figure to <project root>/figures/<sample name>.png, then show it
    figure_fname = '%s/figures/%s.png'%(sc_hiv_rootdir, experiment.sample_name)
    fig.savefig(figure_fname)
    plt.show()

## Cell cycle
Now I'll try to factor in the information on the cell cycle. I downloaded a dataset of genes, each associated with a cell cycle phase. Let's try to plot the cells with a color that corresponds to how likely each cell is to be in a given cell cycle phase.

In [None]:
# Build a dictionary that maps every gene listed in the cell cycle table to
# its phase. Every line of the file must hold exactly two whitespace-separated
# fields: the gene name followed by the phase.
cell_cycle_genes = {}
cell_cycle_fname = '%s/data/Dominguez2016/cell_cycle_genes.csv'%(sc_hiv_rootdir)
with open(cell_cycle_fname, 'r') as f :
    for line in f :
        # split() without arguments also discards the trailing newline
        gene, phase = line.split()
        cell_cycle_genes[gene] = phase

In [None]:
# Create a dictionary that maps each gene name without its suffix (everything
# from the first '.' on, i.e. the splicing variant) to the full column name
# used in the expression matrix.
# The columns are taken explicitly from the last loaded experiment, instead of
# relying on the loop variable `experiment` left over from a previous cell
# (which refers to the same object).
# NOTE(review): when several columns share the same shortened name only the
# last one is kept, and the mapping is reused for every experiment, which
# assumes that all matrices have the same columns -- TODO confirm.
condensedgenes = {g.split('.')[0] : g for g in experiments[-1].matrix.columns}

In [None]:
# Collect the distinct phase names that appear in the cell cycle gene table.
phases = {phase_name for phase_name in cell_cycle_genes.values()}

In [None]:
# For every experiment and every cell cycle phase, sum the expression of the
# genes associated with that phase. The result is stored in
# experiment.phase_score, a dictionary phase -> Series with one value per cell.
for experiment in experiments :
    experiment.phase_score = {}
    # start every phase from a zero score for each cell
    for phase in phases :
        experiment.phase_score[phase] = pd.Series(0, index=experiment.matrix.index)
    # .items() rather than .iteritems(): the latter exists only in Python 2,
    # whereas .items() works on both Python 2 and Python 3
    for gene, phase in cell_cycle_genes.items() :
        # genes of the table that are absent from the matrix are ignored
        if gene not in condensedgenes :
            continue
        # condensedgenes translates the gene name into the matrix column name
        experiment.phase_score[phase] += experiment.matrix[condensedgenes[gene]]

In [None]:
# Total expression of every cell (sum over all genes), stored as a Series
# indexed like the rows of the matrix.
for experiment in experiments :
    experiment.total_expression = experiment.matrix.sum(axis='columns')

In [None]:
# Print, for every cell of the last loaded experiment, the G1-S and G2-M
# scores divided by the total expression of the cell.
# The experiment is selected explicitly instead of relying on the loop
# variable left over from a previous cell (it is the same object).
experiment = experiments[-1]
# only the row labels are needed, so iterate over the index, not over the rows
for cell_id in experiment.matrix.index :
    # float() guards against floor division: under Python 2, `/` between two
    # integer scalars discards the fractional part
    cell_tot = float(experiment.total_expression[cell_id])
    # print(...) with a single argument works on both Python 2 and Python 3
    print('%s %.3f %.3f'%(cell_id, experiment.phase_score['G1-S'][cell_id]/cell_tot,
                                   experiment.phase_score['G2-M'][cell_id]/cell_tot))

In [None]:
# Scatter plot of the G1-S score against the G2-M score, each divided by the
# total expression of the cell, for the last loaded experiment.
# The experiment is selected explicitly instead of relying on the loop
# variable left over from a previous cell (it is the same object).
experiment = experiments[-1]
plt.scatter(experiment.phase_score['G1-S']/experiment.total_expression,
            experiment.phase_score['G2-M']/experiment.total_expression)
# label the figure so that it can be read on its own
plt.xlabel('G1-S score / total expression')
plt.ylabel('G2-M score / total expression')
plt.title(experiment.sample_name)
plt.show()