In [None]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the autoreload extension and enable it in mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os, gzip, sys
import pandas as pd

# 2018-09-06 First analysis
Guillaume passed me the links to the data from the single-cell HIV expression experiments. Now I want to open the files and have a first look at what the data look like.

In [None]:
# Define paths and directory names.
# The home directory is resolved with os.path.expanduser('~') rather than
# os.getenv('HOME'): the latter returns None when HOME is not set, which
# would silently produce a path starting with the literal string 'None'.
sc_hiv_rootdir = os.path.join(os.path.expanduser('~'),
                              'work', 'CRG', 'projects', 'sc_hiv')
# data directory of the project
datadir = os.path.join(sc_hiv_rootdir, 'data')
# directory that holds the '<sample_name>.tsv.gz' matrix files
matrices_dir = os.path.join(datadir, 'matrices')

A convenient way of storing the information is a class, so that all the data pertaining to the same experiment can easily be kept together.

In [None]:
class SsHIVExperiment :
    """
    Container that keeps together the data pertaining to one experiment.

    Attributes
    ----------
    sample_name : str
        Name of the sample, as passed to the constructor.
    matrix_fname : str
        Path of the matrix file, built as '<directory>/<sample_name>.tsv.gz'.
    expression : None
        Placeholder, initialised to None so that the attribute always
        exists; it is meant to be filled in by the code that loads
        `matrix_fname`.
    """
    def __init__(self, sample_name, directory=None) :
        """
        Parameters
        ----------
        sample_name : str
            Name of the sample (the matrix file name without '.tsv.gz').
        directory : str, optional
            Directory that contains the matrix file. When omitted, the
            notebook-level `matrices_dir` is used, as before.
        """
        if directory is None :
            # the global is looked up only when no directory is given
            directory = matrices_dir
        self.sample_name = sample_name
        self.matrix_fname = '%s/%s.tsv.gz'%(directory, sample_name)
        self.expression = None

Now look at the `matrices_dir` directory and extract the names of the samples. I'll also load the data into a Pandas data frame.

In [None]:
# matrices
# Only files that end in '.tsv.gz' are loaded, so that stray files in the
# directory do not break the loop. The names are sorted because os.listdir
# returns them in arbitrary order, and later cells index `experiments` by
# position.
matrix_suffix = '.tsv.gz'
matrix_fnames = sorted(fname for fname in os.listdir(matrices_dir)
                       if fname.endswith(matrix_suffix))
experiments = []
for matrix_fname in matrix_fnames :
    # Remove the suffix by slicing: str.strip('.tsv.gz') would instead strip
    # any of the characters '.', 't', 's', 'v', 'g', 'z' from BOTH ends of
    # the name (e.g. 'test.tsv.gz' would become 'e').
    sample_name = matrix_fname[:-len(matrix_suffix)]
    experiment = SsHIVExperiment(sample_name)
    # pd.DataFrame.from_csv is deprecated (and removed in pandas 1.0);
    # read_csv with index_col=0 and parse_dates=True reproduces its defaults.
    # NOTE(review): parse_dates=True is kept only for equivalence; it can be
    # dropped if the index labels are not dates - confirm against the data.
    experiment.expression = pd.read_csv(experiment.matrix_fname, sep='\t',
                                        header=0, index_col=0,
                                        parse_dates=True)
    experiments.append(experiment)

Now that I have the data loaded, I can have a quick look at what the matrices look like.

In [None]:
# One panel per experiment, instead of two hard-coded panels: the original
# raised an IndexError with more than two experiments. The width scales as
# 5 inches per panel, which gives the original (10, 10) figure for two
# experiments. At least one panel is requested so that plt.subplots does not
# fail on an empty list.
n_panels = max(len(experiments), 1)
# squeeze=False makes `axes` a 2D array whatever the number of panels
fig, axes = plt.subplots(1, n_panels, figsize=(5*n_panels, 10), squeeze=False)
for ax, experiment in zip(axes[0], experiments) :
    # log(1+x) of the expression values, shown in grey scale;
    # .values replaces the deprecated DataFrame.as_matrix()
    ax.matshow(np.log1p(experiment.expression.values), aspect='auto', cmap=plt.cm.Greys)
    ax.set_title(experiment.sample_name, fontsize=32)

Let's measure a very simple thing: the squared correlation coefficient ($r^2$) between the two matrices, each flattened to a vector.

In [None]:
# Squared correlation coefficient (r^2) between the flattened expression
# matrices of the first two experiments.
# .values replaces the deprecated DataFrame.as_matrix(), and print is called
# with parentheses so that the cell runs on both Python 2 and Python 3.
expression_a = experiments[0].expression.values.flatten()
expression_b = experiments[1].expression.values.flatten()
print(np.corrcoef(expression_a, expression_b)[0,1]**2)

So there is a considerable degree of correlation between the two samples. Probably that's due to housekeeping genes and genes that are repressed all the time.