Compute pairwise correlations between the condensability scores of all deeply sequenced experiments at 1 kb resolution

In [1]:
# python modules
import random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
from scipy import stats


In [2]:
# custom modules
import graphics_edit as graphics

In [3]:
# matplotlib setting
%matplotlib inline
mpl.rcParams["figure.facecolor"] = "white"
mpl.rcParams["axes.facecolor"] = "white"
mpl.rcParams["savefig.facecolor"] = "white"

In [4]:
# custom modules
import load_file_edit as load_file

In [5]:
# directory of the input files; the trailing slash is required because
# file names are appended to it by plain string concatenation
path = './data/'

In [6]:
### parameters
def _chrnames(n_autosomes, sex_chrs):
    """Return ['chr1', ..., 'chr<n_autosomes>'] followed by the given sex chromosomes."""
    return ['chr%s' % (k) for k in range(1, n_autosomes + 1)] + list(sex_chrs)

# organism of each cell type
cell_org = dict.fromkeys(['H1', 'GM'], 'human')
cell_org.update(dict.fromkeys(['mCD8T:WT', 'mCD8T:DFMO', 'mCD8T:ODCKO'], 'mouse'))

# chromosomes used for each cell type (each entry is its own list object);
# only H1 includes chrY
cell_chrnames = {'H1': _chrnames(22, ['chrX', 'chrY']),
                 'GM': _chrnames(22, ['chrX']),
                 'mCD8T:WT': _chrnames(19, ['chrX']),
                 'mCD8T:DFMO': _chrnames(19, ['chrX']),
                 'mCD8T:ODCKO': _chrnames(19, ['chrX'])}

In [7]:
### exp list
### exp = (rep, cell, sample, agent, tnum)
# each (cell, sample, agent, tnum) condition is listed for replicates 1 and 2
conditions = [('H1', 'NCP', 'sp', 8),
              ('H1', 'NCP', 'HP1a', 3),
              ('H1', 'DNA', 'HP1a', 3),
              ('GM', 'NCP', 'sp', 8)]
exp_list = [(rep,) + condition
            for condition in conditions
            for rep in (1, 2)]

# these three values are used below to build the input file names
depth, bin_size, dtype = 'deep', 1000, 'score'


In [13]:
### read data
# exp_ID_score[exp] holds the entry that read_cn_file returns for the single
# requested field of that experiment (later cells use it as an ID -> score
# mapping -- presumably a dict; confirm against load_file.read_cn_file)
exp_ID_score = {}
for exp in exp_list:
    rep, cell, sample, agent, tnum = exp
    rep_tag = str(rep) + 'rep'

    # input file: GSE252941_<cell>_<sample>_<agent>_<rep>rep_<depth>_<N>kb_<dtype>.cn
    fname_parts = ['GSE252941', cell, sample, agent, rep_tag, depth,
                   str(int(bin_size/1000.0)) + 'kb', dtype]
    fname = '_'.join(fname_parts) + '.cn'

    # field to extract: <cell>_<sample>_<agent>_<tnum>_<rep>rep_<depth>
    field_name = '_'.join([cell, sample, agent, str(tnum), rep_tag, depth])

    fields = load_file.read_cn_file(path + fname,
                                    mode='col',
                                    field_choices=[field_name],
                                    chr_choices=cell_chrnames[cell])
    exp_ID_score[exp] = fields[field_name]


In [15]:
### get common IDs
# keep only the IDs present in every experiment, in sorted order
ID_sets = [set(exp_ID_score[exp].keys()) for exp in exp_list]
if ID_sets:
    ID_list = sorted(set.intersection(*ID_sets))
else:
    # no experiments -> no common IDs
    ID_list = []
    

In [16]:
### get correlation between experiments
# Computes the Spearman correlation for every unordered pair of experiments
# over the common IDs, prints it, and stores it in pair_corr[(exp1, exp2)].

# Set to True to draw and display a scatter plot for each pair. It is off by
# default: previously every figure was built and then closed without being
# shown or saved, which only cost time.
show_scatter = False

# extract each experiment's scores over the common IDs once, instead of
# rebuilding both vectors for every pair
exp_scores = {}
for exp in exp_list:
    score_by_ID = exp_ID_score[exp]
    exp_scores[exp] = [score_by_ID[ID] for ID in ID_list]

pair_corr = {}
for i, exp1 in enumerate(exp_list):
    for exp2 in exp_list[i+1:]:
        X, Y = exp_scores[exp1], exp_scores[exp2]

        # [0] is the correlation coefficient (the p-value is ignored);
        # scipy.stats.pearsonr(X, Y)[0] is a drop-in alternative
        corr = scipy.stats.spearmanr(X, Y)[0]

        # single-argument print(...) behaves the same on Python 2 and 3
        print("%d-%s-%s-%s-%d" % exp1)
        print("%d-%s-%s-%s-%d" % exp2)
        print("%1.2f" % (corr))
        print("")

        if show_scatter:
            fig = plt.figure()
            plt.plot(X, Y, '.')
            plt.annotate("Spearman %1.2f" % (corr),
                         xy=(0.2, 0.75),
                         fontsize=12,
                         xycoords='axes fraction')
            # NOTE(review): the axes are labelled "fold change" while the
            # loaded data type is 'score' -- confirm the intended label
            plt.xlabel("fold change (%d-%s-%s-%s-%d)" % exp1)
            plt.ylabel("fold change (%d-%s-%s-%s-%d)" % exp2)
            plt.show()
            # close this specific figure so figures do not pile up in memory
            plt.close(fig)

        pair_corr[(exp1, exp2)] = corr

1-H1-NCP-sp-8
2-H1-NCP-sp-8
0.79

1-H1-NCP-sp-8
1-H1-NCP-HP1a-3
0.25

1-H1-NCP-sp-8
2-H1-NCP-HP1a-3
0.18

1-H1-NCP-sp-8
1-H1-DNA-HP1a-3
0.35

1-H1-NCP-sp-8
2-H1-DNA-HP1a-3
0.37

1-H1-NCP-sp-8
1-GM-NCP-sp-8
0.38

1-H1-NCP-sp-8
2-GM-NCP-sp-8
0.37

2-H1-NCP-sp-8
1-H1-NCP-HP1a-3
0.28

2-H1-NCP-sp-8
2-H1-NCP-HP1a-3
0.24

2-H1-NCP-sp-8
1-H1-DNA-HP1a-3
0.30

2-H1-NCP-sp-8
2-H1-DNA-HP1a-3
0.29

2-H1-NCP-sp-8
1-GM-NCP-sp-8
0.39

2-H1-NCP-sp-8
2-GM-NCP-sp-8
0.40

1-H1-NCP-HP1a-3
2-H1-NCP-HP1a-3
0.88

1-H1-NCP-HP1a-3
1-H1-DNA-HP1a-3
0.01

1-H1-NCP-HP1a-3
2-H1-DNA-HP1a-3
-0.03

1-H1-NCP-HP1a-3
1-GM-NCP-sp-8
0.32

1-H1-NCP-HP1a-3
2-GM-NCP-sp-8
0.31

2-H1-NCP-HP1a-3
1-H1-DNA-HP1a-3
-0.07

2-H1-NCP-HP1a-3
2-H1-DNA-HP1a-3
-0.16

2-H1-NCP-HP1a-3
1-GM-NCP-sp-8
0.29

2-H1-NCP-HP1a-3
2-GM-NCP-sp-8
0.28

1-H1-DNA-HP1a-3
2-H1-DNA-HP1a-3
0.72

1-H1-DNA-HP1a-3
1-GM-NCP-sp-8
0.26

1-H1-DNA-HP1a-3
2-GM-NCP-sp-8
0.27

2-H1-DNA-HP1a-3
1-GM-NCP-sp-8
0.25

2-H1-DNA-HP1a-3
2-GM-NCP-sp-8
0.24

1-GM-NCP-sp-8
2-GM-NCP-

In [17]:
### reorganize data and make labels
# two-line label per experiment: "<cell> <sample> <agent>" / "id:<tnum> <rep>-rep"
exp_label = {
    (rep, cell, sample, agent, tnum):
        '%s %s %s\nid:%d %d-rep' % (cell, sample, agent, tnum, rep)
    for rep, cell, sample, agent, tnum in exp_list
}

# per-experiment score vectors, all ordered by the common ID list
exp_data = {
    exp: [exp_ID_score[exp][ID] for ID in ID_list]
    for exp in exp_list
}
    


In [None]:
# plot correlation matrix between experiments
# exp_data / exp_label are keyed by experiment tuple; ids=exp_list passes the
# experiments in their listed order (presumably the row/column order of the
# matrix -- confirm against graphics.plot_corr_matrix).
# NOTE(review): save=True with note='1kb' presumably writes the figure to
# disk; if the output file name is derived from `title`, the corrected
# spelling below also changes that name -- confirm.
graphics.plot_corr_matrix(exp_data,
                          exp_label,
                          ids=exp_list,
                          scatter_style='density',
                          bins=100,
                          cbar=True,
                          save=True,
                          title="Correlation between experiments (1 kb bin)",
                          note='1kb')
