Take the Mem sequences (MemA and MemG can be combined) and the Naive sequences, and keep only one sequence per unique junction.
Divide the Mem sequences into groups by mutation frequency (1% intervals), and keep only the unmutated sequences for Naive.
For each subset, calculate and plot the distance to the nearest sequence within each group (mutation interval) and between sequences from different groups.
Do the same both within a donor (intra-donor) and between donors (inter-donor).

For example:
- distance between seqs within B4 Naive 0% mut group
- distance between seqs within B4 Mem 0-1% mut group
- distance between seqs within B4 Mem 1-2% mut group
- ...
- distance between seqs from B4 Mem 0-1% and 1-2%
- distance between seqs from B4 Mem 0-1% and 2-3%
- ...
- distance between seqs from B4 Naive and B5 Naive
- distance between seqs from B4 Mem 0-1% and B5 Mem 0-1%
- distance between seqs from B4 Mem 1-2% and B5 Mem 1-2%
- ...

The results should give us a baseline distance and an idea of how it changes as a function of the mutation level.

In [27]:
%matplotlib inline
import newDefineClones as cl
import DbCore
import re, sys
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn

from itertools import product

sys.path.insert(0,'/home/fede/Dropbox/projects/ig_network')
import parallel_distance as pd
reload(pd)
sys.path.pop(0)


'/home/fede/Dropbox/projects/ig_network'

In [3]:
def remove_duplicate_junctions(l):
    """Keep the first record seen for each distinct normalised junction.

    Every junction is converted with ``str`` and each ``.`` or ``-`` in it is
    replaced by ``N``; two records count as duplicates when these normalised
    strings are equal.

    Parameters:
        l: iterable of records exposing a ``junction`` attribute.

    Returns:
        ``(igs, juncs)``: two parallel lists, in input order, holding the
        retained records and their normalised junction strings.
    """
    igs, juncs = [], []
    seen = set()
    for el in l:
        junc = re.sub(r'[.-]', 'N', str(el.junction))
        # Test the normalised string, i.e. the same value that gets stored;
        # testing the raw junction would miss every duplicate whose junction
        # contains '.' or '-'.
        if junc not in seen:
            seen.add(junc)
            juncs.append(junc)
            igs.append(el)
    return igs, juncs

def calcDist(el1, el2, mut=None):
    """Return the distance between ``el1`` and ``el2`` using the ham model.

    Thin wrapper that forwards to ``DbCore.calcSingleDistance`` with
    ``cl.ham_model`` and ``cl.default_sym``.

    Parameters:
        el1, el2: the two elements to compare (the notebook passes junction
            strings).
        mut: forwarded unchanged to ``DbCore.calcSingleDistance``; a fresh
            empty list is used when omitted.

    NOTE(review): the meaning of the positional arguments ``1``, ``'min'``
    and ``False`` is defined by ``DbCore.calcSingleDistance`` and is not
    visible here -- confirm against that function.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if mut is None:
        mut = []
    return DbCore.calcSingleDistance(el1, el2, 1, cl.ham_model, 'min', cl.default_sym, mut, False)

In [4]:
# Read every record of the B4 donor database into memory.
in_file = DbCore.readDbFile('../new_seqs/B4_db-pass.tab_CON-FUN-N_new_ord-MUT.tab')
all_seqs = list(in_file)
# Naive set: records whose subset is 'N' and whose `mut` value is exactly 0.
naive = [rec for rec in all_seqs if rec.subset == 'N' and rec.mut == 0]
# Memory set: the MemA and MemG subsets pooled together.
mem = [rec for rec in all_seqs if rec.subset in ('MemA', 'MemG')]

In [5]:
# Reduce each B4 set with remove_duplicate_junctions; every call returns the
# retained records together with the parallel list of their junction strings.
naive_filtered, naive_filtered_junctions = remove_duplicate_junctions(naive)
mem_filtered, mem_filtered_junctions = remove_duplicate_junctions(mem)

In [None]:
# Pairwise calcDist distances between the de-duplicated naive B4 junctions.
# NOTE(review): `distance_matrix_parallel` is called as a bare name, but the
# import cell shown in this notebook only binds the module as `pd` -- confirm
# where this name is supposed to come from before re-running.
print("Computing distance matrix for naive ...")
d_arr = distance_matrix_parallel(naive_filtered_junctions, calcDist, condensed=False)

In [None]:
# Add 1 to every diagonal entry of the distance matrix; presumably this keeps
# an element's distance to itself from being picked as the row minimum later.
MM = d_arr + np.eye(d_arr.shape[0])
MM
# The files receive the unmodified matrix d_arr, not MM.
np.save("dist2nearest_plots/distances_B4_naive_norm", d_arr)
np.savetxt("dist2nearest_plots/distances_B4_naive_norm.csv", d_arr,delimiter=',')

In [None]:
# Nearest-neighbour distance for each naive B4 junction: the minimum of its row in MM.
dist2nearest = np.array([np.min(row) for row in MM])

# Histogram of those minima, saved as an image under dist2nearest_plots/.
f = plt.figure(figsize=(20,10))
h = plt.hist(dist2nearest, bins=100)
plt.xticks(np.linspace(0,.5,11))
plt.xlabel("Ham distance normalised")
plt.ylabel("Count")
plt.title("(Normalised) distances between Naive B4")
plt.savefig("dist2nearest_plots/distances_B4_naive_norm")

In [None]:
# old
def filter_ig(igs, lim_mut, type_ig='Mem', donor='B4', bins=30):
    """Plot nearest-neighbour distances within one mutation interval.

    Selects the records of ``igs`` with ``lim_mut[0] <= ig.mut < lim_mut[1]``,
    computes their pairwise distance matrix with ``calcDist``, saves the
    matrix, and saves a histogram of each row's minimum.

    Parameters:
        igs: iterable of records exposing ``junction`` and ``mut``.
        lim_mut: ``(low, high)`` bounds of the mutation interval (high excluded).
        type_ig: label used in the plot title and, lower-cased, in file names.
        donor: donor label used in file names.
        bins: number of histogram bins.

    Returns:
        The matplotlib figure (already closed).
    """
    low, high = lim_mut[0], lim_mut[1]
    junctions = [re.sub(r'[.-]', 'N', str(ig.junction)) for ig in igs if low <= ig.mut < high]
    # NOTE(review): `pdc` is not bound by any import visible in this notebook;
    # confirm which module provides distance_matrix_parallel.
    X = pdc.distance_matrix_parallel(junctions, calcDist, condensed=False)
    # Add 1 on the diagonal so a sequence's distance to itself is not the row minimum.
    M = X + np.eye(X.shape[0])
    stem = "distances_" + donor + '_' + type_ig.lower() + '_' + str(low) + '-' + str(high)
    # Name the matrix files after this donor/type/interval. The previous
    # hard-coded "distances_B4_naive_norm" path overwrote the naive matrix
    # on every call.
    matrix_fn = "dist2nearest_plots/" + stem + "_norm"
    np.save(matrix_fn, X)
    np.savetxt(matrix_fn + ".csv", X, delimiter=',')
    dist2nearest = np.array([np.min(r) for r in M])
    f = plt.figure()
    plt.hist(dist2nearest, bins=bins)
    plt.title("Distances between " + type_ig + " " + str(low) + "-" + str(high) + "%")
    plt.ylabel('Count')
    plt.xticks(np.linspace(0, 1, 21))
    plt.xlabel('Ham distance')
    # The figure path is unchanged: it is written relative to the working directory.
    plt.savefig(stem)
    plt.close()
    return f

In [25]:
def filter_2(igs, lim_mut1, lim_mut2, type_ig='Mem', donor='B4', bins=100):
    """Plot nearest-neighbour distances between two mutation intervals of one set.

    Builds two junction lists from ``igs`` (records with ``mut`` in
    ``[lim_mut1[0], lim_mut1[1])`` and in ``[lim_mut2[0], lim_mut2[1])``),
    computes the distance matrix between them with ``pd.dense_dm_dual``,
    saves it, and saves a histogram of the smallest strictly positive
    distance in each row.

    Parameters:
        igs: iterable of records exposing ``junction`` and ``mut``.
        lim_mut1, lim_mut2: ``(low, high)`` mutation bounds (high excluded).
        type_ig: label used in the plot title and, lower-cased, in the file name.
        donor: donor label used in the file name.
        bins: number of histogram bins.

    Returns:
        The matplotlib figure (already closed).
    """
    juncs1 = [re.sub(r'[.-]', 'N', str(ig.junction)) for ig in igs if lim_mut1[0] <= ig.mut < lim_mut1[1]]
    juncs2 = [re.sub(r'[.-]', 'N', str(ig.junction)) for ig in igs if lim_mut2[0] <= ig.mut < lim_mut2[1]]
    X = pd.dense_dm_dual(juncs1, juncs2, calcDist, condensed=False)
    print(X)
    fn = "dist2nearest_plots_tmp/distances_"+donor+'_'+type_ig.lower()+'_'+str(lim_mut1[0])+'-'+str(lim_mut1[1])+'_vs_'+str(lim_mut2[0])+'-'+str(lim_mut2[1])+"_norm"
    np.save(fn, X)
    # Zero entries are ignored when taking the row minimum. Rows with no
    # strictly positive entry (empty interval or all-zero row) are skipped;
    # np.min on their empty selection would raise ValueError.
    dist2nearest = np.array([np.min(r[r > 0]) for r in X if (r > 0).any()])
    f = plt.figure(figsize=(20,10))
    if dist2nearest.size:
        plt.hist(dist2nearest, bins=bins)
    plt.title("Distances between "+type_ig+" "+str(lim_mut1[0])+"-"+str(lim_mut1[1])+"% and "+str(lim_mut2[0])+"-"+str(lim_mut2[1])+"%")
    plt.ylabel('Count')
    plt.xticks(np.linspace(0,1,21))
    plt.xlabel('Ham distance (normalised)')
    plt.savefig(fn)
    plt.close()
    return f

In [None]:
# sets = [(x, x+1) for x in range(int(max([m.mut for m in mem_filtered if m.mut < 25])))]
# sets.append((25,29))
# combinations = [x for x in product(sets, sets) if x[0][0] <= x[1][0]]
# for i, j in combinations:
#     f = filter_2(mem_filtered, i, j, type_ig='Mem', donor='B4', bins=100)

In [7]:
# `sets`: unit-wide mutation bins (x, x+1); their number depends on the data,
# namely the integer part of the largest `mut` below 24 in mem_filtered.
# A final wide bin (24, 29) is appended.
# NOTE(review): the recorded output of this cell jumps from (22, 23) to
# (24, 29), so no bin covers 23 <= mut < 24 -- confirm this is intended.
sets = [(x, x+1) for x in range(int(max([m.mut for m in mem_filtered if m.mut < 24])))]
sets.append((24,29))
# `sets1`: cumulative bins (0, i) with a fixed length, plus (0, 29).
sets1 = [(0,i) for i in range(1, 24)]
sets1.append((0,29))
# Pair each cumulative bin with the unit bin at the same position and drop the
# first pair. zip stops at the shorter list, so a data-dependent `sets` of a
# different length would silently shift or truncate the pairing.
combinations = [x for x in zip(sets1, sets)][1:]
combinations

[((0, 2), (1, 2)),
 ((0, 3), (2, 3)),
 ((0, 4), (3, 4)),
 ((0, 5), (4, 5)),
 ((0, 6), (5, 6)),
 ((0, 7), (6, 7)),
 ((0, 8), (7, 8)),
 ((0, 9), (8, 9)),
 ((0, 10), (9, 10)),
 ((0, 11), (10, 11)),
 ((0, 12), (11, 12)),
 ((0, 13), (12, 13)),
 ((0, 14), (13, 14)),
 ((0, 15), (14, 15)),
 ((0, 16), (15, 16)),
 ((0, 17), (16, 17)),
 ((0, 18), (17, 18)),
 ((0, 19), (18, 19)),
 ((0, 20), (19, 20)),
 ((0, 21), (20, 21)),
 ((0, 22), (21, 22)),
 ((0, 23), (22, 23)),
 ((0, 29), (24, 29))]

In [28]:
# Run filter_2 on the B4 memory set for every (cumulative bin, unit bin) pair;
# each call saves a distance matrix and a histogram image.
for i, j in combinations:
    f = filter_2(mem_filtered, i, j, type_ig='Mem', donor='B4', bins=50)

[[ 0.          0.5625      0.20833333 ...,  0.5952381   0.70175439
   0.66666667]
 [ 0.5625      0.          0.4375     ...,  0.57142857  0.6875      0.54166667]
 [ 0.20833333  0.4375      0.         ...,  0.64285714  0.6875      0.5625    ]
 ..., 
 [ 0.43859649  0.45833333  0.45833333 ...,  0.47619048  0.70175439
   0.58333333]
 [ 0.6875      0.52083333  0.66666667 ...,  0.47619048  0.5625      0.625     ]
 [ 0.66666667  0.5         0.64583333 ...,  0.5         0.58333333
   0.60416667]]
[1m0 / 942[0m [                                       ] 

Exit signal received
Terminating processes ...... done.
Process Process-130:
Process Process-132:
Process Process-136:
Process Process-137:
Process Process-133:
Process Process-142:
Process Process-139:
Process Process-141:
Process Process-143:
Process Process-140:
Process Process-138:
Process Process-134:
Process Process-131:
Process Process-144:
Process Process-135:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2

SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.


### B5

In [None]:
# Read every record of the B5 donor database into memory.
# NOTE(review): this file name lacks the "_new_ord-MUT" suffix used for B4 --
# confirm that it still provides the `mut` field read below.
in_file_b5 = DbCore.readDbFile('../new_seqs/B5_db-pass.tab_CON-FUN-N.tab')
all_seqs_b5 = list(in_file_b5)
# Naive set: subset 'N' with a `mut` value of exactly 0; memory set: MemA and MemG pooled.
naive_b5 = [rec for rec in all_seqs_b5 if rec.subset == 'N' and rec.mut == 0]
mem_b5 = [rec for rec in all_seqs_b5 if rec.subset in ('MemA', 'MemG')]

# Reduce both sets with remove_duplicate_junctions (records plus junction strings).
naive_filtered_b5, naive_filtered_junctions_b5 = remove_duplicate_junctions(naive_b5)
mem_filtered_b5, mem_filtered_junctions_b5 = remove_duplicate_junctions(mem_b5)

In [None]:
# Distances between the naive junctions of B4 and the naive junctions of B5.
# NOTE(review): `pdc2` is not bound by any import visible in this notebook --
# confirm which module provides this two-list distance_matrix_parallel.
X = pdc2.distance_matrix_parallel(naive_filtered_junctions, naive_filtered_junctions_b5, calcDist,condensed=False)

In [None]:
# Naive B4 vs naive B5.
# When X is square and its first entry is 0, add 1 along the diagonal before
# taking row minima; otherwise use X as it is.
if X.shape[0] == X.shape[1] and X[0,0] == 0:
    M = X + np.eye(X.shape[0])
else:
    M = X
dist2nearest = np.array([np.min(row) for row in M])

# Persist the unmodified matrix X.
np.save("dist2nearest_plots/distances_B4_B5_naive_norm", X)
np.savetxt("dist2nearest_plots/distances_B4_B5_naive_norm.csv", X,delimiter=',')

# Histogram of the row minima.
f = plt.figure(figsize=(20,10))
plt.hist(dist2nearest, bins=100)
plt.xlabel('Ham distance (normalised)')
plt.ylabel('Count')
plt.title("Distances between B4-B5 Naive")
plt.savefig("dist2nearest_plots/distances_B4_B5_naive_norm")

In [None]:
def filter_3(igs1, igs2, lim_mut1, lim_mut2, type_ig='Mem', donor1='B4', donor2='B5', bins=100):
    """Plot nearest-neighbour distances between mutation intervals of two sets.

    Takes the junctions of ``igs1`` with ``mut`` in ``[lim_mut1[0], lim_mut1[1])``
    and those of ``igs2`` with ``mut`` in ``[lim_mut2[0], lim_mut2[1])``,
    computes the distance matrix between the two lists with ``calcDist``,
    saves it as ``.npy`` and ``.csv``, and saves a histogram of each row's
    minimum.

    Parameters:
        igs1, igs2: iterables of records exposing ``junction`` and ``mut``.
        lim_mut1, lim_mut2: ``(low, high)`` mutation bounds (high excluded).
        type_ig: label used in the plot title and, lower-cased, in file names.
        donor1, donor2: donor labels used in the title and file names.
        bins: number of histogram bins.

    Returns:
        The matplotlib figure (already closed).
    """
    juncs1 = [re.sub(r'[.-]', 'N', str(ig.junction)) for ig in igs1 if lim_mut1[0] <= ig.mut < lim_mut1[1]]
    juncs2 = [re.sub(r'[.-]', 'N', str(ig.junction)) for ig in igs2 if lim_mut2[0] <= ig.mut < lim_mut2[1]]
    # NOTE(review): `pdc2` is not bound by any import visible in this notebook;
    # confirm which module provides this two-list distance_matrix_parallel.
    X = pdc2.distance_matrix_parallel(juncs1, juncs2, calcDist, condensed=False)
    fn = "dist2nearest_plots/distances_"+donor1+'_'+donor2+'_'+type_ig.lower()+'_'+str(lim_mut1[0])+'-'+str(lim_mut1[1])+'_vs_'+str(lim_mut2[0])+'-'+str(lim_mut2[1])+"_norm"
    np.save(fn, X)
    np.savetxt(fn+'.csv', X, delimiter=',')
    if np.size(X) == 0:
        # An empty interval on either side gives an empty matrix; X[0, 0] and
        # np.min over empty rows would raise, so plot no bars instead.
        dist2nearest = np.array([])
    else:
        # When X is square and its first entry is 0, add 1 along the diagonal
        # before taking row minima; otherwise use X unchanged.
        if X.shape[0] == X.shape[1] and X[0, 0] == 0:
            M = X + np.eye(X.shape[0])
        else:
            M = X
        dist2nearest = np.array([np.min(r) for r in M])
    f = plt.figure(figsize=(20,10))
    if dist2nearest.size:
        plt.hist(dist2nearest, bins=bins)
    plt.title("Distances between "+donor1+'-'+donor2+' '+type_ig+" "+str(lim_mut1[0])+"-"+str(lim_mut1[1])+"% and "+str(lim_mut2[0])+"-"+str(lim_mut2[1])+"%")
    plt.ylabel('Count')
    plt.xticks(np.linspace(0,1,21))
    plt.xlabel('Ham distance (normalised)')
    plt.savefig(fn)
    plt.close()
    return f

In [None]:
# B4 and B5 same mut level
# Keep only the pairs of bins from `sets` that start at the same lower bound.
combinations = [x for x in product(sets, sets) if x[0][0] == x[1][0]]
for i, j in combinations:
    f = filter_3(mem_filtered, mem_filtered_b5, i, j, type_ig='Mem', donor1='B4', donor2='B5', bins=100)
    # Only the first pair is processed: the loop exits after one iteration.
    break