In [48]:
import json
import numpy as np
from pathlib import Path
from numba import jit
from datetime import datetime, timedelta
from collections import Counter
from typing import Union

from overprot.libs import lib_alignment
from overprot.libs import lib_pymol
from overprot.libs.lib_structure import Structure
from overprot.libs.lib import Timing

In [3]:
import tree_stuff as ts

# Size of the domain selection; also appears in the names of the data files read below.
n = 885
domains = np.array(ts.get_domains(n))

# choice_{n}.txt is read as whitespace-separated tokens; each token is split on '/'
# and its second part is kept as the family ID of the corresponding domain.
# NOTE(review): assumes the tokens are in the same order as `domains` - confirm.
choice_tokens = (ts.DATA / f'choice_{n}.txt').read_text().split()
families = np.array([token.split('/')[1] for token in choice_tokens])

# Number of selected domains per family.
fam_counts = Counter(families)

In [108]:
def distance_badness(distance_matrix: Union[np.ndarray, Path], domains, families, verbose=False):
    """Measure how often the nearest neighbour of a domain belongs to a different family.

    For every domain `i` the nearest other domain `nn` is found (smallest distance in
    row `i`, the diagonal being excluded) and the family IDs of `i` and `nn` are compared.
    Family IDs are dot-separated strings; a mismatch is additionally classified by how
    many leading parts differ.

    Parameters
    ----------
    distance_matrix : np.ndarray or path
        Square n x n distance matrix, or a text file readable by `np.loadtxt`.
        An array passed in is never modified.
    domains : sequence of length n
        Domain names (only used for `n` and for the verbose report).
    families : sequence of str of length n
        Family ID of each domain, in the same order as `domains`.
    verbose : bool
        If true, print one tab-separated line per mismatched domain:
        domain, nearest neighbour, family (size), neighbour's family (size).
        Family sizes are counted from `families` itself.

    Returns
    -------
    tuple
        `(bad, bad_T, bad_A, bad_C, good_indices, bad_indices)` where the first four
        items are fractions of all domains whose nearest neighbour is in a different
        family / differs in the first 3 parts (Topology) / first 2 parts
        (Architecture) / first part (Class), and the last two are lists of domain
        indices with matching / mismatching nearest-neighbour family.

    Raises
    ------
    ValueError
        If `domains` is empty, or the lengths of `domains`, `families` and the
        shape of the distance matrix do not agree.
    """
    n = len(domains)
    if n == 0:
        raise ValueError('domains must not be empty')
    if len(families) != n:
        raise ValueError(f'families has length {len(families)}, expected {n}')
    if isinstance(distance_matrix, np.ndarray):
        distances = distance_matrix
    else:
        # ndmin=2 keeps a 1x1 matrix two-dimensional
        distances = np.loadtxt(distance_matrix, ndmin=2)
    if distances.shape != (n, n):
        raise ValueError(f'distance matrix has shape {distances.shape}, expected {(n, n)}')

    # Work on a copy so the caller's matrix stays intact; non-float input is converted
    # to float because np.inf cannot be stored in an integer array.
    if np.issubdtype(distances.dtype, np.floating):
        distances_wo_self = distances.copy()
    else:
        distances_wo_self = distances.astype(float)
    np.fill_diagonal(distances_wo_self, np.inf)  # a domain must not be its own nearest neighbour
    nns = np.argmin(distances_wo_self, axis=1)  # nearest neighbour of each domain

    fam_parts = [fam.split('.') for fam in families]  # split once instead of per comparison
    good = []
    bad = []
    bad_T = []  # subset of bad, put in wrong Topology
    bad_A = []  # subset of bad_T, put in wrong Architecture
    bad_C = []  # subset of bad_A, put in wrong Class
    for i, nn in enumerate(nns):
        if families[i] == families[nn]:
            good.append(i)
            continue
        bad.append(i)
        if fam_parts[i][:3] != fam_parts[nn][:3]:
            bad_T.append(i)
        if fam_parts[i][:2] != fam_parts[nn][:2]:
            bad_A.append(i)
        if fam_parts[i][:1] != fam_parts[nn][:1]:
            bad_C.append(i)

    if verbose:
        # Count family sizes locally rather than relying on a notebook-level global.
        fam_counts = Counter(families)
        for i in bad:
            nn = nns[i]
            print(domains[i], domains[nn], f'{families[i]}   ({fam_counts[families[i]]})', f'{families[nn]}   ({fam_counts[families[nn]]})', sep='\t')
    return len(bad)/n, len(bad_T)/n, len(bad_A)/n, len(bad_C)/n, good, bad

# Names of the precomputed distance matrices to evaluate; each one is read from
# distance_{n}x{n}-{version}.csv in ts.DATA.
versions = 'lengthmax lengthmin maxrmsd3 maxrmsd5 maxrmsd7 maxrmsd8 maxrmsd10 maxrmsd12 maxrmsd15 maxrmsd20 op-exp5 op-exp7 op-exp10 op-exp15 op-exp18 op-exp20 op-exp22 op-exp25 op-exp30 op-lin5 op-lin7 op-lin10 op-lin12 op-lin15 op-lin17 op-lin20 sop-maxrmsd7-lin15 sop-maxrmsd7-exp20 sop-maxrmsd7*lin15 sop-maxrmsd7*exp20'.split()
for version in versions:
    # One row per version: fraction of domains with a nearest neighbour in a different
    # family / Topology / Architecture / Class.
    bH, bT, bA, bC, *_ = distance_badness(ts.DATA / f'distance_{n}x{n}-{version}.csv', domains, families)
    print(f'{version:20}', *('{:.4f}'.format(b) for b in (bH, bT, bA, bC)), sep='\t')

# Number of families represented by a single domain, and their share of all n domains
# (derived from the data instead of hard-coded numbers).
# NOTE(review): presumably shown as a lower bound on the first column, since such a
# domain has no same-family neighbour to find - confirm.
n_singletons = sum(1 for count in fam_counts.values() if count == 1)
n_singletons, n_singletons / n

lengthmax           	0.9955	0.9955	0.9955	0.7525
lengthmin           	0.8531	0.8045	0.7311	0.6000
maxrmsd3            	0.3322	0.3254	0.3153	0.2565
maxrmsd5            	0.1898	0.1785	0.1503	0.1096
maxrmsd7            	0.1672	0.1401	0.1119	0.0531
maxrmsd8            	0.1661	0.1379	0.1073	0.0452
maxrmsd10           	0.1718	0.1345	0.1073	0.0395
maxrmsd12           	0.1864	0.1492	0.1164	0.0362
maxrmsd15           	0.2215	0.1774	0.1367	0.0463
maxrmsd20           	0.2565	0.2113	0.1582	0.0554
op-exp5             	0.1164	0.0904	0.0757	0.0644
op-exp7             	0.0927	0.0644	0.0486	0.0373
op-exp10            	0.0712	0.0452	0.0294	0.0192
op-exp15            	0.0667	0.0395	0.0249	0.0147
op-exp18            	0.0644	0.0384	0.0249	0.0124
op-exp20            	0.0633	0.0362	0.0226	0.0113
op-exp22            	0.0633	0.0362	0.0226	0.0090
op-exp25            	0.0667	0.0384	0.0282	0.0124
op-exp30            	0.0723	0.0384	0.0316	0.0147
op-lin5             	0.1729	0.1571	0.1469	0.1232
op-lin7             

(3, 0.003389830508474576)

In [102]:
# Evaluate linear mixes of the maxrmsd7 distance with several OP-based distances.
sw = 0.5  # weight of the maxrmsd7 distance; the OP-based distance gets (1 - sw)

# The maxrmsd7 matrix is shared by all mixes, so it is loaded only once.
maxrmsd7_distance = np.loadtxt(ts.DATA / f'distance_{n}x{n}-maxrmsd7.csv')

for op_version in ('op-lin10', 'op-lin15', 'op-exp20', 'op-exp22'):
    op_distance = np.loadtxt(ts.DATA / f'distance_{n}x{n}-{op_version}.csv')
    mixed_distance = sw * maxrmsd7_distance + (1-sw) * op_distance
    bH, bT, bA, bC, *_ = distance_badness(mixed_distance, domains, families)
    print(*('{:.4f}'.format(b) for b in (bH, bT, bA, bC)), sep='\t')

# After the loop `mixed_distance` holds the last mix (maxrmsd7 + op-exp22),
# exactly as it did when the four evaluations were written out one by one.

0.0723	0.0475	0.0350	0.0181
0.0701	0.0395	0.0271	0.0147
0.0746	0.0463	0.0282	0.0068
0.0780	0.0486	0.0294	0.0068


In [62]:
# Disabled variant kept for reference: the same verbose report for the plain maxrmsd7
# distance, followed by a separator line.
# distance_badness(ts.DATA / f'distance_{n}x{n}-maxrmsd7.csv', domains, families, verbose=True)
# print('-'*50)
# Print one line per domain whose nearest neighbour lies in a different family.
# NOTE(review): `mixed_distance` is whatever an earlier-run cell last assigned;
# the execution counts are not sequential, so confirm which mix this report refers to.
distance_badness(mixed_distance, domains, families, verbose=True)
None  # make None the cell's last expression so the returned tuple is not echoed

2m8eA00	2r0qC02	1.10.10.10   (15)	1.10.10.60   (7)
2o3fA00	1jtxA01	1.10.10.10   (15)	1.10.10.60   (7)
4q5sF02	1jtxA01	1.10.10.10   (15)	1.10.10.60   (7)
1d0kA02	1xydA00	1.10.530.10   (11)	1.10.238.10   (8)
3qjkA00	4q5sF02	1.10.238.10   (8)	1.10.10.10   (15)
1jtxA01	4q5sF02	1.10.10.60   (7)	1.10.10.10   (15)
1tf0A01	1zqzA01	1.10.246.10   (1)	1.10.150.20   (12)
3proA01	1bibA03	2.40.10.10   (26)	2.30.30.100   (2)
5t6fA01	3uytA01	2.40.10.10   (26)	3.30.200.20   (49)
1pfsA00	1cskA00	2.40.50.140   (10)	2.30.30.40   (5)
1vciA03	1ckaA00	2.40.50.140   (10)	2.30.30.40   (5)
5ffhA00	5gioE01	2.40.128.20   (7)	3.30.200.20   (49)
1q43A02	1jtxA01	2.60.120.10   (7)	1.10.10.60   (7)
1bhgA01	3zoaA03	2.60.120.260   (6)	2.60.40.1180   (8)
6aj0B00	2opkA01	2.60.120.20   (3)	2.60.120.10   (7)
1bibA03	2j6oA00	2.30.30.100   (2)	2.30.30.40   (5)
5gioE01	1nziA02	3.30.200.20   (49)	2.10.25.10   (5)
3iimA00	1ffsA00	3.40.50.300   (38)	3.40.50.2300   (9)
3o8cA03	4hktA01	3.40.50.300   (38)	3.40.50.720   (36)
5j40A03	

In [64]:
# Inspect the neighbourhood of one domain (5gioE01) under the mixed distance.
distance = mixed_distance

# Row index of the query domain within `domains`.
i_5gio = np.flatnonzero(domains == '5gioE01')[0]
print(i_5gio)

# All domains ranked by their distance from the query; the query itself comes first.
hits = [
    f'{d:.3f}\t{dom}\t{fam}'
    for d, dom, fam in sorted(zip(distance[i_5gio], domains, families))
]
print('\n'.join(hits[:20]))

# NOTE(review): this value is computed but neither stored nor displayed.
np.argmin(distance[i_5gio])

# NOTE(review): the NaN check looks at row 0, not at row i_5gio - confirm this is intended.
nans = np.where(np.isnan(distance[0]))
nans, domains[nans], families[nans]

424
0.000	5gioE01	3.30.200.20
22.064	1nziA02	2.10.25.10
24.788	2j6oA00	2.30.30.40
25.092	1ckaA00	2.30.30.40
25.325	1cskA00	2.30.30.40
26.092	1iqgL00	2.10.25.10
26.290	1bibA03	2.30.30.100
26.527	2phbB00	2.10.25.10
26.659	2r3gA01	3.30.200.20
26.828	2bq6A00	2.10.25.10
26.984	2krnA00	2.30.30.40
27.135	4q5eA01	3.30.200.20
27.431	5y20A00	3.30.40.10
27.944	4fktA01	3.30.200.20
28.287	3uytA01	3.30.200.20
29.193	1y8yA01	3.30.200.20
29.243	5hgiA01	3.30.200.20
29.391	1vciA03	2.40.50.140
29.562	6hm6A01	3.30.200.20
29.632	2evaA01	3.30.200.20


((array([], dtype=int64),), array([], dtype='<U7'), array([], dtype='<U13'))

In [67]:
# Load the saved rotation and translation arrays of the 20x20 op-lin10 run and
# display their shapes and value ranges as a quick sanity check.
rotations = np.load(ts.DATA / 'rotations_20x20-op-lin10.npy')
translations = np.load(ts.DATA / 'translations_20x20-op-lin10.npy')
(
    rotations.shape,
    translations.shape,
    rotations.max(),
    rotations.min(),
    translations.max(),
    translations.min(),
)

((20, 20, 3, 3),
 (20, 20, 1, 3),
 1.0000000000000004,
 -0.9999609197964008,
 21.194041055790912,
 -23.839824719511864)

In [14]:
def seconds(s):
    """Return a `timedelta` lasting `s` seconds."""
    duration = timedelta(seconds=s)
    return duration

In [18]:
# Extrapolate a measured runtime to a 6565 x 6565 matrix, scaling with the square of the size.
# The order "divide first, then multiply" is kept on purpose: timedelta division rounds
# to whole microseconds, so reordering would change the printed value.
for measured, size in ((seconds(25), 200), (seconds(363), 1000)):
    print(measured / size**2 * 6565**2)

7:28:57.015625
4:20:45.018675


In [59]:
# Back-of-envelope runtime extrapolations, scaling with the square of the matrix size.
# timedelta division rounds to whole microseconds, so the operation order matters.
# 25 s at size 100, scaled up to size 885.
print(seconds(25) /100**2 *885**2)
# NOTE(review): the next two expressions are evaluated but neither printed nor displayed
# (they are not the cell's last expression); their meaning is not stated - confirm or remove.
1378/7280*100, 3170/7280*100
885-388
# 12 min at size 885, scaled up to size 6565.
print(timedelta(minutes=12) /885**2 *6565**2)
# Measured timings noted by hand:
# distance matrix 200x200 (shapedist-maxrmsd7): 51 s
# distance matrix 200x200 (shapedist-maxrmsd7): 47 s (precomputed shapes)
# distance matrix 200x200 (shapedist-maxrmsd7): 42 s (precomputed shapes, omitting center point of the shape)
# distance matrix 200x200 (op-r10): 12:25
# distance matrix 885x885 (op-r10): 2:56:00

0:32:38.062500
11:00:08.187775


In [84]:
# Per-pair cost estimate. The cell now uses its own `nd` throughout instead of silently
# depending on the notebook-level `n`, so it runs on a fresh kernel.
nd = 885  # size of the matrix the timings below refer to
n_calc = nd * (nd+1) // 2  # number of unordered pairs including self-pairs
# Total time / number of pairs * 6565.
# NOTE(review): presumably the cost of comparing one domain against 6565 others - confirm.
print('shapedist', timedelta(minutes=12) / n_calc *6565)
print('opdist', timedelta(hours=2, minutes=56) / n_calc *6565)

shapedist 0:00:12.053340
opdist 0:02:56.828275


In [28]:
op-exp20  	0.0633	0.0362	0.0226	0.0113
op-exp22  	0.0633	0.0362	0.0226	0.0090
op-lin15  	0.0610	0.0362	0.0237	0.0158
op-lin17  	0.0633	0.0373	0.0237	0.0147

[  0   1   2   3   4 100   6   7   8   9]
[[  0   1   2   3   4 100   6   7   8   9]]
