In [1]:
from fbs.bpfbs import *

In [2]:
import pickle

# Load the pre-computed table of binary slices from its pickle file.
# NOTE: pickle.load can execute arbitrary code -- only open pickles you trust.
fn_binary = 'pickled_df_count_slices.pkl'
with open(fn_binary, mode='rb') as pkl_file:
    df = pickle.load(pkl_file)

# preview the first rows
df.head()

Unnamed: 0,size,count,relfreq,pdb_code,b
1305030,25,31,6.5e-05,2vda,1011110111101111011110111
1304751,25,29,6e-05,2lfs,1011110111101111011110111
1304689,25,23,4.8e-05,2kz1,1011110111101111011110111
1305167,25,21,4.4e-05,6h5h,1011110111101111011110111
1304740,25,20,4.2e-05,2ld4,1011110111101111011110111


In [6]:
# Sanity check: store the Python type name of every entry of column 'b'
# and count how many rows fall under each type name.
df['type'] = df['b'].map(lambda value: type(value).__name__)

df.groupby('type')[['count']].count()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
str,2080220


In [3]:
# build the training / testing sample of pdb binary data
from sklearn.model_selection import train_test_split

seed = 42

# the split is made over the distinct pdb_codes, so all rows of a given
# pdb instance land on the same side of the split
unique_pdb_codes = df['pdb_code'].unique()
pdb_train, pdb_test = train_test_split(unique_pdb_codes, train_size=0.8, random_state=seed)

# keep the training rows, merge identical binaries that appear in different
# pdb instances by summing their absolute (count) and relative (relfreq)
# frequencies, then order by size and count, largest first
in_train = df['pdb_code'].isin(pdb_train)
df_train = (
    df[in_train]
    .groupby(['b', 'size'])
    .agg({'count': 'sum', 'relfreq': 'sum'})
    .reset_index()
    .sort_values(by=['size', 'count'], ascending=False)
    [['size', 'count', 'relfreq', 'b']]
)

states = list(df_train['b'])
p = list(df_train['relfreq'])


# Add every binary state of length 5 that never shows up in the training sample.
# The probability mass not covered by the observed length-5 states is split
# uniformly among the missing ones, so that the length-5 frequencies sum to 1.
from itertools import product

state_len = 5

# observed states of the target length, with their relative frequencies
states_len_5 = []
p_len_5 = []
for s, prob in zip(states, p):
    if len(s) == state_len:
        states_len_5.append(s)
        p_len_5.append(prob)

if len(states_len_5) < 2 ** state_len:
    # The states are '0'/'1' strings, so the universe has to be built from strings
    # too. product(range(2), ...) yields tuples of ints, which never compare equal
    # to a string: every state would be reported as missing and appended as a tuple.
    all_bin_len_5 = {''.join(bits) for bits in product('01', repeat=state_len)}

    # sorted -> the appended states come in the same order on every run
    # (set iteration order of strings changes between interpreter sessions)
    missing_states_len_5 = sorted(all_bin_len_5 - set(states_len_5))

    # guard against a division by zero when nothing is actually missing
    if missing_states_len_5:
        # built-in sum: no dependency on a numpy name that this cell never imports
        missing_prob_uniform = float(1 - sum(p_len_5)) / float(len(missing_states_len_5))

        for s in missing_states_len_5:
            states.append(s)
            p.append(missing_prob_uniform)

    

# quick look at the head of both lists
print('states: ', states[:5])
print('p: ', p[:5])

# persist both lists together as one pickled (states, p) tuple
fn = 'pickled_states.pkl'
with open(fn, mode='wb') as out_file:
    pickle.dump((states, p), out_file)


# df_la = df.groupby(['b', 'size']).agg({'count': 'sum', 'relfreq': 'sum'}).reset_index()
# df_la.sort_values(by=['size', 'count'], inplace=True, ascending=False)
# df_la = df_la[['size', 'count', 'relfreq', 'b']]

states:  ['1011110111101111011110111', '1101110111101111011110111', '1011111011101111011110111', '1101111011110111101111011', '1011110111100111011110111']
p:  [0.0048952141005486555, 0.001796924614535725, 0.0016532539327246416, 0.0016407608299584603, 0.0015720487647444639]


In [20]:
import pickle

# Load the two pickled containers that generate_tests indexes by file name.
# NOTE: pickle.load can execute arbitrary code -- only open pickles you trust.
with open('pickled_binary.pkl', mode='rb') as binary_file:
    wd_binary = pickle.load(binary_file)

with open('pickled_original_sol.pkl', mode='rb') as sol_file:
    wd_original_sol = pickle.load(sol_file)

In [21]:
# generate tests when the binary and xsol information come from pickled files
import pandas as pd
from tqdm import tqdm

def generate_tests(pdb_test):
    """Build the table of test instances from the pickled binary / solution data.

    Reads the module-level containers ``wd_binary`` (key -> DataFrame with the
    columns 'order', 'N_1', 'N_2' and 'N_3') and ``wd_original_sol``
    (key -> DataFrame holding the text lines of a ``.sol`` file).

    Parameters
    ----------
    pdb_test : iterable of str
        pdb codes to keep. An entry is kept when the text before the first '_'
        of its file name is one of these codes.

    Returns
    -------
    pandas.DataFrame
        One row per kept entry with the columns 'fn' (key without the
        'binary\\' prefix), 'order', 'parents' (list of [N_1, N_2, N_3] int
        lists, -1 where the value was missing) and 'xsol' (numpy array with one
        row of floats per parsed line).
    """
    M = {'fn': [], 'order': [], 'parents': [], 'xsol': []}

    pdb_test = set(pdb_test)
    for key in tqdm(wd_binary):
        fn_binary = key.replace('binary\\', '')
        pdb_code = fn_binary.split('_')[0]

        if pdb_code not in pdb_test:
            continue

        # fillna returns a new frame: the frame cached inside wd_binary is left
        # untouched, so re-running the cell always starts from the same data
        df_binary = wd_binary[key].fillna(-1)

        M['fn'].append(fn_binary)
        M['order'].append(df_binary['order'].values)
        # one [N_1, N_2, N_3] list per row of the binary table
        parent_cols = [df_binary[col].astype(int).values for col in ('N_1', 'N_2', 'N_3')]
        M['parents'].append([list(triple) for triple in zip(*parent_cols)])

        # The column labels of the solution frame are parsed as the first row of
        # coordinates (apparently the first line of the .sol file was taken as the
        # header when the frame was created); the remaining rows follow.
        fn_xsol = 'original_sol\\' + fn_binary.replace('_binary.csv', '.sol')
        df_sol = wd_original_sol[fn_xsol]
        xsol = [[float(x) for label in df_sol.columns for x in label.split()]]
        xsol += [[float(x) for x in line[0].split()] for line in df_sol.values]
        M['xsol'].append(np.array(xsol))

    return pd.DataFrame(M)

# build the test table for the pdb codes in pdb_test (the held-out side of the split)
M = generate_tests(pdb_test)
# preview the first rows
M.head()

100%|██████████| 73649/73649 [00:23<00:00, 3165.98it/s]


Unnamed: 0,fn,order,parents,xsol
0,1a03_model1_chainA_segment0_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-12.449, 7.053, 5.066], [-11.098, 6.511, 5.1..."
1,1a03_model1_chainA_segment1_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-11.299, -7.574, 2.87], [-11.467, -8.365, 1...."
2,1a03_model1_chainA_segment2_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-10.069, -18.068, 5.889], [-9.382, -16.777, ..."
3,1a03_model1_chainA_segment3_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-7.242, -12.744, -21.486], [-7.716, -11.455,..."
4,1a03_model1_chainA_segment4_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[1.536, -0.32, -6.37], [1.878, -0.516, -7.772..."


In [5]:
# generate tests when the binary and xsol information come from directories
import numpy as np
import pandas as pd
from tqdm import tqdm

# directories holding the '*_binary.csv' files and the matching '.sol' files
wd_binary, wd_original_sol = 'binary', 'original_sol'

def generate_tests(pdb_test):
    """Build the table of test instances by reading the files on disk.

    Reads every entry of the directory named by the module-level ``wd_binary``
    and, for each kept entry, the matching ``.sol`` file in the directory named
    by ``wd_original_sol``.

    Parameters
    ----------
    pdb_test : iterable of str
        pdb codes to keep. A file is kept when the text before the first '_'
        of its name is one of these codes.

    Returns
    -------
    pandas.DataFrame
        One row per kept file with the columns 'fn' (path of the binary csv),
        'order', 'parents' (list of [N_1, N_2, N_3] int lists, -1 where the
        value was missing) and 'xsol' (array returned by ``numpy.loadtxt``).
    """
    # local import: the cell that defines this function does not import os
    import os

    M = {'fn': [], 'order': [], 'parents': [], 'xsol': []}

    pdb_test = set(pdb_test)
    # sorted: os.listdir returns the entries in arbitrary order, which would make
    # the row order of the resulting table depend on the file system
    for name in tqdm(sorted(os.listdir(wd_binary))):
        pdb_code = name.split('_')[0]

        if pdb_code not in pdb_test:
            continue

        fn_binary = os.path.join(wd_binary, name)
        df_binary = pd.read_csv(fn_binary).fillna(-1)

        M['fn'].append(fn_binary)
        M['order'].append(df_binary['order'].values)
        # one [N_1, N_2, N_3] list per row of the binary table
        parent_cols = [df_binary[col].astype(int).values for col in ('N_1', 'N_2', 'N_3')]
        M['parents'].append([list(triple) for triple in zip(*parent_cols)])

        # '<name>_binary.csv' -> '<name>.sol' inside the solutions directory
        fn_xsol = os.path.join(wd_original_sol, name.replace('_binary.csv', '.sol'))
        M['xsol'].append(np.loadtxt(fn_xsol))

    return pd.DataFrame(M)

# build the test table for the pdb codes in pdb_test (the held-out side of the split)
M = generate_tests(pdb_test)
# preview the first rows
M.head()

100%|██████████| 73649/73649 [04:35<00:00, 267.16it/s]


Unnamed: 0,fn,order,parents,xsol
0,binary\1a03_model1_chainA_segment0_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-12.449, 7.053, 5.066], [-11.098, 6.511, 5.1..."
1,binary\1a03_model1_chainA_segment1_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-11.299, -7.574, 2.87], [-11.467, -8.365, 1...."
2,binary\1a03_model1_chainA_segment2_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-10.069, -18.068, 5.889], [-9.382, -16.777, ..."
3,binary\1a03_model1_chainA_segment3_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[-7.242, -12.744, -21.486], [-7.716, -11.455,..."
4,binary\1a03_model1_chainA_segment4_binary.csv,"[0, 1, 3, 2, 7, 4, 5, 8, 6, 12, 9, 10, 13, 11,...","[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1], [0,...","[[1.536, -0.32, -6.37], [1.878, -0.516, -7.772..."


In [6]:
import plotly.express as px

# size of an instance = number of rows of its xsol array
M['size'] = M['xsol'].map(lambda coords: coords.shape[0])

# per size, count the non-null entries of every other column;
# the 'fn' count is used as the bar height
size_groups = M.groupby('size')
df_ = size_groups.count().reset_index()

fig = px.bar(df_, x='size', y='fn')
fig.show()

In [7]:
# creating all the ddgp instances associated with the test sample
import os

# make sure the output directory of create_ddgp exists (no error if it already does)
os.makedirs('ddgp', exist_ok=True)


def create_ddgp(row):
    """Build the DDGP instance of one test row and pickle it under ``ddgp/``.

    Parameters
    ----------
    row : mapping (e.g. one row of ``M``)
        Must provide:
        'order'   -- ``order[k]`` is the original index of the atom placed at
                     DDGP position ``k``;
        'parents' -- triples of original atom indices, each index is translated
                     to its DDGP position;
        'xsol'    -- coordinates, indexed by original atom index;
        'fn'      -- name of the source binary file, used to name the output.

    Raises
    ------
    ValueError
        If the number of atoms does not fit the layout
        3 leading atoms + k blocks of 5 atoms + 1 trailing atom.

    Side effects
    ------------
    Writes the pickled ``DDGP`` object to
    ``ddgp/<basename of fn, 'binary' -> 'ddgp', '.csv' -> '.pkl'>``.
    """
    order = row['order']
    xsol = row['xsol']
    n = len(order)

    # The atoms follow the following ddgp order:
    # N^1, CA^1, HA^1, ..., C^{j-1}, H^j, N^j, CA^j, HA^j, ..., C^nres
    if n < 4 or (n - 4) % 5 != 0:
        raise ValueError(
            'unexpected number of atoms %d: expected 3 + 5*k + 1 (fn=%s)' % (n, row['fn'])
        )

    def dist(a, b):
        """Norm of the coordinate difference of the atoms at DDGP positions a and b."""
        # norm is expected to come from the notebook's star import
        # (presumably numpy.linalg.norm -- TODO confirm)
        return norm(xsol[order[a]] - xsol[order[b]])

    # set permutation from original indices to ddgp indices
    permutation = np.zeros(n, dtype=int)
    for k, i in enumerate(order):
        permutation[i] = k

    # NOTE(review): a missing parent is stored as -1 upstream, so permutation[-1]
    # (the DDGP position of the last original atom) is used for it here. Kept as
    # is because it is unclear whether DDGP reads those entries -- confirm.
    parents = [[permutation[i] for i in v] for v in row['parents']]

    d = {}
    # exact distances between the first three atoms
    for i in range(3):
        d[i] = {j: dist(i, j) for j in range(i - 1, -1, -1)}

    # DDGP positions of the hydrogens found inside the blocks
    # NOTE(review): HA^1 (position 2) is not added to this list -- confirm intended.
    H = []
    # exact distances associated with the atoms of each block
    # ( C^{j-1}, H^j, N^j, CA^j, HA^j ), with j in {2,n}
    for i in range(3, n - 1, 5):
        # i: C^{j-1}
        d[i] = {i - 1: dist(i, i - 1), i - 2: dist(i, i - 2), i - 3: dist(i, i - 3)}

        # i+1: H^j
        d[i + 1] = {i: dist(i + 1, i), i - 1: dist(i + 1, i - 1), i - 2: dist(i + 1, i - 2)}
        H.append(i + 1)

        # i+2: N^j
        d[i + 2] = {i + 1: dist(i + 2, i + 1), i: dist(i + 2, i), i - 2: dist(i + 2, i - 2)}

        # i+3: CA^j
        d[i + 3] = {
            i + 2: dist(i + 3, i + 2),
            i + 1: dist(i + 3, i + 1),
            i: dist(i + 3, i),
            i - 2: dist(i + 3, i - 2),
        }

        # i+4: HA^j
        d[i + 4] = {
            i + 3: dist(i + 4, i + 3),
            i + 2: dist(i + 4, i + 2),
            i + 1: dist(i + 4, i + 1),
        }
        H.append(i + 4)

    # distances associated with the atom C^nres: it is always the last position,
    # so it is addressed directly instead of through the leaked loop variable
    # (which pointed outside the arrays when there was no block at all)
    last = n - 1
    d[last] = {
        last - 1: dist(last, last - 1),
        last - 2: dist(last, last - 2),
        last - 3: dist(last, last - 3),
    }

    # distances between hydrogen atoms that are not adjacent in H and that are
    # at most 5 angstroms apart
    for a in range(len(H)):
        for b in range(a - 2):
            dab = dist(H[a], H[b])
            if dab <= 5:
                # a and b are positions inside H: the distance belongs to the
                # atoms H[a] and H[b]. Indexing d with a and b stored it on
                # unrelated atoms and overwrote exact distances such as d[3][0].
                d[H[a]][H[b]] = dab

    # symmetrize (iterate over a snapshot of the keys, d is edited in the loop)
    for i in range(n):
        for j in list(d[i]):
            d[j][i] = d[i][j]

    D = DDGP(d, parents)

    # pickle DDGP
    fn_ddgp = os.path.basename(row['fn']).replace('binary', 'ddgp').replace('.csv', '.pkl')
    fn_ddgp = os.path.join('ddgp', fn_ddgp)
    with open(fn_ddgp, 'wb') as f:
        pickle.dump(D, f)
        

# build and pickle one DDGP instance per test row; on failure print the row
# label, the error and the file name before propagating the error
for row_idx, test_row in tqdm(list(M.iterrows())):
    try:
        create_ddgp(test_row)
    except Exception as exc:
        print(row_idx, exc)
        print(test_row['fn'])
        raise exc

100%|██████████| 14751/14751 [00:38<00:00, 382.97it/s]


In [6]:
import pandas as pd
import numpy as np
import plotly.express as px

# Load the speed-up measurements and drop the rows with missing values.
# reset_index() without drop=True keeps the previous row labels in an 'index'
# column, which is therefore also written to the output file below.
df_speedup = pd.read_excel('df_speedup.xlsx')
df_speedup = df_speedup.dropna().reset_index()
# df_speedup = df_speedup[df_speedup['speed_up'] != 1].reset_index()

# max_speedup = df_speedup['speed_up'].max()
# max_speedup = max_speedup - 1.0
# df_speedup['speed_up'] = df_speedup.apply(lambda x : float(1.0 + ((x.loc['speed_up'] - 1.0) * 9.0 / max_speedup)) if x.loc['speed_up'] > 1 else x.loc['speed_up'], axis=1)

# log10 of the speed-up, computed on the whole column at once instead of
# through a row-by-row apply
df_speedup['speed_up'] = np.log10(df_speedup['speed_up'].astype(float))

df_speedup.to_excel('df_speedup_without_none.xlsx', index=False)

fig = px.histogram(df_speedup, x="speed_up", nbins=50)
fig.show()

SyntaxError: invalid syntax (1743905887.py, line 2)