In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Dataset

### Needs rdkit>=2020.03.3

In [None]:
import re

import rdkit

# Compare the release numerically.  A plain string comparison orders
# '2020.03.10' before '2020.03.3' and would reject a valid newer release.
MIN_RDKIT_VERSION = (2020, 3, 3)
# Only the leading year/month/patch numbers are compared; any extra numeric
# fields in the version string are ignored.
rdkit_version = tuple(int(part) for part in re.findall(r'\d+', rdkit.__version__)[:3])
assert MIN_RDKIT_VERSION <= rdkit_version, (
    f'rdkit>=2020.03.3 is required, found {rdkit.__version__}'
)

In [None]:
from rdkit import Chem
from cbiprep.ligand_expo import LigandExpo
from cbiprep.pdbatoms import PDBAtoms
from cbiprep.matrix import GetTopologicalMatrix
from cbiprep.atomtyper import AtomTyper, HybAtomTyper, GetAtomVector
from cbiprep.jupyter_utils import draw_mol, check_dir
import pandas as pd
import os, sys, gzip, pickle

In [None]:
# Instantiate the LigandExpo helper imported from cbiprep.ligand_expo.
# NOTE(review): `lig_expo` is not referenced by the cells visible here; the
# ligand/pocket preparation cell creates its own `ligexpo` instance. Confirm
# whether this cell is still needed.
lig_expo = LigandExpo()

In [None]:
# Load the index table from a pickle file. Later cells read its 'type',
# 'lig_ok', 'refined', 'pdb', 'lig' and 'pval' columns.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
df = pd.read_pickle('index_2019.pkl.gz')

In [None]:
# Keep the rows whose 'type' is 'Kd' and whose 'lig_ok' and 'refined' flags
# both compare equal to True.  The explicit `== True` comparisons are kept so
# the element-wise semantics match the original filter exactly.
is_kd = df['type'] == 'Kd'
ligand_ok = df['lig_ok'] == True
is_refined = df['refined'] == True
DF = df[is_kd & ligand_ok & is_refined]
DF

What about ligand_thres = 60 and pocket_thres = 340? The total dimension would then be 60 + 340 = 400.

### Create ligand and pocket

In [None]:
# Prepare one working directory per usable complex.  For every row of DF that
# has not been processed yet, work/<pdb_code>/ receives:
#   <pdb_code>_pocket.pdb   - pocket atoms within 4.0 of the ligand
#   <ligand_name>.sdf       - ligand molecule assigned via LigandExpo
#   <pdb_code>_apo.pdb.gz   - relevant protein atoms within 5.0 of the ligand
#   value                   - the row's 'pval'
#   status                  - the marker 'prep'
# Rows are skipped when the ligand is absent or too large, when the ligand
# assignment raises ValueError, or when the pocket is too large.
ligexpo = LigandExpo()

MAX_LIGAND_ATOMS = 60   # ligands with more atoms than this are skipped
# NOTE(review): the matrix cell uses pocket_thres=340; confirm that the
# stricter limit of 240 here is intended.
MAX_POCKET_ATOMS = 240  # pockets with more atoms than this are skipped

count = 0
for i in DF.index:
    r = DF.loc[i]
    pdb_code = r['pdb']
    ligand_name = r['lig']
    work_dir = f'work/{pdb_code}'
    status_path = f'{work_dir}/status'

    # Skip complexes that already reached this stage or a later one.
    if os.path.exists(status_path):
        with open(status_path, 'rt') as status_file:
            status = status_file.read()
    else:
        status = None
    if status in ['prep', 'matrix', 'docked']:
        continue

    value = r['pval'].item()
    pdb_atoms = PDBAtoms(f'pdb/{pdb_code}.pdb.gz')
    ligand_atoms = pdb_atoms.get_ligand(ligand_name)
    if len(ligand_atoms) == 0:
        continue
    if MAX_LIGAND_ATOMS < len(ligand_atoms):
        continue
    # NOTE(review): `smi` is not used below; the lookup is kept in case it is
    # relied on to fail for ligands unknown to LigandExpo - confirm.
    smi = ligexpo[ligand_name]
    try:
        ligand_mol = ligexpo.assign(ligand_atoms, ligand_name)
    except ValueError:
        continue
    pocket_atoms = pdb_atoms.get_pocket(ligand_atoms, thres=4.0)
    if MAX_POCKET_ATOMS < len(pocket_atoms):
        continue
    protein_atoms = pdb_atoms.get_relevant_protein(ligand_atoms, thres=5.0)
    count += 1
    print(count, pdb_code, ligand_name, len(ligand_atoms), len(pocket_atoms), len(protein_atoms))

    os.makedirs(work_dir, exist_ok=True)
    # Every output is written through a context manager (or try/finally) so
    # the data is flushed and the handle closed before the status marker is set.
    with open(f'{work_dir}/{pdb_code}_pocket.pdb', 'wt') as pocket_file:
        pocket_file.write(str(pocket_atoms))
    sdwriter = Chem.SDWriter(f'{work_dir}/{ligand_name}.sdf')
    try:
        sdwriter.write(ligand_mol)
    finally:
        sdwriter.close()
    with gzip.open(f'{work_dir}/{pdb_code}_apo.pdb.gz', 'wt') as apo_file:
        apo_file.write(str(protein_atoms))
    with open(f'{work_dir}/value', 'wt') as value_file:
        value_file.write(str(value))
    # The status marker goes last: its presence means all files above are complete.
    with open(status_path, 'wt') as status_file:
        status_file.write('prep')

### Create matrices

In [None]:
def find_dirs_with_status(top, wanted):
    """Return the directories under ``top`` whose ``status`` file equals ``wanted``.

    Parameters
    ----------
    top : str
        Root directory to walk (``top`` itself is examined too).
    wanted : str
        Exact content the ``status`` file must have for its directory to match.

    Returns
    -------
    list of str
        Matching directory paths, in the order ``os.walk`` visits them.
        Directories without a ``status`` file are ignored.
    """
    matches = []
    # os.walk yields (dirpath, dirnames, filenames); only dirpath is needed.
    for dirpath, _dirnames, _filenames in os.walk(top):
        status_path = f'{dirpath}/status'
        if not os.path.exists(status_path):
            continue
        with open(status_path, 'rt') as status_file:
            if status_file.read() == wanted:
                matches.append(dirpath)
    return matches


# Directories that finished the preparation step but have no matrices yet.
valid_dirs = find_dirs_with_status('work', 'prep')
valid_dirs

In [None]:
# For every prepared directory, rebuild the ligand and pocket atoms from the
# original PDB file, compute the topological matrix and the atom-type vector,
# and store both in <dir>/<pdb_code>_<ligand_name>_data.pkl.gz under the keys
# 'D' and 'A'.  On success the directory's status marker becomes 'matrix'.
# `count` is the number of directories visited; `ng_count` the number that failed.
LIGAND_THRES = 60   # ligand_thres passed to the matrix and atom-vector builders
POCKET_THRES = 340  # pocket_thres passed to the matrix and atom-vector builders

ng_count = 0
count = 0
for d in valid_dirs:
    count += 1
    status_path = f'{d}/status'
    if os.path.exists(status_path):
        with open(status_path, 'rt') as status_file:
            status = status_file.read()
    else:
        status = None
    if status in ['matrix', 'docked']:
        continue

    # Reset per directory so nothing carries over from the previous iteration
    # when an expected file is missing.
    pdb_code = pdb = ligand_name = ligand_sdf = None
    for fname in os.listdir(d):
        if fname.endswith('_apo.pdb.gz'):
            pdb_code = fname[:4]
            pdb = f'pdb/{pdb_code}.pdb.gz'
        # Only '<3-character name>.sdf' files are treated as the ligand file.
        if fname.endswith('.sdf') and len(fname) == 7:
            ligand_name = fname[:3]
            ligand_sdf = f'{d}/{fname}'
    if pdb is None or ligand_sdf is None:
        ng_count += 1
        print(ng_count, count, d, 'missing apo PDB or ligand SDF')
        continue

    pdb_atoms = PDBAtoms(pdb)
    ligand_atoms = pdb_atoms.get_ligand(ligand_name)
    pocket_atoms = pdb_atoms.get_pocket(ligand_atoms, thres=4.0)
    mat = GetTopologicalMatrix(ligand_atoms, pocket_atoms,
                               ligand_thres=LIGAND_THRES, pocket_thres=POCKET_THRES)
    # Take the first record of the SDF; None when the file holds no molecule.
    ligand_mol = next(iter(Chem.SDMolSupplier(ligand_sdf)), None)
    try:
        if ligand_mol is None:
            raise ValueError(f'no readable molecule in {ligand_sdf}')
        types_vec = GetAtomVector(ligand_mol, pocket_atoms, atomtyper=HybAtomTyper,
                                  ligand_thres=LIGAND_THRES, pocket_thres=POCKET_THRES)
    except Exception as exc:
        # Best effort: report the failure and move on.  `Exception` rather than
        # a bare except, so that KeyboardInterrupt still stops the loop.
        ng_count += 1
        print(ng_count, count, pdb_code, repr(exc))
        continue
    data = dict(A=types_vec, D=mat)
    with gzip.open(f'{d}/{pdb_code}_{ligand_name}_data.pkl.gz', 'wb') as data_file:
        pickle.dump(data, data_file, protocol=4)
    # Written only after the data file has been closed, so 'matrix' always
    # implies a complete pickle on disk.
    with open(status_path, 'wt') as status_file:
        status_file.write('matrix')
print(ng_count, count)