In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Maserati Data

In [None]:
from rdkit import Chem
import pandas as pd
import os, pickle, gzip, zipfile

In [None]:
# Collect every directory below work/ whose `status` file contains one of the
# accepted states. Directories without a `status` file are ignored.
valid_dirs = []
# os.walk yields (dirpath, dirnames, filenames); only dirpath is needed here.
for root, _dirnames, _filenames in os.walk('work'):
    status_path = f'{root}/status'
    if not os.path.isfile(status_path):
        continue
    # Use a context manager so the handle is closed for every directory visited.
    with open(status_path, 'rt') as status_file:
        # Strip surrounding whitespace so a trailing newline written by an
        # editor or `echo` does not cause the directory to be rejected.
        status = status_file.read().strip()
    if status in ('prep', 'matrix', 'docked'):
        valid_dirs.append(root)
len(valid_dirs)

### For GNN-DTI

In [None]:
# Build one archive per valid directory: tmp/<pdb_code>.zip containing the
# pickled (ligand_mol, pocket_mol) pair, the apo structure, the pocket PDB,
# the docked SDF, and the `rmsd` / `value` files.
count = 0
os.makedirs('tmp', exist_ok=True)
for d in valid_dirs:
    count += 1
    pdb_code = d[-4:]
    pocket_fname = f'{d}/{pdb_code}_pocket.pdb'
    # RDKit returns None (rather than raising) when the file cannot be parsed.
    pocket_mol = Chem.MolFromPDBFile(pocket_fname)

    # Sort the listing so the chosen files do not depend on filesystem order.
    entries = sorted(os.listdir(d))
    ligand_sdf = next(
        (f for f in entries if f.endswith('.sdf') and '_docked' not in f), None
    )
    docked_name = next((f for f in entries if f.endswith('_docked.sdf')), None)
    # Fail loudly instead of silently reusing the ligand / docked file that a
    # previous iteration left behind in the loop variables.
    if ligand_sdf is None:
        raise FileNotFoundError(f'no ligand .sdf file found in {d}')
    if docked_name is None:
        raise FileNotFoundError(f'no *_docked.sdf file found in {d}')

    ligand_name = ligand_sdf[:3]
    ligand_fname = f'{d}/{ligand_sdf}'
    # Only the first record of the SDF is used; None if the file has no records
    # or RDKit could not parse the first one.
    ligand_mol = next(iter(Chem.SDMolSupplier(ligand_fname)), None)
    docked_sdf = f'{d}/{docked_name}'

    print(count, pdb_code)

    pair_fname = f'tmp/{pdb_code}_{ligand_name}_pair.pkl.gz'
    pair_obj = (ligand_mol, pocket_mol)
    try:
        # Close the gzip stream before archiving it so the trailer is flushed
        # and the zip never receives a truncated file.
        with gzip.open(pair_fname, 'wb') as pair_file:
            pickle.dump(pair_obj, pair_file, protocol=4)

        # The context manager finalises the archive even if a write fails.
        with zipfile.ZipFile(f'tmp/{pdb_code}.zip', 'w') as zipf:
            zipf.write(pair_fname, arcname=os.path.basename(pair_fname))

            # Resolve sibling files relative to `d`, like every other path here.
            apo_pdbgz = f'{d}/{pdb_code}_apo.pdb.gz'
            zipf.write(apo_pdbgz, arcname=os.path.basename(apo_pdbgz))

            zipf.write(pocket_fname, arcname=os.path.basename(pocket_fname))

            zipf.write(docked_sdf, arcname=os.path.basename(docked_sdf))

            zipf.write(f'{d}/rmsd', arcname='rmsd')
            zipf.write(f'{d}/value', arcname='value')
    finally:
        # The pickle is only an intermediate; remove it even on failure.
        if os.path.exists(pair_fname):
            os.unlink(pair_fname)

### For GNN-DTI Essentials (Hasegawa selection)

In [None]:
# Build the reduced ("essentials") archives for directories whose `select`
# file holds a non-zero integer. Each archive contains only the pickled
# (ligand_mol, pocket_mol) pair and the `value` file. Entries whose index year
# is 2018 go to tmp_2018/, everything else to tmp/.
# NOTE(review): tmp/<pdb_code>.zip is the same path the full GNN-DTI export
# cell writes to, so running both cells overwrites archives - confirm that
# tmp/ is emptied or moved between runs.
count = 0
os.makedirs('tmp_2018', exist_ok=True)
os.makedirs('tmp', exist_ok=True)

# read_pickle unpickles the file, which can execute arbitrary code; only load
# an index file from a trusted source.
df = pd.read_pickle('index_2019.pkl.gz')

for d in valid_dirs:
    # Close the handle explicitly instead of leaking one per directory.
    with open(f'{d}/select', 'rt') as select_file:
        select = bool(int(select_file.read().strip()))
    if not select:
        continue
    count += 1
    pdb_code = d[-4:]
    # .item() raises unless exactly one index row matches this PDB code.
    year = df.loc[df['pdb'] == pdb_code, 'year'].item()
    pocket_fname = f'{d}/{pdb_code}_pocket.pdb'
    # RDKit returns None (rather than raising) when the file cannot be parsed.
    pocket_mol = Chem.MolFromPDBFile(pocket_fname)

    # Sort the listing so the chosen file does not depend on filesystem order.
    ligand_sdf = next(
        (f for f in sorted(os.listdir(d))
         if f.endswith('.sdf') and '_docked' not in f),
        None,
    )
    # Fail loudly instead of silently reusing the ligand that a previous
    # iteration left behind in the loop variables.
    if ligand_sdf is None:
        raise FileNotFoundError(f'no ligand .sdf file found in {d}')

    ligand_name = ligand_sdf[:3]
    ligand_fname = f'{d}/{ligand_sdf}'
    # Only the first record of the SDF is used; None if the file has no records
    # or RDKit could not parse the first one.
    ligand_mol = next(iter(Chem.SDMolSupplier(ligand_fname)), None)

    print(count, pdb_code, year)

    tmpdir = 'tmp_2018' if year == 2018 else 'tmp'

    pair_fname = f'{tmpdir}/{pdb_code}_{ligand_name}_pair.pkl.gz'
    pair_obj = (ligand_mol, pocket_mol)
    try:
        # Close the gzip stream before archiving it so the trailer is flushed
        # and the zip never receives a truncated file.
        with gzip.open(pair_fname, 'wb') as pair_file:
            pickle.dump(pair_obj, pair_file, protocol=4)

        # The context manager finalises the archive even if a write fails.
        with zipfile.ZipFile(f'{tmpdir}/{pdb_code}.zip', 'w') as zipf:
            zipf.write(pair_fname, arcname=os.path.basename(pair_fname))
            zipf.write(f'{d}/value', arcname='value')
    finally:
        # The pickle is only an intermediate; remove it even on failure.
        if os.path.exists(pair_fname):
            os.unlink(pair_fname)