In [None]:
%load_ext autoreload
%autoreload 2

# Maserati Data

In [None]:
from rdkit import Chem
import pandas as pd
import os, pickle, gzip, zipfile, shutil
import subprocess as sp

### Clean working directory

In [None]:
# Files under 'work' whose names end with one of these suffixes are deleted.
stale_suffixes = ('_charged.mol2', '_apo.pdb', '_apo.mol2')

# Collect every match first and delete afterwards, so the tree is not
# modified while os.walk is still traversing it.
names = [
    f'{folder}/{entry}'
    for folder, _subdirs, entries in os.walk('work')
    for entry in entries
    if entry.endswith(stale_suffixes)
]
for stale_path in names:
    os.unlink(stale_path)

### For GNN-DTI

In [None]:
def find_docked_dirs(work_root):
    """Return the directories under ``work_root`` that are marked as docked.

    A directory qualifies when it contains a file named ``status`` whose
    whole content is exactly the string ``'docked'``. The comparison is
    exact: surrounding whitespace or a trailing newline disqualifies it.

    Parameters
    ----------
    work_root : str
        Top of the directory tree to scan with ``os.walk``.

    Returns
    -------
    list of str
        Matching directory paths, in ``os.walk`` traversal order.
    """
    docked = []
    # os.walk yields (dirpath, dirnames, filenames); only dirpath is needed.
    for root, _dirs, _files in os.walk(work_root):
        status_path = f'{root}/status'
        if not os.path.exists(status_path):
            continue
        # Context manager so the handle is closed for every directory visited.
        with open(status_path, 'rt') as status_file:
            status = status_file.read()
        if status == 'docked':
            docked.append(root)
    return docked


valid_dirs = find_docked_dirs('work')
len(valid_dirs)

In [None]:
# Build the full GNN-DTI archive: for every docked, selected complex, stage the
# pickled (ligand, pocket) pair plus its supporting files, then tar the result.

# Staging directories. This cell deletes them when it finishes, so they are
# rebuilt from scratch here: leftovers from an interrupted run must not end up
# in the archive.
for staging_dir in ('2018', 'select'):
    shutil.rmtree(staging_dir, ignore_errors=True)
    os.makedirs(staging_dir, exist_ok=True)

df = pd.read_pickle('index_2019.pkl.gz')

# Number of complexes staged into each split.
counts = {'select': 0, '2018': 0}

for d in valid_dirs:
    # NOTE(review): assumes every docked directory path ends with the
    # 4-character PDB code - confirm against the layout of 'work'.
    pdb_code = d[-4:]

    # .item() raises unless exactly one index row matches this PDB code.
    r = df[df['pdb'] == pdb_code]
    year = r['year'].item()
    ligand_name = r['lig'].item()
    select = r['select'].item()

    if not select:
        continue

    # Entries from 2018 go to their own split; everything else goes to 'select'.
    if year == 2018:
        counts['2018'] += 1
        dest = f'2018/{pdb_code}'
    else:
        counts['select'] += 1
        dest = f'select/{pdb_code}'

    pocket_fname = f'{d}/{pdb_code}_pocket.pdb'
    pocket_mol = Chem.MolFromPDBFile(pocket_fname)
    ligand_fname = f'{d}/{ligand_name}.sdf'
    # Only the first record of the ligand SDF is used. next(..., None) gives
    # None for an empty file instead of silently reusing the ligand left over
    # from the previous directory. A None ligand is pickled as-is.
    ligand_mol = next(iter(Chem.SDMolSupplier(ligand_fname)), None)
    pair = (ligand_mol, pocket_mol)
    pair_dest_fname = f'{dest}/{pdb_code}_{ligand_name}_pair.pkl.gz'
    apo_fname = f'{d}/{pdb_code}_apo.pdb.gz'
    value_fname = f'{d}/value'
    data_fname = f'{d}/{pdb_code}_{ligand_name}_data.pkl.gz'
    rmsd_fname = f'{d}/rmsd'
    docked_fname = f'{d}/{pdb_code}_{ligand_name}_docked.sdf'

    os.makedirs(dest, exist_ok=True)
    # Close the gzip stream explicitly so the file is complete on disk before
    # tar reads it.
    with gzip.open(pair_dest_fname, 'wb') as pair_file:
        pickle.dump(pair, pair_file)
    for src_fname in (pocket_fname, apo_fname, value_fname,
                      data_fname, rmsd_fname, docked_fname):
        shutil.copy(src_fname, dest)

# check_call raises CalledProcessError if tar fails, so the staged data below
# is only deleted once the archive has actually been written.
sp.check_call(['tar', 'jcvf', 'cbidata_full.tar.bz2', '2018/', 'select/'])

shutil.rmtree('2018')
shutil.rmtree('select')
counts

### For GNN-DTI Essentials (Hasegawa selection)

In [None]:
# Build the reduced GNN-DTI archive: for every docked, selected complex, stage
# only the pickled (ligand, pocket) pair and its 'value' file, then tar the result.

# Staging directories. This cell deletes them when it finishes, so they are
# rebuilt from scratch here: leftovers from an interrupted run must not end up
# in the archive.
for staging_dir in ('2018', 'select'):
    shutil.rmtree(staging_dir, ignore_errors=True)
    os.makedirs(staging_dir, exist_ok=True)

df = pd.read_pickle('index_2019.pkl.gz')

# Number of complexes staged into each split.
counts = {'select': 0, '2018': 0}

for d in valid_dirs:
    # NOTE(review): assumes every docked directory path ends with the
    # 4-character PDB code - confirm against the layout of 'work'.
    pdb_code = d[-4:]

    # .item() raises unless exactly one index row matches this PDB code.
    r = df[df['pdb'] == pdb_code]
    year = r['year'].item()
    ligand_name = r['lig'].item()
    select = r['select'].item()

    if not select:
        continue

    # Entries from 2018 go to their own split; everything else goes to 'select'.
    if year == 2018:
        counts['2018'] += 1
        dest = f'2018/{pdb_code}'
    else:
        counts['select'] += 1
        dest = f'select/{pdb_code}'

    pocket_fname = f'{d}/{pdb_code}_pocket.pdb'
    pocket_mol = Chem.MolFromPDBFile(pocket_fname)
    ligand_fname = f'{d}/{ligand_name}.sdf'
    # Only the first record of the ligand SDF is used. next(..., None) gives
    # None for an empty file instead of silently reusing the ligand left over
    # from the previous directory. A None ligand is pickled as-is.
    ligand_mol = next(iter(Chem.SDMolSupplier(ligand_fname)), None)
    pair = (ligand_mol, pocket_mol)
    pair_dest_fname = f'{dest}/{pdb_code}_{ligand_name}_pair.pkl.gz'
    value_fname = f'{d}/value'

    os.makedirs(dest, exist_ok=True)
    # Close the gzip stream explicitly so the file is complete on disk before
    # tar reads it.
    with gzip.open(pair_dest_fname, 'wb') as pair_file:
        pickle.dump(pair, pair_file)
    shutil.copy(value_fname, dest)

# check_call raises CalledProcessError if tar fails, so the staged data below
# is only deleted once the archive has actually been written.
sp.check_call(['tar', 'jcvf', 'cbidata_gnn_dti.tar.bz2', '2018/', 'select/'])

shutil.rmtree('2018')
shutil.rmtree('select')
counts