In [137]:
import subprocess
import tempfile
import os
import sys
from collections import defaultdict
from typing import Union

class Miner:
    def __init__(self, miner_path="./miner", base_file_name: Union[None, str] = None, verbose: bool = False):
        self.miner_path = miner_path
        self.verbose = verbose
        self.base_file_name = tempfile.mktemp() if not base_file_name else base_file_name
        self.input_file_name = self.base_file_name + "_input"
        self.substructure_file_name = self.base_file_name + "_output.txt"
        self.identifiers_file_name = self.base_file_name + "_ids.txt"


    def runButCooler(self, x: list[str], **kwargs):
        self.save_input_to_file(x)
        command = self.build_command(**kwargs)
        success = self.run_miner(command)
        if success is True:
            result = self.parse_output_but_cooler()
            return result

    def save_input_to_file(self, x: list[str]) -> None:
        with open(self.input_file_name, 'w') as input_file:
            for smile in x:
                line = f"{smile},0,{smile}\n"
                input_file.write(line)

    def build_command(self, **kwargs) -> list[str]:
        command = [self.miner_path, self.input_file_name]
        for key, value in kwargs.items():
            if isinstance(value, bool):
                if value:
                    command.append(f"-{key}")
            else:
                command.append(f"-{key}{value}")

        command.append(self.substructure_file_name)
        command.append(self.identifiers_file_name)

        return command

    def run_miner(self, command: list[str]) -> bool:
        try:
            result = subprocess.run(command, capture_output=self.verbose, text=True)
            result.check_returncode()

        except subprocess.CalledProcessError as e:
            print("Error running MoSS:", e.stderr)
            return False
        except Exception as e:
            print("An error occurred:", str(e))
            return False
        return True

    def parse_output_but_cooler(self) -> dict[str, list[str]]:
        result = defaultdict(list)
        with open(self.substructure_file_name, "r") as substructure_file, open(self.identifiers_file_name, "r") as identifiers_file:
            substucture_lines = substructure_file.read().strip().split("\n")[1:]
            molecular_lines = identifiers_file.read().strip().split("\n")[1:]

            assert(len(substucture_lines) == len(molecular_lines))

            for substructure_line, molecular_line in zip(substucture_lines, molecular_lines):
                entities = molecular_line.split(':')[1].split(',')
                smile = substructure_line.split(',')[1]
                for entity in entities:
                    result[entity].append(smile)

        return dict(result)






    def run(self, input_file, output_file=None, **kwargs):
        # Build the command with the provided arguments
        command = [self.miner_path, input_file]

        # Add additional command-line arguments
        for key, value in kwargs.items():
            if isinstance(value, bool):
                if value:
                    command.append(f"-{key}")
            else:
                command.append(f"-{key}{value}")

        # Add the output file if specified
        base_file_name = tempfile.mktemp() if not output_file else output_file
        substructure_file_name = base_file_name + "_output.txt"
        identifiers_file_name = base_file_name + "_ids.txt"

        command.append(substructure_file_name)
        command.append(identifiers_file_name)

        output = None
        try:
            # Run the command and capture the output
            # print(f"command being executed: {command}")
            result = subprocess.run(command, capture_output=True, text=True)
            result.check_returncode()  # Raise an error if the command failed

            # Read and parse the output file
            with open(substructure_file_name, "r") as substructure_file, open(identifiers_file_name, "r") as identifiers_file:
                output = self.parse_output(substructure_file.read(), identifiers_file.read()) # TODO refactor this somehow


        except subprocess.CalledProcessError as e:
            print("Error running MoSS:", e.stderr)
        except Exception as e:
            print("An error occurred:", str(e))
        finally:
            if not output_file and os.path.exists(substructure_file_name):
                os.remove(substructure_file_name)

            return output



    def parse_output(self, substucture_data, molecular_id_data):
        result = defaultdict(list)
        substucture_lines = substucture_data.strip().split("\n")[1:]
        molecular_lines = molecular_id_data.strip().split("\n")[1:]

        assert(len(substucture_lines) == len(molecular_lines))

        for substructure_line, molecular_line in zip(substucture_lines, molecular_lines):
            entities = molecular_line.split(':')[1].split(',')
            smile = substructure_line.split(',')[1]
            for entity in entities:
                result[entity].append(smile)

        return dict(result)


In [166]:
from skfp.datasets.moleculenet import load_bace
from skfp.model_selection import scaffold_train_test_split
smiles_list, y = load_bace()
mols_train, mols_test, y_train, y_test = scaffold_train_test_split(smiles_list, y, test_size=0.2)

In [167]:
len(mols_train)

1210

In [148]:
miner = Miner(base_file_name="test")
results = miner.runButCooler(mols_train, jS=True, s=5)

moss.Miner - molecular substructure miner (MoSS)
version 8.3 (2022.11.19)    (c) 2002-2022 Christian Borgelt
parsing seed description ... [1 atom(s), 0 bond(s)] done.
parsing excluded atom types ... [1 atom(s)] done.
reading molecules ... [1210 (1210+0) molecule(s)] done [0.013s].
marking bridges ... [1210 molecule(s)] done [0.001s].
masking atom and bond types ... [1210 molecule(s)] done [0.001s].
preparing/recoding molecules ... [1210 molecule(s)] done [0.002s].
embedding the seed ... [1210 (380+0) molecule(s)] done [0.0s].
searching for substructures ... [361 substructure(s)] done [4.226s].
search statistics:
maximum search tree height   : 30
number of search tree nodes  : 37793
number of created fragments  : 146943
number of created embeddings : 27292135
insufficient support pruning : 105409
perfect extension pruning    : 2830
equivalent sibling pruning   : 0
canonical form pruning       : 279
ring order pruning           : 0
duplicate fragment pruning   : 0
non-closed fragments   

In [175]:
xdata = zip(mols_train, y_train)

y_train_new = [y for x, y in xdata if x in results.keys()]

len(y_train_new)

380

In [177]:
len(results.keys())

380

In [178]:
y_train = y_train_new

In [179]:
from sklearn.preprocessing import MultiLabelBinarizer

vocabulary = list(set([substructure for x in results.values() for substructure in x]))

mlb = MultiLabelBinarizer(classes=vocabulary)
mlb.fit(vocabulary)

results_transformed = {key: mlb.transform([value])[0] for key, value in results.items()}

In [181]:
test_results = miner.runButCooler(mols_test, jS=True, s=5)
test_results_transformed = {key: mlb.transform([value])[0] for key, value in test_results.items()}

moss.Miner - molecular substructure miner (MoSS)
version 8.3 (2022.11.19)    (c) 2002-2022 Christian Borgelt
parsing seed description ... [1 atom(s), 0 bond(s)] done.
parsing excluded atom types ... [1 atom(s)] done.
reading molecules ... [303 (303+0) molecule(s)] done [0.003s].
marking bridges ... [303 molecule(s)] done [0.0s].
masking atom and bond types ... [303 molecule(s)] done [0.001s].
preparing/recoding molecules ... [303 molecule(s)] done [0.0s].
embedding the seed ... [303 (109+0) molecule(s)] done [0.0s].
searching for substructures ... [251 substructure(s)] done [0.459s].
search statistics:
maximum search tree height   : 30
number of search tree nodes  : 14728
number of created fragments  : 45286
number of created embeddings : 2455590
insufficient support pruning : 30057
perfect extension pruning    : 165
equivalent sibling pruning   : 0
canonical form pruning       : 284
ring order pruning           : 0
duplicate fragment pruning   : 0
non-closed fragments         : 14477


In [195]:
y_test_new = [y for x, y in zip(mols_test, y_test) if x in test_results.keys()]
y_test = y_test_new

In [138]:
results

{'S(=O)(=O)(N(C)c1cc(cc(c1)C(=O)NC(C)c1ccc(F)cc1)-c1nc([nH]n1)C([NH3+])(Cc1ccccc1)C)C': ['S(-N(-c1:c:c(-C):c:c(-C(-N-C(-C)-C)=O):c:1)-C)(-C)(=O)=O',
  'S(-N(-c(:c:c-C):c:c(-C(-N-C(-C)-C)=O):c)-C)(-C)(=O)=O',
  'S(-N(-c(:c:c(-C):c):c:c-C(-N-C(-C)-C)=O)-C)(-C)(=O)=O',
  'S(-N(-c1:c:c:c:c(-C(-N-C(-C)-C)=O):c:1)-C)(-C)(=O)=O',
  'S(-N(-c(:c:c):c:c-C(-N-C(-C)-C)=O)-C)(-C)(=O)=O',
  'S(-N(-c1:c:c:c:c:c:1)-C)(-C)(=O)=O',
  'S(-N(-c:c:c(-C(-N-C(-C)-C)=O):c:c)-C)(-C)(=O)=O',
  'S(-N(-c:c:c-C(-N-C(-C)-C)=O)-C)(-C)(=O)=O',
  'S(-N(-c:c:c:c:c-C(-N-C(-C)-C)=O)-C)(-C)(=O)=O',
  'S(-N(-c:c:c:c:c)-C)(-C)(=O)=O',
  'S(-N(-C)-C)(-C)(=O)=O',
  'S(-N-c1:c:c(-C):c:c(-C-N-C-C):c:1)(-C)(=O)=O',
  'S(-N-c(:c:c-C):c:c(-C-N-C-C):c)(-C)(=O)=O',
  'S(-N-c(:c:c(-C):c):c:c-C-N-C-C)(-C)(=O)=O',
  'S(-N-c1:c:c:c:c(-C(-N-C(-C)-C)=O):c:1)(-C)(=O)=O',
  'S(-N-c1:c:c:c:c(-C-N-C-C):c:1)(-C)(=O)=O',
  'S(-N-c(:c:c):c:c-C(-N-C(-C)-C)=O)(-C)(=O)=O',
  'S(-N-c(:c:c):c:c-C-N-C-C)(-C)(=O)=O',
  'S(-N-c1:c:c:c:c:c:1)(-C)(=O)=O',

In [182]:
list(results_transformed.values())[0].shape

(361,)

In [183]:
X_train = np.array(list(results_transformed.values()))
X_test = np.array(list(test_results_transformed.values()))

X_train.shape

(380, 361)

In [184]:
len(y_train)

380

In [194]:
(X_train.shape, len(y_train))

((380, 361), 380)

In [196]:
(X_test.shape, len(y_test))

((109, 361), 109)

# 

In [198]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred)

print(f"AUROC: {auroc:.2%}")

AUROC: 41.40%
