In [1]:
import joblib
import pandas as pd

## Package installation from Jupyter

In [3]:
!pip install laplaciannb --upgrade

Collecting laplaciannb
  Using cached laplaciannb-0.4.1-py3-none-any.whl (6.0 kB)
Installing collected packages: laplaciannb
  Attempting uninstall: laplaciannb
    Found existing installation: laplaciannb 0.4
    Uninstalling laplaciannb-0.4:
      Successfully uninstalled laplaciannb-0.4
Successfully installed laplaciannb-0.4.1


In [4]:
from bayes.LaplacianNB import LaplacianNB

## Small utility function to convert a SMILES string into the set of indices of its positive fingerprint bits

In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem

def get_fp(smiles: str) -> set:
    """Compute the not-folded Morgan fingerprint (radius 2) of a molecule.

    Args:
        smiles (str): SMILES string describing the molecule.

    Returns:
        set: indices of the non-zero ('1') bits of the fingerprint.
            ``None`` is returned instead when ``Chem.MolFromSmiles`` yields
            a falsy result for ``smiles`` or when the fingerprint object
            itself evaluates as false.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule:
        fingerprint = AllChem.GetMorganFingerprint(molecule, 2)
        if fingerprint:
            on_bits = fingerprint.GetNonzeroElements()
            return set(on_bits.keys())

    # Either the SMILES could not be turned into a molecule or the
    # fingerprint was falsy: there are no bit indices to report.
    return None

## Create an example DataFrame

In [6]:
# Three example compounds given as SMILES strings, each with a binary activity label.
df = pd.DataFrame(
    {
        'smiles': [
            'N[C@]([H])(C)C(=O)O',
            'O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O',
            'CN=C=O',
        ],
        'activity': [1, 0, 0],
    }
)

In [7]:
df

Unnamed: 0,smiles,activity
0,N[C@]([H])(C)C(=O)O,1
1,O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O,0
2,CN=C=O,0


In [8]:
# One set of fingerprint bit indices per row; get_fp is passed directly
# because wrapping it in a lambda adds nothing.
df['sets'] = df['smiles'].apply(get_fp)

In [9]:
df

Unnamed: 0,smiles,activity,sets
0,N[C@]([H])(C)C(=O)O,1,"{2245273601, 2246728737, 2655406212, 153386432..."
1,O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O,0,"{2076190208, 864942730, 2900751504, 2458968089..."
2,CN=C=O,0,"{2246728737, 2245900962, 864942730, 3823506351..."


## Fit the classifier

In [10]:
X = df['sets']

In [11]:
y = df['activity']

In [12]:
clf = LaplacianNB()

In [13]:
clf.fit(X, y)

## Get the sum of feature log probabilities for each compound, per class [0, 1]

In [14]:
clf._joint_log_likelihood(X)

array([[-5.7550254,  4.920233 ],
       [ 2.962594 , -4.941602 ],
       [ 0.9315465, -1.5314839]], dtype=float32)

## Get probability of each class (sklearn implementation)

In [15]:
clf.predict_proba(X)

array([[2.3109160e-05, 9.9997705e-01],
       [9.9963105e-01, 3.6905482e-04],
       [9.2150915e-01, 7.8490861e-02]], dtype=float32)

## Get the predicted class of each compound (sklearn implementation)

In [16]:
clf.predict(X)

array([1, 0, 0])

## Get class names

In [17]:
clf.classes_

array([0, 1])

## Get the mapping from positive-bit index to feature space -> key: fingerprint bit index, value: index in the feature table (see below)

In [18]:
clf.feature_names_

{26234434: 0,
 847336149: 1,
 847957139: 2,
 864662311: 3,
 864674487: 4,
 864942730: 5,
 932712697: 6,
 951226070: 7,
 976134192: 8,
 994485099: 9,
 1135286194: 10,
 1310068516: 11,
 1510328189: 12,
 1510337516: 13,
 1516788326: 14,
 1517923320: 15,
 1533864325: 16,
 1879233475: 17,
 2038990649: 18,
 2076190208: 19,
 2245273601: 20,
 2245900962: 21,
 2246699815: 22,
 2246703798: 23,
 2246728737: 24,
 2458968089: 25,
 2549196227: 26,
 2599973650: 27,
 2625182169: 28,
 2655406212: 29,
 2900751504: 30,
 3011598321: 31,
 3026394695: 32,
 3217380708: 33,
 3218693969: 34,
 3537119515: 35,
 3725073659: 36,
 3823506351: 37,
 3855312692: 38,
 3945128999: 39,
 3975275337: 40,
 4046184955: 41}

## Get log probability per feature/index

In [19]:
clf.feature_log_prob_

array([[ 0.14884563,  0.14884563, -0.54430157, -0.20163734,  0.14884563,
        -0.05518642,  0.14884563,  0.14884563,  0.14884563,  0.14884563,
         0.14884563,  0.14884563, -0.54430157,  0.14884563,  0.14884563,
         0.14884563, -0.54430157,  0.14884563,  0.14884563,  0.14884563,
        -0.54430157,  0.14884563, -0.54430157,  0.14884563, -0.05518642,
         0.14884563,  0.14884563, -0.54430157, -0.54430157, -0.54430157,
         0.14884563,  0.14884563,  0.14884563,  0.14884563,  0.14884563,
        -0.54430157,  0.14884563,  0.14884563, -0.54430157,  0.14884563,
         0.14884563,  0.14884563],
       [-0.24419697, -0.24419697,  0.44895023,  0.25283533, -0.24419697,
         0.08894748, -0.24419697, -0.24419697, -0.24419697, -0.24419697,
        -0.24419697, -0.24419697,  0.44895023, -0.24419697, -0.24419697,
        -0.24419697,  0.44895023, -0.24419697, -0.24419697, -0.24419697,
         0.44895023, -0.24419697,  0.44895023, -0.24419697,  0.08894748,
        -0.24419