In [1]:
import pandas as pd
from rdkit import Chem 
from rdkit.Chem import Lipinski, Descriptors, rdMolDescriptors, AllChem, PandasTools
from pandarallel import pandarallel
import numpy as np
pandarallel.initialize(progress_bar=False)
import tmap as tm
from map4 import MAP4Calculator
import os
import joblib
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn import svm
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
folder = "/data/coconut/"

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
coconut = pd.read_pickle(folder + "coconut_prop.pkl")

In [3]:
origins = ["plants", "fungi", "bacteria", "animal", "Homo_sapiens", "marine", "other"]
def origin(simple_tax):
    if simple_tax in origins:
        return origins.index(simple_tax)


In [4]:
coconut["origin"] = coconut.simple_tax.map(origin)

In [5]:
def norm_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    smiles_n = Chem.MolToSmiles(mol, isomericSmiles=False)
    return smiles_n

coconut["norm_smiles"] = coconut["SMILES"].map(norm_smiles) 

In [6]:
coconut = coconut.query("origin != 6 and origin != 5 and origin != 4 and origin != 3").sort_values("origin", ascending=False)

In [7]:
coconut = coconut.drop_duplicates(subset="norm_smiles")


In [8]:
len(coconut.query("origin==0"))

33821

In [9]:
len(coconut)

60333

In [10]:
len(coconut.query("origin==1"))

15693

In [11]:
len(coconut.query("origin==2"))

10819

In [12]:
len(coconut.query("origin==3"))

0

In [13]:
len(coconut.query("origin==4"))

0

In [14]:
len(coconut.query("origin==5"))

0

In [15]:
coconut["Set"] = "test"
for i in range(6):
    print(i)
    coconut_training = coconut.query("origin == @i").sample(frac=0.50, random_state=12)
    coconut.loc[coconut_training.index, "Set"] = "training"


pd.to_pickle(coconut, folder + "coconut_4classifier_less_classes.pkl")

0
1
2
3
4
5


In [16]:
def map4_kernel_SVM(a, b=None):
    if b is None:
        b=a
    JS_all_pairs = np.zeros((len(a),len(b)))
    for i,fp1 in enumerate(a):
        for j,fp2 in enumerate(b):
            JS_all_pairs[i,j] = np.float(np.count_nonzero(fp1 == fp2)) / np.float(len(fp1))
    return JS_all_pairs

In [17]:
coconut.replace(["NaN", 'NaT'], np.nan, inplace = True)
coconut = coconut.dropna()

  mask = arr == x
  mask |= arr == x


In [18]:
# preparing features for training (X) and test (x)
X = np.array(coconut[coconut["Set"] == "training"].MAP4.to_list())
x = np.array(coconut[coconut["Set"] == "test"].MAP4.to_list())
# generate prediction for training (Y) and test(y) sets
Y = coconut[coconut["Set"] == "training"].origin.values
y = coconut[coconut["Set"] == "test"].origin.values
Y = np.array(Y, dtype=np.float)
y = np.array(y, dtype=np.float)

In [19]:
features = ['molecular_weight','number_of_carbons',\
       'number_of_nitrogens', 'number_of_oxygens',\
       'total_atom_number', 'bond_count', 'topoPSA', 'fcsp3', 'HBA',\
       'HBD', 'aLogP']

In [20]:
for feat in features:
    try:
        coconut_prop[f"{feat}"]=coconut_prop[f"{feat}"].map(float) 
    except:
        print(feat)
X_prop = np.array(coconut_prop[coconut_prop["Set"] == "training"][features].values)
x_prop = np.array(coconut_prop[coconut_prop["Set"] == "test"][features].values)
Y_prop = coconut_prop[coconut_prop["Set"] == "training"].origin.values
y_prop = coconut_prop[coconut_prop["Set"] == "test"].origin.values
Y_prop = np.array(Y_prop, dtype=np.float)
y_prop = np.array(y_prop, dtype=np.float)

molecular_weight
number_of_carbons
number_of_nitrogens
number_of_oxygens
total_atom_number
bond_count
topoPSA
fcsp3
HBA
HBD
aLogP


NameError: name 'coconut_prop' is not defined

In [None]:
base_clf = make_pipeline(svm.SVC(probability=False, class_weight="balanced", cache_size=1900, random_state=12))

In [None]:
if not os.path.exists(folder+"MAP4_classifier_eval.pkl"):
    # SVM optimization for ROC auc
    param_grid = {'svc__C': [0.1,1, 10, 100, 1000], \
                  'svc__kernel': [map4_kernel_SVM]}
    grid = GridSearchCV(base_clf, param_grid, scoring='balanced_accuracy',\
                        refit=True,verbose=2, n_jobs=7)
    grid.fit(X,Y)
    print(grid.best_estimator_)
    
    joblib.dump(grid, folder + "SVM-MAP4_classifier_eval.pkl")
else:
    grid = joblib.load(folder + "SVM-MAP4_classifier_eval.pkl")

In [None]:
y_predicted = grid.predict(x)

In [None]:
from itertools import product

import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.utils import check_matplotlib_support
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.base import is_classifier
import matplotlib

class ConfusionMatrixDisplay:
    """Confusion Matrix visualization.
    It is recommend to use :func:`~sklearn.metrics.plot_confusion_matrix` to
    create a :class:`ConfusionMatrixDisplay`. All parameters are stored as
    attributes.
    Read more in the :ref:`User Guide <visualizations>`.
    Parameters
    ----------
    confusion_matrix : ndarray of shape (n_classes, n_classes)
        Confusion matrix.
    display_labels : ndarray of shape (n_classes,)
        Display labels for plot.
    Attributes
    ----------
    im_ : matplotlib AxesImage
        Image representing the confusion matrix.
    text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, \
            or None
        Array of matplotlib axes. `None` if `include_values` is false.
    ax_ : matplotlib Axes
        Axes with confusion matrix.
    figure_ : matplotlib Figure
        Figure containing the confusion matrix.
    """
    def __init__(self, confusion_matrix, display_labels):
        self.confusion_matrix = confusion_matrix
        self.display_labels = display_labels

    def plot(self, include_values=True, cmap='viridis',
             xticks_rotation='horizontal', values_format=None, ax=None):
        """Plot visualization.
        Parameters
        ----------
        include_values : bool, default=True
            Includes values in confusion matrix.
        cmap : str or matplotlib Colormap, default='viridis'
            Colormap recognized by matplotlib.
        xticks_rotation : {'vertical', 'horizontal'} or float, \
                         default='horizontal'
            Rotation of xtick labels.
        values_format : str, default=None
            Format specification for values in confusion matrix. If `None`,
            the format specification is '.2g'.
        ax : matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is
            created.
        Returns
        -------
        display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
        """
        check_matplotlib_support("ConfusionMatrixDisplay.plot")
        import matplotlib.pyplot as plt

        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = ax.figure

        cm = self.confusion_matrix
        n_classes = cm.shape[0]
        self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap, vmin=0, vmax=1,)
        self.text_ = None

        cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256)

        if include_values:
            self.text_ = np.empty_like(cm, dtype=object)
            if values_format is None:
                values_format = '.0%'

            # print text with appropriate color depending on background
            thresh = (cm.max() + cm.min()) / 2.0
            for i, j in product(range(n_classes), range(n_classes)):
                color = cmap_max if cm[i, j] < thresh else cmap_min
                self.text_[i, j] = ax.text(j, i,
                                           format(cm[i, j], values_format),
                                           ha="center", va="center",
                                           color=color, size= 6 )

        
        colorbar = fig.colorbar(self.im_, ax=ax)
        colorbar.ax.tick_params(labelsize=6) 
        colorbar.set_ticks([0, .25, 0.5, .75, 1])
        colorbar.set_ticklabels(['0%', '25%', '50%','75%', '100%'])
        #colorbar.set_label("Predicted value/True Value")
        ax.set(xticks=np.arange(n_classes),
               yticks=np.arange(n_classes),
               xticklabels=self.display_labels,
               yticklabels=self.display_labels,
               ylabel="True label",
               xlabel="Predicted label")

        
        ax.set_ylim((n_classes - 0.5, -0.5))
        plt.setp(ax.get_xticklabels(), rotation=xticks_rotation, fontsize=6)
        plt.setp(ax.get_yticklabels(), fontsize=6)
        ax.xaxis.get_label().set_fontsize(6)
        ax.yaxis.get_label().set_fontsize(6)
        
        self.figure_ = fig
        self.ax_ = ax

        return self



def plot_confusion_matrix(y_pred, y_true, *, labels=None,
                          sample_weight=None, normalize=None,
                          display_labels=None, include_values=True,
                          xticks_rotation='horizontal',
                          values_format=None,
                          cmap='viridis', ax=None, colorbar=False):

    check_matplotlib_support("plot_confusion_matrix")


    cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight,
                          labels=labels, normalize=normalize)

    if display_labels is None:
        display_labels = labels

    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=display_labels)
    return disp.plot(include_values=include_values,
                     cmap=cmap, ax=ax, xticks_rotation=xticks_rotation,
                     values_format=values_format)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#sns.set_context("paper")
cm = 1/2.54  # centimeters in inches
fig, ax = plt.subplots(figsize=(6*cm, 4*cm))
m = plot_confusion_matrix(np.array(y_predicted, dtype=int), np.array(y, dtype=int), display_labels=["plants", "fungi", "bacteria"], normalize = "true", ax=ax)
plt.title("MAP4 SVM Classifier\nNormalized Confusion Matrix", fontsize=6)
plt.tight_layout()
plt.savefig("plots/confusion_matrix_normtrue_right_size.png", dpi=500)
plt.savefig("plots/confusion_matrix_normtrue_right_size.svg")

In [None]:
metrics.balanced_accuracy_score(y_predicted, y)

In [None]:
metrics.matthews_corrcoef(y_predicted, y)

In [None]:
metrics.f1_score(y_predicted, y, average="micro")