In [1]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from scipy.spatial.distance import pdist, squareform, hamming

from rdkit.Chem import MolFromSmiles
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.DataManip.Metric.rdMetricMatrixCalc import GetTanimotoDistMat


def distance_matrix1234(X, len_X_train, cat_features = [], num_features = [], alphas = [1,1,1,1]):
    ''' inputs: matrix X [num_samples, num_features], 
                the list of the categorical features, 
                the list of the numerical features, 
                weights alphas: in position 0 there is the weight for Hamming for Categorical variable;
                                in position 1 there is the weight for Euclidean for Interval variable;
                                in position 2 there is the weight for Hamming for Pubchem2d;
                                in position 3 there is the weight for Tanimoto for SMILES.
                
        output: distance matrix
    '''
    ##################################################################
    ################ STAMPA DEI PESI #################################
    ##################################################################
    
    # Training
    X_cat = X[cat_features]
    X_num = X[num_features]
    
    if alphas[0] != 0:
        print('Inizio Hamming per variabili categoriche...')
        dist_matr = alphas[0] * squareform(pdist(X_cat, metric = "hamming"))
        
    if alphas[1] != 0:
        print('Fine Hamming per variabili categoriche... inizio Euclidean per variabili continue...')
        dist_matr += alphas[1] * squareform(pdist(X_num, metric = "euclidean"))
    
    
    
    if (alphas[2] != 0) & (alphas[0] != 0):
        print('Fine Euclidean per variabili continue... inizio Hamming su pubchem2d...')
        a = np.array((X.pubchem2d[0].replace('', ' ').strip().split(' '),
                  X.pubchem2d[1].replace('', ' ').strip().split(' ')))
        for i in range(2,len(X.pubchem2d)):
            a = np.concatenate((a,[X.pubchem2d[i].replace('', ' ').strip().split(' ')]))

        dist_matr += alphas[2] * squareform(pdist(a, metric = 'hamming'))
        
    elif (alphas[2] != 0) & (alphas[0] == 0):
        print('Inizio Hamming su pubchem2d... ')
        a = np.array((X.pubchem2d[0].replace('', ' ').strip().split(' '),
                  X.pubchem2d[1].replace('', ' ').strip().split(' ')))
        for i in range(2,len(X.pubchem2d)):
            a = np.concatenate((a,[X.pubchem2d[i].replace('', ' ').strip().split(' ')]))

        dist_matr = alphas[2] * squareform(pdist(a, metric = 'hamming'))
    
    
    
    if (alphas[3] != 0) & (alphas[0] != 0):
        print('Fine Hamming su pubchem2d ... inizio Tanimoto su SMILES...')
        dist_matr += alphas[3]*squareform(GetTanimotoDistMat([FingerprintMols.FingerprintMol(MolFromSmiles(X.smiles[i]))
                                                     for i in range(len(X.smiles))]))
    
    elif (alphas[3] != 0) & (alphas[0] == 0) & (alphas[2] != 0):
        print('Fine Hamming su pubchem2d... inizio Tanimoto su SMILES')
        dist_matr += alphas[3]*squareform(GetTanimotoDistMat([FingerprintMols.FingerprintMol(MolFromSmiles(X.smiles[i]))
                                                     for i in range(len(X.smiles))]))
    
    elif (alphas[3] != 0) & (alphas[0] == 0) & (alphas[2] == 0):
        print('Inizio Tanimoto su SMILES...')
        dist_matr = alphas[3]*squareform(GetTanimotoDistMat([FingerprintMols.FingerprintMol(MolFromSmiles(X.smiles[i]))
                                                     for i in range(len(X.smiles))]))
    
    
    print('Fine')
    
    dist_matr_train = dist_matr[:len_X_train,:len_X_train]
    dist_matr_test = dist_matr[len_X_train:,:len_X_train]

    return dist_matr_train, dist_matr_test

def import_data_encoded():
    db = pd.read_csv('dataset/db_modelli_smiles_pubchem.csv').drop(columns = 'Unnamed: 0')
    # codifica
    db['conc1_mean'] = np.where(db['conc1_mean'].values > 1, 1, 0)

    db = db.drop(columns = 'test_cas')
    # print(db.info())

    # Ordinal Encoding
    encoder = OrdinalEncoder(dtype = int)

    encoder.fit(db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']])

    db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']] = encoder.transform(
        db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']])+1
    
    # Divido il dataset completo tra dati e target
    X = db.drop(columns = 'conc1_mean')
    y = db['conc1_mean'].values

    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # ricongiungo train con test
    X_try = X_train.append(X_test)

    # tengo traccia della lunghezza del train set
    len_X_train = len(X_train) 

    return X_try, X_train, X_test, y_train, y_test, len_X_train

In [2]:
X_try, X_train, X_test, y_train, y_test, len_X_train = import_data_encoded()

# KNN best (lavoro precedente)

In [3]:
categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

# alphas presi di default [1,1,1,1]
X_train_new, X_test_new = distance_matrix1234(X_try, len_X_train, categorical,non_categorical,
                                              alphas = [0.0016102620275609393, 1, 0, 0])

Inizio Hamming per variabili categoriche...
Fine Hamming per variabili categoriche... inizio Euclidean per variabili continue...
Fine


In [4]:
neigh = KNeighborsClassifier(metric = 'precomputed', n_neighbors = 1, leaf_size = 70)
neigh.fit(X_train_new, y_train.ravel())
y_pred = neigh.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))


Model: KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform'), 
 Accuracy: 0.9048980122125504,
 RMSE: 0.30838610180656595


# KNN -- only Hamming pubchem2d

In [19]:
categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

X_train_new, X_test_new = distance_matrix1234(X_try, len_X_train, categorical,non_categorical,
                                              alphas = [0, 0, 1, 0])

Inizio Hamming su pubchem2d... 
Fine


In [25]:
neigh_3 = KNeighborsClassifier(metric = 'precomputed')
neigh_3.fit(X_train_new, y_train.ravel())
y_pred = neigh_3.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh_3,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))


Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'), 
 Accuracy: 0.5676237495127971,
 RMSE: 0.6575532301549456


# KNN only Tanimoto SMILES

In [3]:
categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

X_train_new, X_test_new = distance_matrix1234(X_try, len_X_train, categorical,non_categorical,
                                              alphas = [0, 0, 0, 1])

Inizio Tanimoto su SMILES...
Fine


In [4]:
neigh_4 = KNeighborsClassifier(metric = 'precomputed')
neigh_4.fit(X_train_new, y_train.ravel())
y_pred = neigh_4.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh_4,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'), 
 Accuracy: 0.5473561127712095,
 RMSE: 0.6727881443878083


# Prova tenere tutte le matrici in memoria

In [4]:
categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']
X_cat = X_try[categorical]
ham1 = squareform(pdist(X_cat, metric = "hamming"))

del X_cat, categorical

In [7]:
non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']
X_num = X_try[non_categorical]
euc2 = squareform(pdist(X_num, metric = "euclidean"))

del X_num, non_categorical

In [12]:
a = np.array((X_try.pubchem2d[0].replace('', ' ').strip().split(' '),
                  X_try.pubchem2d[1].replace('', ' ').strip().split(' ')))
for i in range(2,len(X_try.pubchem2d)):
    a = np.concatenate((a,[X_try.pubchem2d[i].replace('', ' ').strip().split(' ')]))

ham3 = squareform(pdist(a, metric = 'hamming'))

del a, i 

In [18]:
tan4 = squareform(GetTanimotoDistMat([FingerprintMols.FingerprintMol(MolFromSmiles(X_try.smiles[i]))
                                                     for i in range(len(X_try.smiles))]))

Ci stanno tutte e 4 in memoria ma sono molto dispendiose

In [26]:
print(tan4.shape)
print(ham3.shape)
print(euc2.shape)
print(ham1.shape)

(23322, 23322)
(23322, 23322)
(23322, 23322)
(23322, 23322)


# KNN 1+2 con funzione mia

In [4]:
from mem_distance_matrix import *

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']
non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

X_train_new, X_test_new = train_test_distances(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.0016102620275609393, 1, 0, 0], choice = [0,0])

In [6]:
neigh = KNeighborsClassifier(metric = 'precomputed', n_neighbors = 1, leaf_size = 70)
neigh.fit(X_train_new, y_train.ravel())
y_pred = neigh.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Model: KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform'), 
 Accuracy: 0.9048980122125504,
 RMSE: 0.30838610180656595


# KNN 1+2+3

In [3]:
from mem_distance_matrix import *

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']
non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

X_train_new, X_test_new = train_test_distances(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.0016102620275609393, 1, 1, 0], choice = [1,0])

Start...
You choose Hamming 1, Euclidean 2 and Hamming on pubchem2d 3...
Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Combining...
...FINISH


In [4]:
neigh123 = KNeighborsClassifier(metric = 'precomputed')
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'), 
 Accuracy: 0.8299337404183448,
 RMSE: 0.4123909062790488


# KNN 1+2+4

In [3]:
from mem_distance_matrix import *

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']
non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

X_train_new, X_test_new = train_test_distances(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.0016102620275609393, 1, 0, 1], choice = [0,1])

Start...
You choose Hamming 1, Euclidean 2, Tanimoto 4...
Start Hamming su categorical...
Start Euclidean...
Start Tanimoto...
Combining...
...FINISH


In [4]:
neigh124 = KNeighborsClassifier(metric = 'precomputed')
neigh124.fit(X_train_new, y_train.ravel())
y_pred = neigh124.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh124,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'), 
 Accuracy: 0.7305443679355593,
 RMSE: 0.5190911596862738
