In [1]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from sequence_distance_matrix import *
from mem_distance_matrix import *

db = pd.read_csv('dataset/db_modelli_smiles_pubchem.csv').drop(columns = 'Unnamed: 0')
# codifica
db['conc1_mean'] = np.where(db['conc1_mean'].values > 1, 1, 0)

db = db.drop(columns = 'test_cas')

# Ordinal Encoding
encoder = OrdinalEncoder(dtype = int)

encoder.fit(db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']])

db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']] = encoder.transform(
    db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']])+1

# Divido il dataset completo tra dati e target
X = db.drop(columns = 'conc1_mean')
y = db['conc1_mean'].values

# splitting
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# K = 1
Parto da \alpha_1 = 0.00161, \alpha_2 = 1, \alpha_3 = 0.

## Cerco alpha per pubchem2d

In [2]:
# due ore circa--> con 30 alpha
c = [1,0]
pub = np.logspace(-3, 0, 30) 

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params(X_train, y_train, categorical, non_categorical,
                                                    sequence_pub = pub, choice = c, leaf_size = range(60,90,10))

Wed Jul 22 03:24:51 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...

 Wed Jul 22 03:25:10 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.001
Wed Jul 22 03:30:45 2020
Start CV...
New best params found! alpha:0.001, k:1, leaf:60,
                                                        acc:  0.892288, st.error:  0.0028596615184318702,
                                                        rmse: 0.3280793068895179, st.error:  0.004357992395975092
New best params found! alpha:0.001, k:1, leaf:70,
                                                        acc:  0.8961279999999998, st.error:  0.0030181954873732034,
                                                        rmse: 0.32215474943730094, st.error:  0.004698867283449149

 Wed Jul 22 03:33:05 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.0012689610031679222
Wed Jul 22 03:37:13 2020
Start CV...
New best params found! alpha:0.0012689610031679222, k:1, leaf:70,
                                                        acc:

In [19]:
from mem_distance_matrix import *

# Risultati
best_alpha = 0.0727895384398315 
best_k = 1
best_leaf = 60

len_X_train = len(X_train) 

X_try = X_train.append(X_test)

X_train_new, X_test_new = fast_dist_mat(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.0016102620275609393, 1,
                                                         best_alpha, 0], choice = [1,0])

neigh123 = KNeighborsClassifier(metric = 'precomputed', n_neighbors = best_k, leaf_size = best_leaf)
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Model: KNeighborsClassifier(algorithm='auto', leaf_size=60, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform'), 
 Accuracy: 0.9052877744575809,
 RMSE: 0.3077535142649376


## CV -- Accuracy/RMSE/AUC

In [2]:
from sklearn.metrics import roc_auc_score
from time import ctime

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_alpha = 0.0727895384398315 
best_k = 1
best_leaf = 60

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical)
print('Adding pubchem2d', ctime())
dist_matr += best_alpha * pubchem2d_matrix(X)

Wed Jul 22 13:20:23 2020
Wed Jul 22 13:34:57 2020


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [3]:
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True)
accs = []
rmse = []
auc = []
for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    auc.append(roc_auc_score(y_test, y_pred))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

avg_auc = np.mean(auc)
se_auc = sem(auc)
print(ctime())

Wed Jul 22 13:37:00 2020
Wed Jul 22 13:50:34 2020


In [8]:
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Accuracy: 0.9062690393591929, se: 0.00220786432932479
RMSE:     0.3060689847977713, se: 0.0036310186473209444
AUC:      0.8985896903200068, se: 0.0021606746083646243


## Cerco best alpha per hamming 1 tenendo i risultati precedenti per pubchem2d

In [5]:
c = [1,0,0]
ham = np.linspace(0.001,0.002,20)
a_pub = 0.0727895384398315

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params_new(X_train, y_train, categorical, non_categorical,
                                                        sequence_ham = ham, a_pub = a_pub,
                                                        choice = c, ks = [1], leaf_size = range(60,91,10))

Sat Jul 25 12:50:18 2020
START...
Computing Euclidean and Pubchem2d Matrix...

 Sat Jul 25 12:54:20 2020
Adding Hamming 1 (Categorical)... alpha = 0.001
Sat Jul 25 12:58:16 2020
Start CV...
New best params found! alpha:0.001, k:1, leaf:60,
                                                        acc:  0.831936, st.error:  0.004304446073538377,
                                                        rmse: 0.4098228536802952, st.error:  0.005225624396648336
New best params found! alpha:0.001, k:1, leaf:70,
                                                        acc:  0.834112, st.error:  0.0029460862173398853,
                                                        rmse: 0.40722837677170765, st.error:  0.003641742435980182

 Sat Jul 25 12:59:12 2020
Adding Hamming 1 (Categorical)... alpha = 0.0010526315789473684
Sat Jul 25 13:03:07 2020
Start CV...
New best params found! alpha:0.0010526315789473684, k:1, leaf:80,
                                                        acc:  0.835008, st.e

In [6]:
from mem_distance_matrix import *

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

# Risultati
best_alpha = 0.0015789473684210526 
best_k = 1
best_leaf = 70

len_X_train = len(X_train) 

X_try = X_train.append(X_test)
print(ctime())
X_train_new, X_test_new = fast_dist_mat(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [best_alpha, 1,
                                                         0.0727895384398315, 0], choice = [1,0])
print(ctime())


neigh123 = KNeighborsClassifier(metric = 'precomputed', n_neighbors = 1, leaf_size = 70)
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Sat Jul 25 14:41:05 2020
Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Sat Jul 25 14:54:02 2020
Model: KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform'), 
 Accuracy: 0.9052877744575809,
 RMSE: 0.3077535142649376


## CV

In [2]:
from sklearn.metrics import roc_auc_score
from time import ctime

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

a_ham = 0.0015789473684210526
a_pub = 0.0727895384398315 
best_k = 1
best_leaf = 70

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
dist_matr += a_pub * pubchem2d_matrix(X)
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True, random_state = 5645)
accs = []
rmse = []
auc = []
for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    auc.append(roc_auc_score(y_test, y_pred))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

avg_auc = np.mean(auc)
se_auc = sem(auc)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Basic Matrix... Sat Jul 25 15:01:22 2020
Start Hamming su categorical...
Start Euclidean...
Adding pubchem2d Sat Jul 25 15:02:38 2020
Start Hamming su Pubchem2d...
Sat Jul 25 15:10:13 2020
Sat Jul 25 15:28:57 2020 

Accuracy: 0.9057543217162219, se: 0.001915720845614272
RMSE:     0.3069316771379575, se: 0.0031074045865791262
AUC:      0.8976377020799833, se: 0.0016731825438612504


## Ricerco ancora per pubchem2d

In [5]:
c = [0,1,0]
pub = np.linspace(0.01,0.1,30)
a_ham = 0.0015789473684210526

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params_new(X_train, y_train, categorical, non_categorical,
                                                        sequence_pub = pub, a_ham = a_ham,
                                                        choice = c, ks = [1], leaf_size = range(60,91,10))

Mon Jul 27 17:25:27 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...

 Mon Jul 27 17:25:45 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.01
Mon Jul 27 17:29:43 2020
Start CV...
New best params found! alpha:0.01, k:1, leaf:60,
                                                        acc:  0.892352, st.error:  0.0028435020661149593,
                                                        rmse: 0.32798264714745107, st.error:  0.004341174096674622
New best params found! alpha:0.01, k:1, leaf:70,
                                                        acc:  0.8962559999999999, st.error:  0.0029751396605873707,
                                                        rmse: 0.3219598590079401, st.error:  0.004632741833605954
New best params found! alpha:0.01, k:1, leaf:90,
                                                        acc:  0.89664, st.error:  0.0022193332332031634,
                                                        rmse: 0.32142241043346254, st.error:  0.00345

In [6]:
from mem_distance_matrix import *

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

# Risultati
best_alpha = 0.016206896551724137 
best_k = 1
best_leaf = 80

len_X_train = len(X_train) 

X_try = X_train.append(X_test)
print(ctime())
X_train_new, X_test_new = fast_dist_mat(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.0015789473684210526, 1,
                                                         best_alpha, 0], choice = [1,0])
print(ctime())


neigh123 = KNeighborsClassifier(metric = 'precomputed', n_neighbors = best_k, leaf_size = best_leaf)
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Mon Jul 27 20:00:12 2020
Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Mon Jul 27 20:16:10 2020
Model: KNeighborsClassifier(algorithm='auto', leaf_size=80, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform'), 
 Accuracy: 0.9055476159542679,
 RMSE: 0.3073310658650246


# CV

In [2]:
from sklearn.metrics import roc_auc_score
from time import ctime

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

a_ham = 0.0015789473684210526
a_pub = 0.016206896551724137 
best_k = 1
best_leaf = 80

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
dist_matr += a_pub * pubchem2d_matrix(X)
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True, random_state = 5645)
accs = []
rmse = []
auc = []
for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    auc.append(roc_auc_score(y_test, y_pred))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

avg_auc = np.mean(auc)
se_auc = sem(auc)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Basic Matrix... Mon Jul 27 20:18:58 2020
Start Hamming su categorical...
Start Euclidean...
Adding pubchem2d Mon Jul 27 20:20:06 2020
Start Hamming su Pubchem2d...
Mon Jul 27 20:27:41 2020
Mon Jul 27 20:53:32 2020 

Accuracy: 0.9055828318984298, se: 0.001944026299565914
RMSE:     0.3072092898502924, se: 0.0031472341527616823
AUC:      0.8975293305137708, se: 0.0017282183728871917


# K = 3

In [4]:
c = [1,0]
pub = np.logspace(-3, 0, 30) 

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params(X_train, y_train, categorical, non_categorical,
                                                    sequence_pub = pub, choice = c, ks = [3],
                                                    leaf_size = range(60,90,10))

Sat Jul 25 17:17:40 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...

 Sat Jul 25 17:17:57 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.001
Sat Jul 25 17:21:52 2020
Start CV...
New best params found! alpha:0.001, k:3, leaf:60,
                                                        acc:  0.8867200000000001, st.error:  0.0013462837739496075,
                                                        rmse: 0.33654730139121186, st.error:  0.001994613138732066

 Sat Jul 25 17:22:39 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.0012689610031679222
Sat Jul 25 17:26:32 2020
Start CV...
New best params found! alpha:0.0012689610031679222, k:3, leaf:60,
                                                        acc:  0.88704, st.error:  0.0024890158697766602,
                                                        rmse: 0.3360127335601656, st.error:  0.003722998973447065
New best params found! alpha:0.0012689610031679222, k:3, leaf:70,
                                             

In [5]:
from mem_distance_matrix import *

best_alpha = 0.005298316906283708
best_k = 3
best_leaf = 80

len_X_train = len(X_train) 

X_try = X_train.append(X_test)

X_train_new, X_test_new = fast_dist_mat(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.0016102620275609393, 1,
                                                         best_alpha, 0], choice = [1,0])

neigh123 = KNeighborsClassifier(metric = 'precomputed', n_neighbors = best_k, leaf_size = best_leaf)
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Model: KNeighborsClassifier(algorithm='auto', leaf_size=80, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform'), 
 Accuracy: 0.8942445108483825,
 RMSE: 0.32520069057678447


## CV

In [2]:
from sklearn.metrics import roc_auc_score
from time import ctime

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

a_ham = 0.0016102620275609393
a_pub = 0.005298316906283708 
best_k = 3
best_leaf = 80

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical, a_ham)
print('Adding pubchem2d...', ctime())
dist_matr += a_pub * pubchem2d_matrix(X)
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True, random_state = 5645)
accs = []
rmse = []
auc = []
for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    auc.append(roc_auc_score(y_test, y_pred))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

avg_auc = np.mean(auc)
se_auc = sem(auc)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Basic Matrix... Mon Jul 27 12:44:32 2020
Start Hamming su categorical...
Start Euclidean...
Adding pubchem2d... Mon Jul 27 12:46:17 2020
Start Hamming su Pubchem2d...
Mon Jul 27 12:59:55 2020
Mon Jul 27 13:11:01 2020 

Accuracy: 0.9042540340001362, se: 0.003090439448746397
RMSE:     0.30926581880645526, se: 0.0050154590477032995
AUC:      0.8960891603426262, se: 0.0036815179534981732


## Cerco best alpha per hamming 1 tenendo i risultati precedenti per pubchem2d

In [4]:
c = [1,0,0]
ham = np.linspace(0.001,0.002,20)
a_pub = 0.005298316906283708

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params_new(X_train, y_train, categorical, non_categorical,
                                                        sequence_ham = ham, a_pub = a_pub,
                                                        choice = c, ks = [3], leaf_size = range(60,91,10))

Mon Jul 27 13:15:02 2020
START...
Computing Euclidean and Pubchem2d Matrix...

 Mon Jul 27 13:19:50 2020
Adding Hamming 1 (Categorical)... alpha = 0.001
Mon Jul 27 13:24:54 2020
Start CV...
New best params found! alpha:0.001, k:3, leaf:60,
                                                        acc:  0.8464640000000001, st.error:  0.0029905879020687524,
                                                        rmse: 0.39176264633195723, st.error:  0.0038088364030831492
New best params found! alpha:0.001, k:3, leaf:70,
                                                        acc:  0.849344, st.error:  0.006703522656036896,
                                                        rmse: 0.38775660444978394, st.error:  0.008672019741296864

 Mon Jul 27 13:26:20 2020
Adding Hamming 1 (Categorical)... alpha = 0.0010526315789473684
Mon Jul 27 13:32:02 2020
Start CV...

 Mon Jul 27 13:33:24 2020
Adding Hamming 1 (Categorical)... alpha = 0.0011052631578947368
Mon Jul 27 13:38:04 2020
Start CV...
Ne

In [2]:
categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_alpha = 0.002 
best_k = 3
best_leaf = 70

len_X_train = len(X_train) 

X_try = X_train.append(X_test)
print(ctime())
X_train_new, X_test_new = fast_dist_mat(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [best_alpha, 1,
                                                         0.005298316906283708, 0], choice = [1,0])
print(ctime())


neigh123 = KNeighborsClassifier(metric = 'precomputed', n_neighbors = best_k, leaf_size = best_leaf)
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Mon Jul 27 15:34:00 2020
Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Mon Jul 27 15:46:03 2020
Model: KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform'), 
 Accuracy: 0.8942445108483825,
 RMSE: 0.32520069057678447


# CV

In [2]:
from sklearn.metrics import roc_auc_score
from time import ctime

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

a_ham = 0.002 
a_pub = 0.005298316906283708 
best_k = 3
best_leaf = 70

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical, a_ham)
print('Adding pubchem2d...', ctime())
dist_matr += a_pub * pubchem2d_matrix(X)
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True, random_state = 5645)
accs = []
rmse = []
auc = []
for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    auc.append(roc_auc_score(y_test, y_pred))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

avg_auc = np.mean(auc)
se_auc = sem(auc)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Basic Matrix... Mon Jul 27 21:31:48 2020
Start Hamming su categorical...
Start Euclidean...
Adding pubchem2d... Mon Jul 27 21:33:11 2020
Start Hamming su Pubchem2d...
Mon Jul 27 21:43:15 2020
Mon Jul 27 22:02:40 2020 

Accuracy: 0.9042111523534808, se: 0.0030947805626731096
RMSE:     0.30933465415307415, se: 0.005022932074924381
AUC:      0.8960310545901568, se: 0.0036930020389074752


## Ricerco alpha per pubchem2d

In [2]:
c = [0,1,0]
pub = np.linspace(0.001, 0.01, 30) 
a_ham = 0.02

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params_new(X_train, y_train, categorical, non_categorical,
                                                    sequence_pub = pub, a_ham = a_ham, choice = c, ks = [3],
                                                    leaf_size = range(60,90,10))

Mon Jul 27 22:27:54 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...

 Mon Jul 27 22:28:12 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.001
Mon Jul 27 22:32:06 2020
Start CV...
New best params found! alpha:0.001, k:3, leaf:60,
                                                        acc:  0.8876799999999999, st.error:  0.0011222833866720054,
                                                        rmse: 0.3351249984881434, st.error:  0.001675961539076935

 Mon Jul 27 22:32:50 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.001310344827586207
Mon Jul 27 22:36:48 2020
Start CV...
New best params found! alpha:0.001310344827586207, k:3, leaf:60,
                                                        acc:  0.8880000000000001, st.error:  0.0031435012327021643,
                                                        rmse: 0.334530687010275, st.error:  0.004722802357536543
New best params found! alpha:0.001310344827586207, k:3, leaf:70,
                                       

In [3]:
categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_alpha = 0.0034827586206896553 
best_k = 3
best_leaf = 60

len_X_train = len(X_train) 

X_try = X_train.append(X_test)
print(ctime())
X_train_new, X_test_new = fast_dist_mat(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.02, 1,
                                                         best_alpha, 0], choice = [1,0])
print(ctime())


neigh123 = KNeighborsClassifier(metric = 'precomputed', n_neighbors = best_k, leaf_size = best_leaf)
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Tue Jul 28 01:03:20 2020
Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Tue Jul 28 01:17:03 2020
Model: KNeighborsClassifier(algorithm='auto', leaf_size=60, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform'), 
 Accuracy: 0.894374431596726,
 RMSE: 0.32500087446539894


## CV

In [2]:
from sklearn.metrics import roc_auc_score
from time import ctime

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

a_ham = 0.002 
a_pub = 0.0034827586206896553 
best_k = 3
best_leaf = 60

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical, a_ham)
print('Adding pubchem2d...', ctime())
dist_matr += a_pub * pubchem2d_matrix(X)
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True, random_state = 5647)
accs = []
rmse = []
auc = []
for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    auc.append(roc_auc_score(y_test, y_pred))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

avg_auc = np.mean(auc)
se_auc = sem(auc)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Basic Matrix... Tue Jul 28 02:10:09 2020
Start Hamming su categorical...
Start Euclidean...
Adding pubchem2d... Tue Jul 28 02:11:14 2020
Start Hamming su Pubchem2d...
Tue Jul 28 02:20:23 2020
Tue Jul 28 02:29:35 2020 

Accuracy: 0.9052827706783297, se: 0.0010420915607180581
RMSE:     0.307743068548686, se: 0.0016906420310792765
AUC:      0.896533109555841, se: 0.0008938055929670887
