In [None]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from sequence_distance_matrix import *
from mem_distance_matrix import *

import time
from time import ctime

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

db = pd.read_csv('dataset/db_modelli_smiles_pubchem.csv').drop(columns = 'Unnamed: 0')
# codifica
tm = multiclass(db['conc1_mean'].copy())
db['conc1_mean'] = tm

db = db.drop(columns = 'test_cas')

# Ordinal Encoding
encoder = OrdinalEncoder(dtype = int)

encoder.fit(db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']])

db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']] = encoder.transform(
    db[['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']])+1

# Divido il dataset completo tra dati e target
X = db.drop(columns = 'conc1_mean')
y = db['conc1_mean'].values

# splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

del db, tm, encoder

# K = 1
Cerco l'alpha ottimale per pubchem2d, tenendo il valore di hamming su categorical fisso a 0.004281332398719396 (ottimale lavoro precedente)

In [2]:
c = [0,1,0]
pub = np.logspace(-4, -3, 30)

a_ham = 0.004281332398719396

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params_new(X_train, y_train, categorical, non_categorical,
                                                        sequence_pub = pub, a_ham = a_ham,
                                                        choice = c, ks = [1], leaf_size = range(60,91,10))

Tue Jul 28 13:29:44 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...

 Tue Jul 28 13:30:01 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.0001
Tue Jul 28 13:33:55 2020
Start CV...
New best params found! alpha:0.0001, k:1, leaf:60,
                                                        acc:  0.7154560000000001, st.error:  0.005603564579800967,
                                                        rmse: 0.7210867960815011, st.error:  0.01471251607404304
New best params found! alpha:0.0001, k:1, leaf:70,
                                                        acc:  0.7221119999999999, st.error:  0.001707133269548689,
                                                        rmse: 0.7107039877271717, st.error:  0.006705255936506631

 Tue Jul 28 13:34:52 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.00010826367338740541
Tue Jul 28 13:38:45 2020
Start CV...

 Tue Jul 28 13:39:40 2020
Adding Hamming 3 (Pubchem2d)... alpha = 0.00011721022975334806
Tue Jul 28 13:43:48 2020
S

In [3]:
from mem_distance_matrix import *

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

# Risultati
best_alpha = 0.0005298316906283707 
best_k = 1
best_leaf = 90

len_X_train = len(X_train) 

X_try = X_train.append(X_test)
print(ctime())
X_train_new, X_test_new = fast_dist_mat(X_try, len_X_train, categorical, non_categorical,
                                               alphas = [0.004281332398719396, 1,
                                                         best_alpha, 0], choice = [1,0])
print(ctime())

# con leaf_size = 90 è lo stesso... ho verificato
neigh123 = KNeighborsClassifier(metric = 'precomputed', n_neighbors = best_k, leaf_size = best_leaf)
neigh123.fit(X_train_new, y_train.ravel())
y_pred = neigh123.predict(X_test_new)
print('Model: {}, \n Accuracy: {},\n RMSE: {}'.format(neigh123,
                                                      accuracy_score(y_test, y_pred),
                                                      sqrt(mean_squared_error(y_test, y_pred))))

Tue Jul 28 19:12:03 2020
Start Hamming su categorical...
Start Euclidean...
Start Hamming su Pubchem2d...
Tue Jul 28 19:24:06 2020
Model: KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='precomputed',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform'), 
 Accuracy: 0.7382096920878264,
 RMSE: 0.6668723427912094


## CV

In [2]:
a_ham = 0.004281332398719396
a_pub = 0.0005298316906283707 
best_k = 1
best_leaf = 90

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
time.sleep(60)
dist_matr += a_pub * pubchem2d_matrix(X)
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True, random_state = 5645)
accs = []
rmse = []

for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse))

Basic Matrix... Tue Jul 28 19:48:52 2020
Start Hamming su categorical...
Start Euclidean...
Adding pubchem2d Tue Jul 28 19:50:07 2020
Start Hamming su Pubchem2d...
Tue Jul 28 19:59:16 2020
Tue Jul 28 20:11:44 2020 



NameError: name 'avg_auc' is not defined

In [3]:
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse))

Accuracy: 0.7438894618698052, se: 0.004277975054143412
RMSE:     0.6610992443343583, se: 0.009298303262178686


## Ricerca per hamming 1

In [3]:
c = [1,0,0]
ham = np.linspace(0.001, 0.01, 30)

a_pub = 0.0005298316906283707

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

best_acc, best_alpha, best_k, best_leaf = cv_params_new(X_train, y_train, categorical, non_categorical,
                                                        sequence_ham = ham, a_pub = a_pub,
                                                        choice = c, ks = [1])

Tue Jul 28 20:30:58 2020
START...
Computing Euclidean and Pubchem2d Matrix...

 Tue Jul 28 20:35:01 2020
Adding Hamming 1 (Categorical)... alpha = 0.001
Tue Jul 28 20:39:00 2020
Start CV...
New best params found! alpha:0.001, k:1, leaf:30,
                                                        acc:  0.5808, st.error:  0.004772186081870644,
                                                        rmse: 0.9121180317979001, st.error:  0.007627189343473108
New best params found! alpha:0.001, k:1, leaf:40,
                                                        acc:  0.58144, st.error:  0.0035214542450527422,
                                                        rmse: 0.902410221458681, st.error:  0.007870708463890479

 Tue Jul 28 20:40:57 2020
Adding Hamming 1 (Categorical)... alpha = 0.001310344827586207
Tue Jul 28 20:45:02 2020
Start CV...
New best params found! alpha:0.001310344827586207, k:1, leaf:60,
                                                        acc:  0.581696, st.error:  

## CV

In [None]:
a_ham = 0.005965517241379311
a_pub = 0.0005298316906283707 
best_k = 1
best_leaf = 40

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
time.sleep(60)
dist_matr += a_pub * pubchem2d_matrix(X)
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

kf = KFold(n_splits=5, shuffle=True, random_state = 4565)
accs = []
rmse = []

for train_index, test_index in kf.split(dist_matr):

    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train = y[train_index]
    y_test = y[test_index]

    neigh = KNeighborsClassifier(metric = 'precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse))

Basic Matrix... Tue Jul 28 23:45:32 2020
Start Hamming su categorical...
Start Euclidean...
Adding pubchem2d Tue Jul 28 23:46:47 2020
Start Hamming su Pubchem2d...
