# First Models

- **Categorical features**: conc1_type, exposure_type, control_type, media_type, application_freq_unit, class, tax_order, family, genus, species

- **Non Categorical features**: obs_duration_mean, conc1_mean, atom_number, alone_atom_number, bonds_number, doubleBond, tripleBond, ring_number, Mol, MorganDensity, LogP, oh_count

It turns out that *obs_duration_mean* has to be treated as a categorical feature in order to maximize the metrics.

## BINARY

In [1]:
from helper_knn import *

# Load the processed dataset with the binary target encoding and a fixed seed.
# The helper hands back six objects: X_try, the train/test feature and label
# splits, and len_X_train.
(X_try, X_train, X_test,
 y_train, y_test, len_X_train) = load_data_knn(
    'data/lc_db_processed.csv',
    encoding='binary',
    seed=42,
)

# Best-performing feature split: obs_duration_mean sits with the categorical
# features (compared through the categorical distance term).
categorical = [
    'class', 'tax_order', 'family', 'genus', 'species',
    'control_type', 'media_type', 'application_freq_unit',
    'exposure_type', 'conc1_type', 'obs_duration_mean',
]

# Numeric molecular descriptors.
non_categorical = [
    'ring_number', 'tripleBond', 'doubleBond', 'alone_atom_number', 'oh_count',
    'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP',
]

# Previous, worse-performing split (obs_duration_mean treated as non-categorical), kept for reference:
# categorical = ['class', 'tax_order', 'family', 'genus', 'species', 'control_type', 'media_type',
#                'application_freq_unit', 'exposure_type', 'conc1_type']

# non_categorical = ['ring_number', 'tripleBond',  'doubleBond', 'alone_atom_number', 'oh_count',
#                    'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP', 'obs_duration_mean']

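### Finding the best alpha_1 for the problem

Judging by the log output below, *alpha_1* weights the Hamming distance on the categorical features, *alpha_2* the Euclidean distance on the non-categorical features (kept fixed at 1), and *alpha_3* the Hamming distance on the PubChem2D fingerprints.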

START: alpha_1 = 0, alpha_2 = 1, alpha_3 = 0

END: alpha_1 = 0.0069519279617756054, alpha_2 = 1, alpha_3 = 0

In [2]:
# Mode selector forwarded to cv_params_new as `choice`.
# NOTE(review): [0, 0] presumably means "no other weight is fixed yet" -- confirm in helper_knn.
c = [0, 0]

# Candidate alpha_1 values: 20 log-spaced points between 1e-3 and 1e-1.
ham = np.logspace(-3, -1, num=20)

# Cross-validated search with k fixed to 1.
best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_ham=ham,
    choice=c,
    ks=[1],
)

Mon Sep 14 11:08:06 2020
START...
Computing Euclidean ...
Adding Hamming 1 (Categorical)... alpha = 0.001
Start CV...
New best params found! alpha:0.001, k:1, leaf:10,
                                                        acc:  0.8976980539183801, st.error:  0.0026286215760152913,
                                                        rmse: 0.31974227013083456, st.error:  0.004087382206260621
New best params found! alpha:0.001, k:1, leaf:40,
                                                        acc:  0.8984766658275387, st.error:  0.0013155183512758776,
                                                        rmse: 0.3186007056220668, st.error:  0.0020569728718740753
New best params found! alpha:0.001, k:1, leaf:80,
                                                        acc:  0.8997000774068564, st.error:  0.001976147553967034,
                                                        rmse: 0.3166398848354725, st.error:  0.003126736498423271
Adding Hamming 1 (Categorical)... alpha =

### Finding the best alpha_3, fixing best_alpha_1
START: alpha_1 = 0.0069519279617756054, alpha_2 = 1, alpha_3 = 0

END: alpha_1 = 0.0069519279617756054, alpha_2 = 1, alpha_3 = 0.0069519279617756054

In [2]:
# Mode selector forwarded to cv_params_new as `choice`.
# NOTE(review): [0, 1] presumably selects the sweep over the pubchem2d weight -- confirm in helper_knn.
c = [0, 1]

# alpha_1 is frozen at the best value found by the previous sweep.
al_ham = 0.0069519279617756054

# Candidate alpha_3 values: 20 log-spaced points between 1e-3 and 1e-1.
pub = np.logspace(-3, -1, num=20)

# Cross-validated search with k fixed to 1.
best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_pub=pub,
    a_ham=al_ham,
    choice=c,
    ks=[1],
)

Mon Sep 14 12:29:06 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...
Adding Hamming 3 (Pubchem2d)... alpha = 0.001
Start CV...
New best params found! alpha:0.001, k:1, leaf:10,
                                                        acc:  0.8977536249017346, st.error:  0.002539752081748882,
                                                        rmse: 0.3196624298760987, st.error:  0.003946708247816731
New best params found! alpha:0.001, k:1, leaf:40,
                                                        acc:  0.8985878387013575, st.error:  0.0013189399299914412,
                                                        rmse: 0.31842614944471437, st.error:  0.0020584368131552197
New best params found! alpha:0.001, k:1, leaf:80,
                                                        acc:  0.8998112657342301, st.error:  0.0019271779321472723,
                                                        rmse: 0.3164673314241388, st.error:  0.0030480488453350613
Adding Hammin

### Finding again the best alpha_1, fixing best_alpha_3

START: alpha_1 = 0.0069519279617756054, alpha_2 = 1, alpha_3 = 0.0069519279617756054

END: alpha_1 = 0.009473684210526315, alpha_2 = 1, alpha_3 = 0.0069519279617756054

In [2]:
# Mode selector forwarded to cv_params_new as `choice`.
# NOTE(review): [1, 0] presumably selects the alpha_1 sweep with alpha_3 fixed -- confirm in helper_knn.
c = [1, 0]

# alpha_3 is frozen at the best value found by the previous sweep.
al_pub = 0.0069519279617756054

# Finer, linearly spaced grid for alpha_1: 20 points in [0.005, 0.01].
ham = np.linspace(0.005, 0.01, num=20)

# Cross-validated search with k fixed to 1.
best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_ham=ham,
    a_pub=al_pub,
    choice=c,
    ks=[1],
)

Mon Sep 14 16:51:44 2020
START...
Computing Euclidean and Pubchem2d Matrix...
Adding Hamming 1 (Categorical)... alpha = 0.005
Start CV...
New best params found! alpha:0.005, k:1, leaf:10,
                                                        acc:  0.8245314057187116, st.error:  0.005742672577797003,
                                                        rmse: 0.4186730767961923, st.error:  0.006735151210219632
New best params found! alpha:0.005, k:1, leaf:30,
                                                        acc:  0.827031914527006, st.error:  0.0024800733461741212,
                                                        rmse: 0.4158523496638812, st.error:  0.0029541814442851045
Adding Hamming 1 (Categorical)... alpha = 0.005263157894736842
Start CV...
Adding Hamming 1 (Categorical)... alpha = 0.005526315789473685
Start CV...
Adding Hamming 1 (Categorical)... alpha = 0.005789473684210527
Start CV...
New best params found! alpha:0.005789473684210527, k:1, leaf:30,
             

### Finding again the best alpha_3, fixing best_alpha_1

START: alpha_1 = 0.009473684210526315, alpha_2 = 1, alpha_3 = 0.0069519279617756054

END: alpha_1 = 0.009473684210526315, alpha_2 = 1, alpha_3 = 0.007105263157894737

In [2]:
# Mode selector forwarded to cv_params_new as `choice`.
# NOTE(review): [0, 1] presumably selects the sweep over the pubchem2d weight -- confirm in helper_knn.
c = [0, 1]

# alpha_1 is frozen at the refined value found by the previous sweep.
al_ham = 0.009473684210526315

# Finer, linearly spaced grid for alpha_3: 20 points in [0.005, 0.01].
pub = np.linspace(0.005, 0.01, num=20)

# Cross-validated search with k fixed to 1.
best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_pub=pub,
    a_ham=al_ham,
    choice=c,
    ks=[1],
)

Mon Sep 14 21:28:13 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...
Adding Hamming 3 (Pubchem2d)... alpha = 0.005
Start CV...
New best params found! alpha:0.005, k:1, leaf:10,
                                                        acc:  0.8977536249017346, st.error:  0.0025060535258592765,
                                                        rmse: 0.3196648161329612, st.error:  0.003898090075453123
New best params found! alpha:0.005, k:1, leaf:40,
                                                        acc:  0.898532267718003, st.error:  0.0014565415219307638,
                                                        rmse: 0.3185074989524941, st.error:  0.002275158951814106
New best params found! alpha:0.005, k:1, leaf:80,
                                                        acc:  0.899588858172373, st.error:  0.0019329533882988246,
                                                        rmse: 0.3168182050385376, st.error:  0.0030564188107250296
Adding Hamming 3

## Final model -- BINARY

In [2]:
# Stack the train and test labels into a single 1-D vector.
# NOTE(review): assumes the rows of X_try are ordered train-first, test-second -- confirm in load_data_knn.
y = np.append(y_train, y_test)

# The individual splits are no longer needed; release them before building the
# full distance matrix. (This cell cannot be re-run after this point without
# re-running the loading cell.)
del X_train, X_test, y_train, y_test

# Hyper-parameters selected by the cross-validation sweeps.
a_ham, a_pub = 0.009473684210526315, 0.007105263157894737
best_k, best_leaf = 1, 40

# Base matrix from the categorical / non-categorical features, weighted by a_ham.
print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X_try, categorical, non_categorical, a_ham)

# Add the pubchem2d term in place, scaled by a_pub.
print('Adding pubchem2d', ctime())
dist_matr += a_pub * pubchem2d_matrix(X_try)

# Wrap in a DataFrame so folds can be sliced with .iloc.
dist_matr = pd.DataFrame(dist_matr)
print(ctime())

Basic Matrix... Tue Sep 15 01:06:53 2020
Adding pubchem2d Tue Sep 15 01:09:32 2020
Tue Sep 15 01:22:20 2020


In [6]:
# 5-fold cross-validation of the final KNN model on the precomputed distance matrix.
kf = KFold(n_splits=5, shuffle=True, random_state=5645)

# Per-fold metric collectors.
accs, rmse, sens, precs, specs = [], [], [], [], []

for train_index, test_index in kf.split(dist_matr):

    # With metric='precomputed' the classifier is fitted on train-vs-train
    # distances and queried with test-vs-train distances.
    X_train = dist_matr.iloc[train_index, train_index]
    X_test = dist_matr.iloc[test_index, train_index]
    y_train, y_test = y[train_index], y[test_index]

    # NOTE(review): with a precomputed metric scikit-learn is expected to fall
    # back to brute-force search, in which case leaf_size has no effect -- confirm.
    neigh = KNeighborsClassifier(
        n_neighbors=best_k,
        metric='precomputed',
        leaf_size=best_leaf,
        n_jobs=-2,
    )
    neigh.fit(X_train, y_train.ravel())
    y_pred = neigh.predict(X_test)

    # Binary confusion matrix flattened as (tn, fp, fn, tp).
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    accs.append(accuracy_score(y_test, y_pred))
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    sens.append(recall_score(y_test, y_pred))
    precs.append(precision_score(y_test, y_pred))
    specs.append(tn / (tn + fp))  # specificity = true-negative rate

    # Drop the fold slices before the next iteration.
    del X_train, X_test, y_train, y_test

# Mean and standard error of each metric across the folds.
avg_acc, se_acc = np.mean(accs), sem(accs)
avg_rmse, se_rmse = np.mean(rmse), sem(rmse)
avg_sens, se_sens = np.mean(sens), sem(sens)
avg_precs, se_precs = np.mean(precs), sem(precs)
avg_specs, se_specs = np.mean(specs), sem(specs)

print('''Accuracy: \t {}, se: {}
RMSE: \t\t {}, se: {}
Sensitivity: \t {}, se: {}
Precision: \t {}, se: {}
Specificity: \t {}, se: {}'''.format(
    avg_acc, se_acc,
    avg_rmse, se_rmse,
    avg_sens, se_sens,
    avg_precs, se_precs,
    avg_specs, se_specs,
))

Accuracy: 	 0.9105265658811724, se: 0.001596093283967252
RMSE: 		 0.29907364869243597, se: 0.0026639621159533205
Sensitivity: 	 0.9305930270523051, se: 0.0025662688577057476
Precision: 	 0.9257884455755866, se: 0.0008814897322470156
Specificity: 	 0.877670405685049, se: 0.0016028283693755635
