# First Models

- **Categorical features**: conc1_type, exposure_type, control_type, media_type, application_freq_unit, class, tax_order, family, genus, species

- **Non Categorical features**: obs_duration_mean, conc1_mean, atom_number, alone_atom_number, bonds_number, doubleBond, tripleBond, ring_number, Mol, MorganDensity, LogP, oh_count

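It turns out that *obs_duration_mean* has to be treated as a categorical feature in order to maximize the metrics (accuracy ≈ 0.91 versus ≈ 0.86 when it is treated as numerical; see the experiments at the end of the BINARY section).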

## BINARY

In [1]:
from helper_knn import *

# Load the pre-processed dataset with a binary target. seed=42 is forwarded to the loader
# (presumably it fixes the train/test split -- confirm in helper_knn.load_data_knn).
(X_try, X_train, X_test,
 y_train, y_test, len_X_train) = load_data_knn('data/lc_db_processed.csv', encoding='binary', seed=42)

# Feature split defined for the cells that follow: 'obs_duration_mean' is numerical here.
# The split the author labelled "Best combination" differs only in that 'obs_duration_mean'
# is appended to the categorical list and removed from the numerical one.
categorical = (
    ['class', 'tax_order', 'family', 'genus', 'species']            # taxonomy
    + ['control_type', 'media_type', 'application_freq_unit',       # experiment set-up
       'exposure_type', 'conc1_type']
)

non_categorical = (
    ['ring_number', 'tripleBond', 'doubleBond', 'alone_atom_number', 'oh_count']  # discrete counts
    + ['atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']             # molecular descriptors
    + ['obs_duration_mean']
)

### Finding the best alpha_1 for the problem

START: alpha_1 = 0, alpha_2 = 1, alpha_3 = 0

END: alpha_1 = 0.007880462815669913, alpha_2 = 1, alpha_3 = 0

In [10]:
# Stage 1: scan the weight of the categorical Hamming term (alpha_1) while the
# PubChem2D weight stays at 0 (see the START/END values stated above this cell).
ham = np.logspace(-3, -1, num=30)   # 30 log-spaced candidates from 1e-3 to 1e-1
c = [0, 0]                          # search-mode flags, forwarded as `choice`

best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_ham=ham,
    choice=c,
    ks=[1],
)

Wed Sep  9 17:10:11 2020
START...
Computing Euclidean ...
Adding Hamming 1 (Categorical)... alpha = 0.001
Start CV...
New best params found! alpha:0.001, k:1, leaf:10,
                                                        acc:  0.849216296144508, st.error:  0.004163826444549491,
                                                        rmse: 0.38815851135308044, st.error:  0.005400785120065688
New best params found! alpha:0.001, k:1, leaf:40,
                                                        acc:  0.8493273762969975, st.error:  0.0028768087742139,
                                                        rmse: 0.38809394752254917, st.error:  0.003732010161002568
New best params found! alpha:0.001, k:1, leaf:50,
                                                        acc:  0.8512734270096924, st.error:  0.0025523525709176295,
                                                        rmse: 0.38559334997435263, st.error:  0.0033294686461991735
Adding Hamming 1 (Categorical)... alpha = 0

### Finding the best alpha_3, fixing best_alpha_1
START: alpha_1 = 0.007880462815669913, alpha_2 = 1, alpha_3 = 0

END: alpha_1 = 0.007880462815669913, alpha_2 = 1, alpha_3 = 0.007880462815669913

In [2]:
# Stage 2: hold the categorical Hamming weight at the value found in stage 1 and
# scan the PubChem2D weight (alpha_3) over a log-spaced grid.
al_ham = 0.007880462815669913       # alpha_1 reported as END value of the previous search
pub = np.logspace(-3, -1, num=30)   # 30 log-spaced candidates from 1e-3 to 1e-1
c = [0, 1]                          # search-mode flags, forwarded as `choice`

best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_pub=pub,
    a_ham=al_ham,
    choice=c,
    ks=[1],
)

Wed Sep  9 22:27:51 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...
Adding Hamming 3 (Pubchem2d)... alpha = 0.001
Start CV...
New best params found! alpha:0.001, k:1, leaf:10,
                                                        acc:  0.8499946917038981, st.error:  0.004072696390337732,
                                                        rmse: 0.3871599836040902, st.error:  0.00530224932888054
New best params found! alpha:0.001, k:1, leaf:50,
                                                        acc:  0.8514958500251044, st.error:  0.002467863015571247,
                                                        rmse: 0.385308158201802, st.error:  0.0032316094299440785
Adding Hamming 3 (Pubchem2d)... alpha = 0.0011721022975334804
Start CV...
Adding Hamming 3 (Pubchem2d)... alpha = 0.0013738237958832624
Start CV...
Adding Hamming 3 (Pubchem2d)... alpha = 0.0016102620275609393
Start CV...
New best params found! alpha:0.0016102620275609393, k:1, leaf:30,
          

### Finding again the best alpha_1, fixing best_alpha_3

START: alpha_1 = 0.007880462815669913, alpha_2 = 1, alpha_3 = 0.007880462815669913

END: alpha_1 = 0.008620689655172414, alpha_2 = 1, alpha_3 = 0.007880462815669913

In [2]:
# Stage 3: refine alpha_1 on a narrow linear grid around its previous optimum,
# with the PubChem2D weight held at the value found in stage 2.
al_pub = 0.007880462815669913            # alpha_3 reported as END value of the previous search
ham = np.linspace(0.005, 0.01, num=30)   # 30 evenly spaced candidates in [0.005, 0.01]
c = [1, 0]                               # search-mode flags, forwarded as `choice`

best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_ham=ham,
    a_pub=al_pub,
    choice=c,
    ks=[1],
)

Thu Sep 10 11:11:07 2020
START...
Computing Euclidean and Pubchem2d Matrix...
Adding Hamming 1 (Categorical)... alpha = 0.005
Start CV...
New best params found! alpha:0.005, k:1, leaf:10,
                                                        acc:  0.8119654557415596, st.error:  0.0036428205032030276,
                                                        rmse: 0.4335484929433071, st.error:  0.004190719658587571
New best params found! alpha:0.005, k:1, leaf:30,
                                                        acc:  0.8126320757384906, st.error:  0.0029976566349513394,
                                                        rmse: 0.43280549797132634, st.error:  0.0034396652199607976
New best params found! alpha:0.005, k:1, leaf:50,
                                                        acc:  0.8143563524850785, st.error:  0.004578153158321574,
                                                        rmse: 0.4307330207814454, st.error:  0.00530830301067345
Adding Hamming 1 (Cate

### Finding again the best alpha_3, fixing best_alpha_1

START: alpha_1 = 0.008620689655172414, alpha_2 = 1, alpha_3 = 0.007880462815669913

END: alpha_1 = 0.008620689655172414, alpha_2 = 1, alpha_3 = 0.008736842105263157

In [2]:
# Stage 4: refine alpha_3 on a narrow linear grid, with the categorical Hamming
# weight held at the refined alpha_1 from stage 3.
al_ham = 0.008620689655172414            # alpha_1 reported as END value of the previous search
pub = np.linspace(0.006, 0.01, num=20)   # 20 evenly spaced candidates in [0.006, 0.01]
c = [0, 1]                               # search-mode flags, forwarded as `choice`

best_acc, best_alpha, best_k, best_leaf = cv_params_new(
    X_train, y_train, categorical, non_categorical,
    sequence_pub=pub,
    a_ham=al_ham,
    choice=c,
    ks=[1],
)

Thu Sep 10 16:38:21 2020
START...
Computing Basic Matrix: Hamming 1 and Euclidean 2...
Adding Hamming 3 (Pubchem2d)... alpha = 0.006
Start CV...
New best params found! alpha:0.006, k:1, leaf:10,
                                                        acc:  0.850161481921736, st.error:  0.004119366050285759,
                                                        rmse: 0.3869410208922664, st.error:  0.005365734551638392
New best params found! alpha:0.006, k:1, leaf:50,
                                                        acc:  0.8515514519155687, st.error:  0.002471883245055704,
                                                        rmse: 0.3852358349083632, st.error:  0.003236494511341651
Adding Hamming 3 (Pubchem2d)... alpha = 0.006210526315789474
Start CV...
New best params found! alpha:0.006210526315789474, k:1, leaf:20,
                                                        acc:  0.8516064974780571, st.error:  0.0021877007622684587,
                                            

## Final model -- BINARY

In [5]:
# Label vector for the whole dataset (train labels followed by test labels).
# Assumes the rows of X_try follow the same order -- TODO confirm in helper_knn.load_data_knn.
# Guarded so the cell can be executed more than once: the split arrays are removed right
# after y is built, so an unconditional np.append would raise NameError on a re-run.
if 'y_train' in globals() and 'y_test' in globals():
    y = np.append(y_train, y_test)
    # Drop the split objects (presumably to free memory before the large matrix is built);
    # pop() tolerates names that are already gone.
    for _split_name in ('X_train', 'X_test', 'y_train', 'y_test'):
        globals().pop(_split_name, None)
elif 'y' not in globals():
    raise NameError('y_train / y_test are not defined: run the data-loading cell first')

# Hyper-parameters fixed after the alpha searches above
a_ham = 0.008620689655172414   # alpha_1: weight of the categorical Hamming term
a_pub = 0.008736842105263157   # alpha_3: weight of the PubChem2D term
best_k = 1
best_leaf = 10

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X_try, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
dist_matr += a_pub * pubchem2d_matrix(X_try)
dist_matr = pd.DataFrame(dist_matr)
# The CV cell indexes dist_matr and y with the same positions, so they must have equal length.
assert len(dist_matr) == len(y), 'distance matrix and label vector must describe the same rows'
print(ctime())

Basic Matrix... Fri Sep 11 09:28:15 2020
Adding pubchem2d Fri Sep 11 09:53:22 2020
Fri Sep 11 10:17:16 2020


In [7]:
# 5-fold cross-validation of a KNN classifier on the precomputed distance matrix.
kf = KFold(n_splits=5, shuffle=True, random_state=5645)
accs, rmses, aucs = [], [], []   # per-fold scores

for fit_idx, eval_idx in kf.split(dist_matr):
    # Fold-local names, so the loop never shadows (or deletes) the dataset-level
    # X_train / y_train split.
    # Rows select the samples to fit / score; columns are always the fitted samples.
    D_fit = dist_matr.iloc[fit_idx, fit_idx]
    D_eval = dist_matr.iloc[eval_idx, fit_idx]
    y_fit = y[fit_idx]
    y_eval = y[eval_idx]

    # NOTE(review): leaf_size only matters for tree-based neighbour search; it is
    # presumably ignored with metric='precomputed' -- confirm in the scikit-learn docs.
    neigh = KNeighborsClassifier(metric='precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(D_fit, y_fit.ravel())
    y_pred = neigh.predict(D_eval)
    # ROC AUC is a ranking metric: feed it the vote share of the larger label rather than
    # the hard prediction. For best_k == 1 the two coincide, so reported values are unchanged;
    # for best_k > 1 this keeps the AUC meaningful.
    y_score = neigh.predict_proba(D_eval)[:, 1]

    accs.append(accuracy_score(y_eval, y_pred))
    rmses.append(sqrt(mean_squared_error(y_eval, y_pred)))
    aucs.append(roc_auc_score(y_eval, y_score))

    del D_fit, D_eval, y_fit, y_eval

# Mean and standard error of each metric across the folds
avg_acc, se_acc = np.mean(accs), sem(accs)
avg_rmse, se_rmse = np.mean(rmses), sem(rmses)
avg_auc, se_auc = np.mean(aucs), sem(aucs)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Fri Sep 11 12:17:16 2020 

Accuracy: 0.8610594581695803, se: 0.001453875491295029
RMSE:     0.37272693346224806, se: 0.0019477488447283864
AUC:      0.8521153519758673, se: 0.0017759533977454501


# CHANGING CATEGORICAL VARIABLES

In [2]:
# Experiment: the discrete structure counts and obs_duration_mean are treated as categorical.
categorical = ['ring_number', 'tripleBond', 'obs_duration_mean', 'doubleBond', 'alone_atom_number', 'oh_count',
               'class', 'tax_order', 'family', 'genus', 'species', 'control_type', 'media_type',
               'application_freq_unit', 'exposure_type', 'conc1_type']

non_categorical = ['atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

# Label vector for the whole dataset (train labels followed by test labels).
# Earlier sections delete y_train / y_test once y has been built, so on a top-to-bottom run
# they no longer exist here and an unconditional np.append raises NameError. Rebuild y only
# when the split arrays are available (fresh kernel + data-loading cell); otherwise reuse y.
if 'y_train' in globals() and 'y_test' in globals():
    y = np.append(y_train, y_test)
    # Drop the split objects (presumably to free memory before the large matrix is built);
    # pop() tolerates names that are already gone.
    for _split_name in ('X_train', 'X_test', 'y_train', 'y_test'):
        globals().pop(_split_name, None)
elif 'y' not in globals():
    raise NameError('y_train / y_test are not defined: run the data-loading cell first')

# Hyper-parameters fixed after the alpha searches above
a_ham = 0.008620689655172414   # alpha_1: weight of the categorical Hamming term
a_pub = 0.008736842105263157   # alpha_3: weight of the PubChem2D term
best_k = 1
best_leaf = 10

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X_try, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
dist_matr += a_pub * pubchem2d_matrix(X_try)
dist_matr = pd.DataFrame(dist_matr)
# The CV cell indexes dist_matr and y with the same positions, so they must have equal length.
assert len(dist_matr) == len(y), 'distance matrix and label vector must describe the same rows'
print(ctime())

Basic Matrix... Fri Sep 11 12:33:00 2020
Adding pubchem2d Fri Sep 11 13:12:42 2020
Fri Sep 11 13:40:18 2020


In [3]:
# 5-fold cross-validation of a KNN classifier on the precomputed distance matrix.
print(ctime())
kf = KFold(n_splits=5, shuffle=True, random_state=5645)
accs, rmses, aucs = [], [], []   # per-fold scores

for fit_idx, eval_idx in kf.split(dist_matr):
    # Fold-local names, so the loop never shadows (or deletes) the dataset-level
    # X_train / y_train split.
    # Rows select the samples to fit / score; columns are always the fitted samples.
    D_fit = dist_matr.iloc[fit_idx, fit_idx]
    D_eval = dist_matr.iloc[eval_idx, fit_idx]
    y_fit = y[fit_idx]
    y_eval = y[eval_idx]

    # NOTE(review): leaf_size only matters for tree-based neighbour search; it is
    # presumably ignored with metric='precomputed' -- confirm in the scikit-learn docs.
    neigh = KNeighborsClassifier(metric='precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(D_fit, y_fit.ravel())
    y_pred = neigh.predict(D_eval)
    # ROC AUC is a ranking metric: feed it the vote share of the larger label rather than
    # the hard prediction. For best_k == 1 the two coincide, so reported values are unchanged;
    # for best_k > 1 this keeps the AUC meaningful.
    y_score = neigh.predict_proba(D_eval)[:, 1]

    accs.append(accuracy_score(y_eval, y_pred))
    rmses.append(sqrt(mean_squared_error(y_eval, y_pred)))
    aucs.append(roc_auc_score(y_eval, y_score))

    del D_fit, D_eval, y_fit, y_eval

# Mean and standard error of each metric across the folds
avg_acc, se_acc = np.mean(accs), sem(accs)
avg_rmse, se_rmse = np.mean(rmses), sem(rmses)
avg_auc, se_auc = np.mean(aucs), sem(aucs)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Fri Sep 11 13:46:30 2020
Fri Sep 11 15:28:52 2020 

Accuracy: 0.9107873636952324, se: 0.0016185189312741134
RMSE:     0.29863626989204634, se: 0.0026932605579760697
AUC:      0.9045684395646552, se: 0.0014456097816304332


# Moving obs_duration_mean from categorical to non categorical

In [2]:
# Experiment: discrete structure counts stay categorical, obs_duration_mean becomes numerical.
categorical = ['ring_number', 'tripleBond', 'doubleBond', 'alone_atom_number', 'oh_count',
               'class', 'tax_order', 'family', 'genus', 'species', 'control_type', 'media_type',
               'application_freq_unit', 'exposure_type', 'conc1_type']

non_categorical = ['atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP', 'obs_duration_mean']

# Label vector for the whole dataset (train labels followed by test labels).
# Earlier sections delete y_train / y_test once y has been built, so on a top-to-bottom run
# they no longer exist here and an unconditional np.append raises NameError. Rebuild y only
# when the split arrays are available (fresh kernel + data-loading cell); otherwise reuse y.
if 'y_train' in globals() and 'y_test' in globals():
    y = np.append(y_train, y_test)
    # Drop the split objects (presumably to free memory before the large matrix is built);
    # pop() tolerates names that are already gone.
    for _split_name in ('X_train', 'X_test', 'y_train', 'y_test'):
        globals().pop(_split_name, None)
elif 'y' not in globals():
    raise NameError('y_train / y_test are not defined: run the data-loading cell first')

# Hyper-parameters fixed after the alpha searches above
a_ham = 0.008620689655172414   # alpha_1: weight of the categorical Hamming term
a_pub = 0.008736842105263157   # alpha_3: weight of the PubChem2D term
best_k = 1
best_leaf = 10

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X_try, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
dist_matr += a_pub * pubchem2d_matrix(X_try)
dist_matr = pd.DataFrame(dist_matr)
# The CV cell indexes dist_matr and y with the same positions, so they must have equal length.
assert len(dist_matr) == len(y), 'distance matrix and label vector must describe the same rows'
print(ctime())

Basic Matrix... Sun Sep 13 17:35:44 2020
Adding pubchem2d Sun Sep 13 17:38:52 2020
Sun Sep 13 17:52:37 2020


In [12]:
# 5-fold cross-validation of a KNN classifier on the precomputed distance matrix.
print(ctime())
kf = KFold(n_splits=5, shuffle=True, random_state=5645)
accs, rmses, aucs = [], [], []   # per-fold scores

for fit_idx, eval_idx in kf.split(dist_matr):
    # Fold-local names, so the loop never shadows (or deletes) the dataset-level
    # X_train / y_train split.
    # Rows select the samples to fit / score; columns are always the fitted samples.
    D_fit = dist_matr.iloc[fit_idx, fit_idx]
    D_eval = dist_matr.iloc[eval_idx, fit_idx]
    y_fit = y[fit_idx]
    y_eval = y[eval_idx]

    # NOTE(review): leaf_size only matters for tree-based neighbour search; it is
    # presumably ignored with metric='precomputed' -- confirm in the scikit-learn docs.
    neigh = KNeighborsClassifier(metric='precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(D_fit, y_fit.ravel())
    y_pred = neigh.predict(D_eval)
    # ROC AUC is a ranking metric: feed it the vote share of the larger label rather than
    # the hard prediction. For best_k == 1 the two coincide, so reported values are unchanged;
    # for best_k > 1 this keeps the AUC meaningful.
    y_score = neigh.predict_proba(D_eval)[:, 1]

    accs.append(accuracy_score(y_eval, y_pred))
    rmses.append(sqrt(mean_squared_error(y_eval, y_pred)))
    aucs.append(roc_auc_score(y_eval, y_score))

    del D_fit, D_eval, y_fit, y_eval

# Mean and standard error of each metric across the folds
avg_acc, se_acc = np.mean(accs), sem(accs)
avg_rmse, se_rmse = np.mean(rmses), sem(rmses)
avg_auc, se_auc = np.mean(aucs), sem(aucs)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Sun Sep 13 17:59:30 2020
Sun Sep 13 18:06:47 2020 

Accuracy: 0.8604262624980361, se: 0.0016865521320462175
RMSE:     0.37356845626223895, se: 0.002255326351751561
AUC:      0.8515386950848172, se: 0.0018082567850603412


# Moving the discrete numerical variables to non-categorical, except for obs_duration_mean

In [2]:
# Experiment: discrete structure counts are numerical, obs_duration_mean is categorical.
categorical = ['class', 'tax_order', 'family', 'genus', 'species', 'control_type', 'media_type',
               'application_freq_unit', 'exposure_type', 'conc1_type', 'obs_duration_mean']

non_categorical = ['ring_number', 'tripleBond', 'doubleBond', 'alone_atom_number', 'oh_count',
                   'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

# Label vector for the whole dataset (train labels followed by test labels).
# Earlier sections delete y_train / y_test once y has been built, so on a top-to-bottom run
# they no longer exist here and an unconditional np.append raises NameError. Rebuild y only
# when the split arrays are available (fresh kernel + data-loading cell); otherwise reuse y.
if 'y_train' in globals() and 'y_test' in globals():
    y = np.append(y_train, y_test)
    # Drop the split objects (presumably to free memory before the large matrix is built);
    # pop() tolerates names that are already gone.
    for _split_name in ('X_train', 'X_test', 'y_train', 'y_test'):
        globals().pop(_split_name, None)
elif 'y' not in globals():
    raise NameError('y_train / y_test are not defined: run the data-loading cell first')

# Hyper-parameters fixed after the alpha searches above
a_ham = 0.008620689655172414   # alpha_1: weight of the categorical Hamming term
a_pub = 0.008736842105263157   # alpha_3: weight of the PubChem2D term
best_k = 1
best_leaf = 10

print('Basic Matrix...', ctime())
dist_matr = basic_matrix(X_try, categorical, non_categorical, a_ham)
print('Adding pubchem2d', ctime())
dist_matr += a_pub * pubchem2d_matrix(X_try)
dist_matr = pd.DataFrame(dist_matr)
# The CV cell indexes dist_matr and y with the same positions, so they must have equal length.
assert len(dist_matr) == len(y), 'distance matrix and label vector must describe the same rows'
print(ctime())

Basic Matrix... Sun Sep 13 18:13:40 2020
Adding pubchem2d Sun Sep 13 18:16:33 2020
Sun Sep 13 18:34:21 2020


In [3]:
# 5-fold cross-validation of a KNN classifier on the precomputed distance matrix.
print(ctime())
kf = KFold(n_splits=5, shuffle=True, random_state=5645)
accs, rmses, aucs = [], [], []   # per-fold scores

for fit_idx, eval_idx in kf.split(dist_matr):
    # Fold-local names, so the loop never shadows (or deletes) the dataset-level
    # X_train / y_train split.
    # Rows select the samples to fit / score; columns are always the fitted samples.
    D_fit = dist_matr.iloc[fit_idx, fit_idx]
    D_eval = dist_matr.iloc[eval_idx, fit_idx]
    y_fit = y[fit_idx]
    y_eval = y[eval_idx]

    # NOTE(review): leaf_size only matters for tree-based neighbour search; it is
    # presumably ignored with metric='precomputed' -- confirm in the scikit-learn docs.
    neigh = KNeighborsClassifier(metric='precomputed',
                                 n_neighbors=best_k, n_jobs=-2,
                                 leaf_size=best_leaf)
    neigh.fit(D_fit, y_fit.ravel())
    y_pred = neigh.predict(D_eval)
    # ROC AUC is a ranking metric: feed it the vote share of the larger label rather than
    # the hard prediction. For best_k == 1 the two coincide, so reported values are unchanged;
    # for best_k > 1 this keeps the AUC meaningful.
    y_score = neigh.predict_proba(D_eval)[:, 1]

    accs.append(accuracy_score(y_eval, y_pred))
    rmses.append(sqrt(mean_squared_error(y_eval, y_pred)))
    aucs.append(roc_auc_score(y_eval, y_score))

    del D_fit, D_eval, y_fit, y_eval

# Mean and standard error of each metric across the folds
avg_acc, se_acc = np.mean(accs), sem(accs)
avg_rmse, se_rmse = np.mean(rmses), sem(rmses)
avg_auc, se_auc = np.mean(aucs), sem(aucs)

print(ctime(), '\n')
print('''Accuracy: {}, se: {}
RMSE:     {}, se: {}
AUC:      {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_auc, se_auc))

Sun Sep 13 18:34:39 2020
Sun Sep 13 18:40:42 2020 

Accuracy: 0.9105265658811724, se: 0.0016069239189876142
RMSE:     0.29907298001784177, se: 0.00268266388479646
AUC:      0.9041507029090183, se: 0.001539848447870826


# MULTICLASS