In [1]:
from sequence_distance_matrix import *
from import_data_encoded_knn import *
from sklearn.metrics import log_loss, recall_score, roc_auc_score, precision_score, accuracy_score, mean_squared_error
from math import sqrt
from time import ctime

X_try, X_train, X_test, y_train, y_test, len_X_train = import_data_encoded(encoding = 'binary')

categorical = ['ring_number', "exposure_type", "conc1_type","species",'tripleBond', 'obs_duration_mean', 'doubleBond',
    'alone_atom_number', 'class', 'tax_order', 'family', 'genus', 'oh_count']

non_categorical =[ 'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']

In [2]:
def cv_params_logloss(X,y, categorical, non_categorical, sequence_ham = [],
                  ks = [1], leaf_size = range(60, 91, 10)):

    np.random.seed(123)
    print(ctime())
    print('START...')
    best_accuracy = 0
    best_alpha = 0
    best_k = 0
    best_leaf = 0
    best_log_loss = 100
    
    print('Computing Euclidean ...')
    basic_mat = euclidean_matrix(X, non_categorical)

    for ah in sequence_ham:
        print('\n', ctime())
        print('Adding Hamming 1 (Categorical)... alpha = {}'.format(ah))
        dist_matr = ah * hamming_matrix(X, categorical)
        dist_matr += basic_mat
        dist_matr = pd.DataFrame(dist_matr)
        print(ctime())
        print('Start CV...')
        for k in ks:
            for ls in leaf_size:

                kf = KFold(n_splits=5, shuffle=True)
                accs = []
                rmse = []
                lls = []
                for train_index, test_index in kf.split(dist_matr):

                    X_train = dist_matr.iloc[train_index, train_index]
                    X_test = dist_matr.iloc[test_index, train_index]
                    y_train = y[train_index]
                    y_test = y[test_index]

                    neigh = KNeighborsClassifier(metric = 'precomputed',
                                                 n_neighbors=k, n_jobs=-2,
                                                 leaf_size=ls)
                    neigh.fit(X_train, y_train.ravel())
                    y_pred = neigh.predict(X_test)

                    accs.append(accuracy_score(y_test, y_pred))
                    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
                    lls.append(log_loss(y_test, neigh.predict_proba(X_test)))
                    
                    
                avg_lls = np.mean(lls)
                
                avg_acc = np.mean(accs)
                se_acc = sem(accs)

                avg_rmse = np.mean(rmse)
                se_rmse = sem(rmse)
                if (avg_lls < best_log_loss):
                    print('''New best params found! alpha:{}, k:{}, leaf:{},
                                                    log_loss:{}, 
                                                    acc:  {}, st.error:  {},
                                                    rmse: {}, st.error:  {}'''.format(ah, k, ls, avg_lls,
                                                                                    avg_acc, se_acc,
                                                                                    avg_rmse, se_rmse))
                    best_alpha = ah
                    best_k = k
                    best_accuracy = avg_acc
                    best_leaf = ls
                    best_log_loss = avg_lls
    
    return best_log_loss, best_accuracy, best_alpha, best_k, best_leaf

In [3]:
ham = np.logspace(-3, 0, 30) 

best_ll, best_acc, best_alpha, best_k, best_leaf = cv_params_logloss(X_train, y_train, categorical, non_categorical,
                                                    sequence_ham = ham)

Thu Aug 13 17:19:44 2020
START...
Computing Euclidean ...

 Thu Aug 13 17:19:50 2020
Adding Hamming 1 (Categorical)... alpha = 0.001
Thu Aug 13 17:20:01 2020
Start CV...
New best params found! alpha:0.001, k:1, leaf:60,
                                                    log_loss:3.722451164737895, 
                                                    acc:  0.892224, st.error:  0.0028377343075066145,
                                                    rmse: 0.3281785813768681, st.error:  0.004324890908055959
New best params found! alpha:0.001, k:1, leaf:70,
                                                    log_loss:3.592032745070712, 
                                                    acc:  0.8960000000000001, st.error:  0.0029899030084603053,
                                                    rmse: 0.3223556085199903, st.error:  0.004659980035010051
New best params found! alpha:0.001, k:1, leaf:90,
                                                    log_loss:3.5765593732457917, 
  