In [19]:
import os
import numpy as np
import pandas as pd

# set seed for the reproducible result
SEED = 42

In [20]:
augmented_dir = './data/_augmented_features' # we will get names from the augmented proteins
augmented_proteins = [x[:-4] for x in os.listdir(augmented_dir) if x[-3:] == 'csv'] # get protein name list to be processed for building machine learning models
print('the number of initial proteins:', len(augmented_proteins))
print(augmented_proteins[:10])

the number of initial proteins: 121
['24622_2', 'A0A024RAY2_P05783', 'A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9K9Z1', 'E9Q1P8', 'E9Q5G3', 'O08537']


# build amino acid sequence dataset

In [21]:
oglcnac_data = pd.read_csv('./data\oglcnacome_sites.csv', index_col=0) 
print(oglcnac_data.info())
oglcnac_data.sample(10, random_state=SEED)

<class 'pandas.core.frame.DataFrame'>
Index: 4501 entries, 0 to 4500
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   UniprotKB ID   4501 non-null   object 
 1   organism       4501 non-null   object 
 2   oglcnacscore   4501 non-null   float64
 3   oglcnac sites  4501 non-null   object 
 4   sequence       4501 non-null   object 
dtypes: float64(1), object(4)
memory usage: 211.0+ KB
None


Unnamed: 0,UniprotKB ID,organism,oglcnacscore,oglcnac sites,sequence
1634,Q00587,Homo sapiens,12.207032,[132],MPGPQGGRGAATMSLGKLSPVGWVSSSQGKRRLTADMISHPLGDFR...
3687,Q9BQC3,Homo sapiens,27.611586,"[467, 470, 474]",MESMFSSPAEAALQRETGVPGLLTPLPDLDGVYELERVAGFVRDLG...
4244,Q9UIJ7,Homo sapiens,11.665506,[18],MGASARLLRAVIMGAPGSGKGTVSSRITTHFELKHLSSGDLLRDNM...
4005,Q9NQ66,Homo sapiens,6.005927,"[187, 473]",MAGAQPGVHALQLKPVCVSDSLKKGTKFVKWDDDSTIVTPIILRTD...
468,O75152,Homo sapiens,5.077726,"[382, 443, 543, 579]",MPNQGEDCYFFFYSTCTKGDSCPFRHCEAAIGNETVCTLWQEGRCF...
3042,Q8N135,Homo sapiens,12.716548,[337],MGGAGILLLLLAGAGVVVAWRPPKGKCPLRCSCSKDSALCEGSPDL...
1910,Q14247,Homo sapiens,18.858372,"[11, 240, 277, 322, 323, 328, 331, 332, 345, 401]",MWKASAGHAVSIAQDDAGADDWETDPDFVNDVSEKEQRWGAKTVQG...
151,E9Q555,Mus musculus,11.752367,"[1202, 1206]",MECPQCGHVSSEKAPKFCSECGQKLPSAATVQGDLKNDNTLVVSST...
3896,Q9H4A4,Homo sapiens,16.163186,"[60, 247]",MASGEHSPGSGAARRPLHSAQAVDVASASNFRAFELLHLHLDLRAE...
2749,Q80TE4,Mus musculus,20.86847,"[1702, 1704]",MSDPRPSQAEKHKLGRAAAKLKDPSRTMQADDYFARKFKAINGSMG...


In [22]:
from functions import *

In [23]:
'''
augmented protein names include either single element like 'P53621' or multiple elements like 'A0A024RAY2_P05783'
first step is to check if name elements exist in the o-glcnacome database column 'UniprotKB ID' 
if any element exists in the database, then build a dataset of the protein's amino acid sequence with its positive locations
'''

# Convert UniprotKB IDs to a set for faster lookup
database_proteins = set(oglcnac_data['UniprotKB ID'].values)

# Initialize dictionaries and list
positivity_data = {}  # "protein name : sequence with positive sites"
name_augmented_oglcnacome = {}  # names between augmented proteins and database
not_in_database  = []  # proteins not in the database
not_in_secondary = [] # proteins not in the database
duplicated = {} # duplicated proteins

for protein_name in augmented_proteins:
    name_elements = protein_name.split('_')  # Split protein names by '_'
    
    for element in name_elements:
        if element in database_proteins:
            oglcnac_name = element
            break
        else:
            oglcnac_name = None
    
    if oglcnac_name: 
        if oglcnac_name in positivity_data:
            duplicated[protein_name] = oglcnac_name
            
        # Update name matching if protein name has multiple elements
        if len(name_elements) > 1:
            name_augmented_oglcnacome[protein_name] = oglcnac_name
            protein_name = oglcnac_name
            
        if os.path.exists(f"./data/_secondary_structure/dynamine_results/{protein_name}_backbone.pred"):
            # Retrieve sequence data and update dataset
            protein_oglcnac = oglcnac_data[oglcnac_data['UniprotKB ID'] == oglcnac_name]
            positivity_data[protein_name] = sequence_with_positivity(protein_oglcnac)
        else:
            not_in_secondary.append(protein_name)
            
    else:
        not_in_database.append(protein_name)

print(f'total number of matching proteins: {len(augmented_proteins)} -> {len(positivity_data)}\n')

print(f'{len(duplicated)} duplicated proteins')
display(duplicated)

print(f'{len(not_in_database)} proteins not in o-glcnacome database')
print(not_in_database)

print(f'\n{len(not_in_secondary)} proteins not in secondary database')
print(not_in_secondary)

total number of matching proteins: 121 -> 105

5 duplicated proteins


{'P05783': 'P05783',
 'P0CG62_P0CG49': 'P0CG49',
 'P63249_P63248': 'P63248',
 'Q4R561_P60710': 'P60710',
 'Q9WVB1_P35279': 'P35279'}

9 proteins not in o-glcnacome database
['24622_2', 'E9K9Z1', 'O08984', 'P02470', 'P02488', 'P02505', 'P04799', 'P05451', 'P07756']

2 proteins not in secondary database
['P24622', 'P24622']


In [24]:
print('augmented proteins : o-glcnacome database')
display(name_augmented_oglcnacome)

augmented proteins : o-glcnacome database


{'A0A024RAY2_P05783': 'P05783',
 'P0CG62_P0CG49': 'P0CG49',
 'P24622_2': 'P24622',
 'P63249_P63248': 'P63248',
 'P68406_P24622_2': 'P24622',
 'Q4R561_P60710': 'P60710',
 'Q9WVB1_P35279': 'P35279'}

In [25]:
from mauri_feature import *

In [26]:
different_length = [] # difference between length of secondary and flexibility dataset
secondary_data = {}

for protein_name in positivity_data:
    # process flexibility data
    flexibility = pd.read_fwf(f"./data/_secondary_structure/dynamine_results/{protein_name}_backbone.pred", header=None, names=['flexibility']).iloc[11:].reset_index(drop=True)
    flexibility = flexibility['flexibility'].apply(lambda x: x.split()[1]).to_frame().astype({'flexibility':float})
    
    # process secondary structure data
    temp   = pd.read_csv(f"./data/_secondary_structure/spider3_results/{protein_name}.spd33")
    columns = temp.columns[0].split()
    secondary = pd.DataFrame(columns=columns)
    for i, column in enumerate(columns):
        secondary[column] = temp.iloc[:,0].apply(lambda x: x.split()[i])
        if i >= 3:
            secondary[column] = secondary[column].astype('float')

    secondary.index.name = protein_name
    
    if len(flexibility) == len(secondary):
        mauri = secondary[['SEQ']].copy()
        mauri = mauri.rename(columns = {'SEQ' : 'residue'})
        sequence = mauri['residue'].sum()
        
        # make window for easier feature computation
        mauri['window'] = pd.Series([make_window(mauri['residue'].sum(), x) for x in mauri.index])
        # side chain -1 to 5
        for num in range(-1, 6):
            if num != 0:
                mauri[f'side_{num}'] = mauri.window.apply(lambda x: mauri_side(x, num))
        
        # non-polar aliphatic -3 to -1
        mauri['npa(-3,-1)'] = mauri.window.apply(mauri_npa)
        
        # polar positive -7 to -5
        mauri['ppo(-7,-5)'] = mauri.window.apply(mauri_ppo)
        
        # number of S and T -10 to 10
        mauri['s/t'] = mauri.window.apply(mauri_st)
        
        # flexibility
        mauri = pd.concat([mauri, flexibility], axis=1)
        
        # secondary structure
        mauri['ss'] = secondary['SS']
        
        # presence of proline at +1
        mauri['p(1)'] = mauri.window.apply(is_proline)
        
        # secondary structure by phi and psi
        mauri['ss_angle'] = secondary.apply(lambda x: ss_angle(x['Phi'], x['Psi']), axis=1)
        
        # nature of the site: S or T
        mauri = pd.concat([mauri, positivity_data[protein_name][['positivity']]], axis=1)
        
        positivity_data[protein_name] = mauri.drop(['window'], axis=1)
        
    else:
        different_length.append(protein_name)
        
next(iter(positivity_data.values())).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   residue      430 non-null    object 
 1   side_-1      430 non-null    object 
 2   side_1       430 non-null    object 
 3   side_2       430 non-null    object 
 4   side_3       430 non-null    object 
 5   side_4       430 non-null    object 
 6   side_5       430 non-null    object 
 7   npa(-3,-1)   430 non-null    int64  
 8   ppo(-7,-5)   430 non-null    int64  
 9   s/t          430 non-null    int64  
 10  flexibility  430 non-null    float64
 11  ss           430 non-null    object 
 12  p(1)         430 non-null    int64  
 13  ss_angle     430 non-null    object 
 14  positivity   430 non-null    int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 50.5+ KB


## set variables

In [27]:
for_onehot = { # column_name : classes
    # for input variables
    'residue' : ['S', 'T'],
    
    # mauri's basic features
    **{f'side_{num}' : ['very_small', 'small', 'normal', 'long', 'glycine', 'proline', 'aromatic'] for num in range(-1,6) if num != 0},
    'ss' : ['C', 'E', 'H'],
    'ss_angle' : ['alpha', 'other', 'beta'],
    
    # for output variables
    'positivity' : [0, 1]
}

x_cts = ['npa(-3,-1)', 'ppo(-7,-5)', 's/t', 'flexibility', 'p(1)']
x_cat = ['residue'] + \
    [f'side_{num}' for num in range(-1,6) if num != 0] + ['ss', 'ss_angle']
x_var = x_cts + x_cat

y_cts = []
y_cat = ['positivity']
y_var = y_cts + y_cat

## hyper parameter optimization by K-fold cross-validation

In [28]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

epochs = 1000
from keras.callbacks import EarlyStopping
patience = 30
callbacks = [EarlyStopping(patience=patience, restore_best_weights=True, monitor='val_loss')]

test_size = 0.2


### set initial parameters

In [29]:
model_type = 'MLP_UP_BASIC'

initial_params = default_params.copy()
initial_params.update({
    'window_size'    : 10
    })

print('initial parameters')
for key, value in initial_params.items():
    print(f'{key:<14} : {value}')

search_space = {
    'dnn_layers'    : [1, 2, 3, 4, 5],
    'dnn_neurons'   : [32, 64, 128, 256],
    'learning_rate' : [0.0001, 0.001, 0.01]
}

initial parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


### model training with K-fold

In [30]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update  = False

params = initial_params.copy()

MODELs = []
METRICs = []
METRIC_MEAN = []
model_id = 1
verbose = 0
for param_name, space in search_space.items():
    for point in space:
        clear_output(wait=True)
        display(METRIC_MEAN)
        params[param_name] = point
        for key, value in search_space.items():
            print(f'{key:<14} : {value}')
        
        data_x = []
        data_y = []
        for protein_name in positivity_data:
            positivity = positivity_data.get(protein_name)
            ST_idx = np.where((positivity['residue'] == 'S') | (positivity['residue'] == 'T'))[0]
            
            # get X dataset
            x_onehot = get_onehots(positivity[x_var], columns = x_cat, for_onehot = for_onehot)
            x_features = list(x_onehot.columns)
            
            # get Y dataset
            y_onehot = get_onehots(positivity[y_var], columns = y_cat, for_onehot = for_onehot)
            y_labels = list(y_onehot.columns)
            
            for idx in ST_idx:
                window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                window_x = window_x.reshape(-1)
                label_y  = np.array(y_onehot.iloc[idx])
                
                data_x.append(window_x)
                data_y.append(label_y)
                
        data_x = np.array(data_x)
        data_y = np.array(data_y)
        model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params) # I don't know why, but this row is helping producing the same training result of a neural network

        print('data x shape:', data_x.shape)
        print('data y shape:', data_y.shape)
        print('class y counts:', data_y.sum(0))
        print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')
        
        splitter = StratifiedShuffleSplit(n_splits = 1, test_size = test_size, random_state = SEED)
        train_idx, test_idx = list(splitter.split(data_x, data_y))[0]
        
        train_x = data_x[train_idx]
        train_y = data_y[train_idx]
        
        test_x = data_x[test_idx]
        test_y = data_y[test_idx]
        
        train_x, test_x = data_scaling(train_x, test_x)
        
        splitter_kf = KFold(n_splits = 5)
        for cv_idx, (train_idx_kf, test_idx_kf) in enumerate(splitter_kf.split(train_x, train_y)):
            train_x_kf, train_y_kf = train_x[train_idx_kf], train_y[train_idx_kf]
            train_x_kf, train_y_kf = upsample_data(train_x_kf, train_y_kf) # up-sample the training dataset
            test_x_kf, test_y_kf = train_x[test_idx_kf], train_y[test_idx_kf]
            
            model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params) 
            model_name = name_model(f'{model_type}_KFOLD', params)
            
            
            model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
            if not os.path.exists(model_folder):
                os.makedirs(model_folder)
            model_path    = f'{model_folder}/{cv_idx}.h5'
            metric_path   = f'{model_folder}/{cv_idx}.csv'
            
            
            if not os.path.exists(model_path) or model_update:
                time_start = time.time()
                history = model.fit(train_x_kf, train_y_kf, verbose=verbose, 
                                    epochs = 10000, callbacks = callbacks,
                                    validation_data = (test_x_kf, test_y_kf))
                time_end = time.time()
                training_time = round((time_end - time_start)/60, 3)
                
                model.save_weights(model_path)
                
                test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                model_metrics = {
                    'model_id' : model_id,
                    'cv_idx'   : cv_idx,
                    **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                    'train_y'     : train_y.shape[-1],
                    'test_size'   : test_x.shape[0],
                    **params,
                    'regularizer_input' : params['regularizer']['input'],
                    'regularizer_hidden' : params['regularizer']['hidden'],
                    'regularizer_bias' : params['regularizer']['bias'],
                    'training_time': training_time,
                    'test_loss': test_loss,
                    'accuracy': accuracy,
                    **{f'precision_{x}': precision[x] for x in range(len(precision))},
                    **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                    **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                
                model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                model_metrics.to_csv(metric_path, index=False)
                
            else:
                model.load_weights(model_path)
                model_metrics = pd.read_csv(metric_path, header=0)
                
            print(f'f1 score: {model_metrics.f1_1[0]:.3f}')
            
            model_metrics['model_id'] = model_id
            METRICs.append(model_metrics)
            MODELs.append(model)
        
        METRIC_MEAN = pd.concat(METRICs).groupby('model_id').mean()
        f1_best = METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0].f1_1
        print(f'best f1 score: {f1_best}')
        model_id += 1
    if param_name in ['learning_rate']: # for float-type parameters
        best_value = float(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"
    else: # for int-type parameters
        best_value = int(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"

clear_output(wait=True)
display(METRIC_MEAN)
for key, value in search_space.items():
    print(f'{key:<14} : {value}')


Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,9486.0,1155.0,2.0,2372.0,1.0,64.0,1.0,64.0,0.001,...,,0.5298,,0.0,96.37,3.63,100.0,100.0,98.15,7.0
2,2.0,9486.0,1155.0,2.0,2372.0,1.0,64.0,2.0,64.0,0.001,...,,0.543,,0.0,96.37,3.63,100.0,100.0,98.15,7.0
3,2.0,9486.0,1155.0,2.0,2372.0,1.0,64.0,3.0,64.0,0.001,...,,0.5632,,0.0,96.37,3.63,100.0,100.0,98.15,7.0
4,2.0,9486.0,1155.0,2.0,2372.0,1.0,64.0,4.0,64.0,0.001,...,,0.601,,0.0,96.37,3.63,100.0,100.0,98.15,7.0


dnn_layers     : [1, 2, 3, 4, 5]
dnn_neurons    : [32, 64, 128, 256]
learning_rate  : [0.0001, 0.001, 0.01]


KeyboardInterrupt: 

In [None]:
print('data x shape: ', data_x.shape)
print('data y shape: ', data_y.shape)
print('train x shape:', train_x.shape)
print('test  x shape:', test_y.shape)

data x shape:  (11858, 420)
data y shape:  (11858, 2)
train x shape: (9486, 420)
test  x shape: (2372, 2)


In [None]:
print('optimal parameters')
for key, value in params.items():
    print(f'{key:<14} : {value}')

optimal parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 1
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.0001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## evaluate model's general performance through Monte-Carlo cross-validation

In [None]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update = False
            
MODELs = []
METRICs = []
METRIC_MEAN = []
verbose = 0
        
data_x = []
data_y = []

for protein_name in positivity_data:
    positivity = positivity_data.get(protein_name)
    ST_idx = np.where((positivity['residue'] == 'S') | (positivity['residue'] == 'T'))[0]
    
    # get X dataset
    x_onehot = get_onehots(positivity[x_var], columns = x_cat, for_onehot = for_onehot)
    x_features = list(x_onehot.columns)
    
    # get Y dataset
    y_onehot = get_onehots(positivity[y_var], columns = y_cat, for_onehot = for_onehot)
    y_labels = list(y_onehot.columns)
    
    for idx in ST_idx:
        window_x = np.array(get_window(x_onehot, idx, params['window_size']))
        window_x = window_x.reshape(-1)
        label_y  = np.array(y_onehot.iloc[idx])
        
        data_x.append(window_x)
        data_y.append(label_y)
        
data_x = np.array(data_x)
data_y = np.array(data_y)

print('data x shape:', data_x.shape)
print('data y shape:', data_y.shape)
print('class y counts:', data_y.sum(0))
print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')

splitter = StratifiedShuffleSplit(n_splits = 5, test_size = test_size, random_state = SEED)
for cv_idx, (train_idx, test_idx) in enumerate(splitter.split(data_x, data_y)):
    train_x, train_y = data_x[train_idx], data_y[train_idx]
    train_x, train_y = upsample_data(train_x, train_y) # up-sample the training dataset
    test_x , test_y  = data_x[test_idx],  data_y[test_idx]
    
    train_x, test_x = data_scaling(train_x, test_x)
    
    model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params)
    model_name = name_model(f'{model_type}', params)
    
    
    model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    model_path    = f'{model_folder}/{cv_idx}.h5'
    metric_path   = f'{model_folder}/{cv_idx}.csv'
    
    
    if not os.path.exists(model_path) or model_update:
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=verbose, 
                            epochs = 10000, callbacks = callbacks,
                            validation_split = test_size/(1-test_size))
        time_end = time.time()
        training_time = round((time_end - time_start)/60, 3)
        
        model.save_weights(model_path)
        
        test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
        model_metrics = {
            'model_id' : model_id,
            'cv_idx'   : cv_idx,
            **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
            'train_y'     : train_y.shape[-1],
            'test_size'   : test_x.shape[0],
            **params,
            'regularizer_input' : params['regularizer']['input'],
            'regularizer_hidden' : params['regularizer']['hidden'],
            'regularizer_bias' : params['regularizer']['bias'],
            'training_time': training_time,
            'test_loss': test_loss,
            'accuracy': accuracy,
            **{f'precision_{x}': precision[x] for x in range(len(precision))},
            **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
            **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
        model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
        model_metrics.to_csv(metric_path, index=False)
        
    else:
        model.load_weights(model_path)
        model_metrics = pd.read_csv(metric_path, header=0)
        
    print(f'f1 score: {model_metrics.f1_1[0]:.3f}')
    
    METRICs.append(model_metrics)
    MODELs.append(model)

METRICs = pd.concat(METRICs)
METRIC_MEAN = METRICs.groupby('model_id').mean()
METRIC_STD = METRICs.groupby('model_id').std()

data x shape: (11858, 420)
data y shape: (11858, 2)
class y counts: [11429   429]
class y ratio: [0.9638 0.0362]
f1 score: 11.570
f1 score: 13.080
f1 score: 12.960
f1 score: 11.710
f1 score: 13.910


In [None]:
METRICs

Unnamed: 0,model_id,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
0,13,0,18286,420,2,2372,1,64,1,64,...,,0.781,0.527061,75.51,97.33,6.65,76.68,44.19,85.78,11.57
0,13,1,18286,420,2,2372,1,64,1,64,...,,0.777,0.550626,73.1,97.8,7.41,73.75,55.81,84.09,13.08
0,13,2,18286,420,2,2372,1,64,1,64,...,,2.119,0.570571,73.95,97.71,7.37,74.72,53.49,84.68,12.96
0,13,3,18286,420,2,2372,1,64,1,64,...,,2.283,0.568303,76.48,97.32,6.78,77.73,43.02,86.43,11.71
0,13,4,18286,420,2,2372,1,64,1,64,...,,3.123,0.551692,74.96,97.85,7.95,75.68,55.81,85.35,13.91


In [None]:
METRIC_MEAN

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,2.0,18286.0,420.0,2.0,2372.0,1.0,64.0,1.0,64.0,0.0001,...,,1.8166,0.55365,74.8,97.602,7.232,75.712,50.464,85.266,12.646


In [None]:
METRIC_STD

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,1.581139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.020893,0.017471,1.319526,0.257818,0.526612,1.568748,6.346107,0.915494,0.989763
