In [4]:
def parse_fasta_to_df(file_path, content_alias, id_alias = "ProtID"):
    from Bio import SeqIO
    import pandas as pd
    with open(file_path) as fasta_file:
        ids = []
        contents = []
        for record in SeqIO.parse(fasta_file, 'fasta'):
            ids.append(record.id)
            contents.append(record.seq)
        df = pd.DataFrame(data = {id_alias: ids, content_alias: contents})
        return df.set_index(id_alias)
    
def filter_unique_rownames(df):
    return df[~df.index.duplicated(keep='first')]


def import_protvec(filepath, namescol = "words"):
    """
    Import data frame of ProtVec 3-grams. 
    
    :param filepath: path to a TSV.
    :param namescol: name of a column with row names 
    :return: pandas dataframe with 3-grams as rownames.
    """
    import pandas as pd
    protvec_df = pd.read_csv(filepath, sep = "\t", header = 0)
    protvec_df_3gramidx = protvec_df.set_index(namescol)
    return(protvec_df_3gramidx)

def get3gramvec(threegr_df, threegr_name, as_list = False):
    if not threegr_name in threegr_df.index:
        raise ValueError(''.join(["The supplied ProtVec dataset is not trained for the threegram: ", threegr_name]))
    vec = threegr_df.loc[threegr_name].values
    if (as_list):
        vec = vec.tolist()
    return vec
    
def convert_seq_to_protvec(seq, threegr_df, substitute_any_with="G"):
    """
    Get ProtVec representation of a given sequence
    """
    import numpy as np
    protvec = np.zeros(100)
    for i in range(0, len(seq) - 3):
        this3gram = str(seq[i:i+3])
        this3gram = this3gram.replace("X", substitute_any_with)
        if not this3gram in threegr_df.index:
            # skip untrained 3grams
            continue
        this3gramvec = get3gramvec(threegr_df, this3gram)

        protvec = np.add(protvec, this3gramvec)
    # Exploratory DATA Analysis (EDA)
    # amino acid sequence length
    protvec = np.append(protvec, get_sequence_len(seq))
    # add amino acid frequency
    
    # avg distance between amino
    
    
    return protvec

def get_sequence_len(seq):
    count = 0
    for char in seq:
        count+=1
    return count

def calculate_avg_score(scores):
    sum = 0
    for value in scores:
        sum += value
    avg_sum = sum / len(scores)
    return avg_sum

In [5]:
import pandas as pd
import numpy as np
import os

# get current dir/filename
import inspect
this_file_path = os.path.abspath(inspect.getframeinfo(inspect.currentframe()).filename)
this_dir = os.path.dirname(this_file_path)

class_path = os.path.join(this_dir, "data", "PP_step1_trn.class")
seq_path = os.path.join(this_dir, "data", "PP_step1_trn.fas")
protvec_file = os.path.join(this_dir, "data", "protVec_100d_3grams.csv")

# parse information
class_df = filter_unique_rownames(parse_fasta_to_df(class_path,"Binds"))
seq_df = filter_unique_rownames(parse_fasta_to_df(seq_path,"Sequence"))

# Convert levels to numeric
class_df['Binds'] = class_df['Binds'].map({"Bind" : 1, "Non_Bind" : 0})


# Get Bind/non-bind ratio
counts = class_df['Binds'].value_counts()
ratio = counts[0]/counts[1] # 1: 536, # 0: 369
print("Bind/non-bind ratio is", str(ratio))
print(class_df['Binds'].value_counts())

Bind/non-bind ratio is 0.6884328358208955
1    536
0    369
Name: Binds, dtype: int64




In [6]:
# Convert sequences to their ProtVec representation
pv_df = import_protvec(protvec_file)
seq_df['Sequence'] = seq_df['Sequence'].apply(convert_seq_to_protvec, args = (pv_df,))

# Merge two dataframes into one - REDUNDANT
protvec_and_target_df = pd.merge(class_df, seq_df, left_index=True, right_index=True)


In [7]:
# TODO (?): access individual columns from the df
protvec_columns_df = pd.DataFrame(protvec_and_target_df['Sequence'].values.tolist())

In [8]:
# Merge two dataframes into one
protvec_and_target_df = pd.merge(class_df, seq_df, left_index=True, right_index=True)

In [9]:
# Eject features and targets, make CV splits
from sklearn.model_selection import StratifiedKFold
N_SPLITS_CV = 5

#features = np.stack(protvec_and_target_df['Sequence'].values.tolist(), axis = 0)
features = np.stack(protvec_and_target_df['Sequence'].values.tolist(), axis = 0)
targets = protvec_and_target_df['Binds'].values # == labels

skf = StratifiedKFold(n_splits=N_SPLITS_CV)
folds = skf.split(features, targets)

for train_index, test_index in folds:
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]# test layer

In [18]:
##### Fit ANN # http://scikit-learn.org/stable/modules/neural_networks_supervised.html
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

X = X_train
y = y_train


clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(6, 279), random_state=1)
#clf = MLPClassifier(warm_start= True, verbose= True, tol= 0.001, solver= 'lbfgs', 
#                    random_state = 6, max_iter = 100, 
#                    learning_rate_init = 0.83854545454545459, learning_rate = 'invscaling',
#                    hidden_layer_sizes = 24001, batch_size = 'auto', 
#                    alpha = 0.28283545454545456, activation = 'logistic')
clf.fit(X, y)    

# Predict sample
clf.predict(X_test)

scores = cross_val_score(clf, features, targets, cv = skf)
print(scores)
score_sum = 0
for score in scores:
    score_sum += score
score_avg = score_sum / len(scores)
print(score_avg)

[0.64285714 0.63535912 0.63535912 0.67955801 0.68333333]
0.6552933438568799


In [8]:
print("Training features shape: ", X_train.shape)
print("Testing features shape: ", X_test.shape)

Training features shape:  (725, 101)
Testing features shape:  (180, 101)


In [9]:
import numpy as np


# hidden layers: generates list of n_max tuples with 
# length: n_l_min--n_l_max integers, value: each between n_a_min and n_a_max
# https://www.kaggle.com/jilkoval/titanic-with-random-forest-and-neural-networks
def rand_hidden_layer_sizes(n_l_min,n_l_max,n_a_min,n_a_max,n_max=1000):
    n_l = np.random.randint(n_l_min,n_l_max,n_max)
    list_hl = []
    for nl_i in n_l:
        list_hl.append(tuple(np.random.randint(n_a_min,n_a_max,nl_i)))
    return list_hl

hidden_layers = rand_hidden_layer_sizes(1,3,1,10000,500)

In [113]:
# defining parameter space to search
# solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(6, 279), random_state=1
parameter_space = {
'hidden_layer_sizes': list(range(5000, 30000)) # https://stackoverflow.com/a/52029734
,'activation' : ['relu', 'identity', 'logistic', 'tanh'] #'
,'solver': ['lbfgs'] # lbfgs performs well on small datasets
,'alpha': list(np.linspace(0.00001, 1, 100))
,'batch_size' : ['auto'] # not used when solver is lbfgs
,'learning_rate' : ['constant','invscaling','adaptive'] # only with solver l
,'learning_rate_init' : list(np.linspace(0.001, 1, 100))
,'max_iter' : [50, 100, 150, 300, 600]
,'random_state' : [6] # sets the seed
,'warm_start' : [False, True]
,'tol' : [0.0001, 0.001, 0.01, 0.1, 1]
,'verbose' : [True] # prints progress to stdout
}




# parameters not used
# ,'power_t' : [0.5, ] # only when solver is SGD
# #'early_stopping' : [False], # only with SGD or ADAM
#'beta_1' : [0.9], # not with LBFGS
#'beta_2' : [0.999], # not with LBFGS
#'epsilon' : [1e-8] # not with LBFGS
#,'n_iter_no_change' : [10] # not with LBFGS
#'nesterovs_momentum' : [True], # only with SGD
#'momentum' : [0.9], # only with SGD
#'shuffle' : [True], # only with SGD or ADAM
#'validation_fraction' : [0.1] # not with LBFGS

In [114]:
# first randomized search to find roughly the best attributes
# choosing classifier
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=150) # max_iter will be constant

# running the search
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(mlp, parameter_space, cv=3, scoring='accuracy')
random_search.fit(X, y)


# Best paramete set
print('Best parameters found:\n', random_search.best_params_)

# All results
means = random_search.cv_results_['mean_test_score']
stds = random_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, random_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
best_model = random_search.best_estimator_

Best parameters found:
 {'warm_start': True, 'verbose': True, 'tol': 0.001, 'solver': 'lbfgs', 'random_state': 6, 'max_iter': 100, 'learning_rate_init': 0.83854545454545459, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 24001, 'batch_size': 'auto', 'alpha': 0.28283545454545456, 'activation': 'logistic'}
0.680 (+/-0.016) for {'warm_start': True, 'verbose': True, 'tol': 0.001, 'solver': 'lbfgs', 'random_state': 6, 'max_iter': 50, 'learning_rate_init': 0.30372727272727273, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 20395, 'batch_size': 'auto', 'alpha': 0.16162454545454547, 'activation': 'logistic'}
0.499 (+/-0.046) for {'warm_start': True, 'verbose': True, 'tol': 0.1, 'solver': 'lbfgs', 'random_state': 6, 'max_iter': 300, 'learning_rate_init': 0.27345454545454545, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 14399, 'batch_size': 'auto', 'alpha': 0.19192727272727272, 'activation': 'logistic'}
0.647 (+/-0.017) for {'warm_start': False, 'verbose': True, 'tol': 0.0001, 's

In [12]:
# best scores so far with RandomizedSearchCV, without EDA:
# 0.687 (+/-0.024) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 50, 'learning_rate_init': 0.80827272727272736, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 8585, 'batch_size': 'auto', 'alpha': 0.19192727272727272, 'activation': 'logistic'}
# 0.683 (+/-0.028) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 50, 'learning_rate_init': 0.8183636363636364, 'learning_rate': 'constant', 'hidden_layer_sizes': 9900, 'batch_size': 'auto', 'alpha': 0.92929363636363627, 'activation': 'logistic'}
# 0.672 (+/-0.031) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 150, 'learning_rate_init': 0.17254545454545456, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 7369, 'batch_size': 'auto', 'alpha': 0.29293636363636366, 'activation': 'logistic'}
# 0.665 (+/-0.034) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 50, 'learning_rate_init': 0.27345454545454545, 'learning_rate': 'constant', 'hidden_layer_sizes': 1800, 'batch_size': 'auto', 'alpha': 0.001, 'activation': 'logistic'}
# 0.663 (+/-0.024) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 150, 'learning_rate_init': 0.63672727272727281, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (4115,), 'batch_size': 'auto', 'alpha': 1e-05, 'activation': 'logistic'}
# 0.652 (+/-0.090) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 100, 'learning_rate_init': 0.16245454545454546, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (17, 16, 14, 5, 19, 14, 15, 13, 14, 9, 11, 11), 'batch_size': 'auto', 'alpha': 1e-05, 'activation': 'identity'}
# 0.650 (+/-0.036) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 50, 'learning_rate_init': 0.33400000000000002, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (97, 14, 30, 59), 'batch_size': 'auto', 'alpha': 0.0001, 'activation': 'identity'}
# 0.650 (+/-0.025) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 6, 'power_t': 0.5, 'max_iter': 150, 'learning_rate_init': 0.62663636363636366, 'learning_rate': 'invscaling', 'hidden_layer_sizes': (27, 58, 30), 'batch_size': 'auto', 'alpha': 0.0001, 'activation': 'relu'}
# 0.646 (+/-0.022) for {'verbose': True, 'solver': 'lbfgs', 'random_state': 4, 'power_t': 0.5, 'max_iter': 50, 'learning_rate_init': 0.7477272727272728, 'learning_rate': 'adaptive', 'hidden_layer_sizes': 700, 'batch_size': 'auto', 'alpha': 0.001, 'activation': 'relu'}

# parameters with EDA length
# 0.702 (+/-0.041) for {'warm_start': True, 'verbose': True, 'tol': 0.001, 'solver': 'lbfgs', 'random_state': 6, 'max_iter': 100, 'learning_rate_init': 0.83854545454545459, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 24001, 'batch_size': 'auto', 'alpha': 0.28283545454545456, 'activation': 'logistic'}

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

X = X_train
y = y_train


clf = MLPClassifier(warm_start= True, verbose= True, tol= 0.001, solver= 'lbfgs', 
                    random_state = 6, max_iter = 100, 
                    learning_rate_init = 0.83854545454545459, learning_rate = 'invscaling',
                    hidden_layer_sizes = 24001, batch_size = 'auto', 
                    alpha = 0.28283545454545456, activation = 'logistic')
clf.fit(X, y)    

# Predict sample
clf.predict(X_test)

scores = cross_val_score(clf, features, targets, cv = skf)
print(scores)

KeyboardInterrupt: 

In [15]:
# for GridSearchCV
parameter_space = { 
    'verbose' : [True], 
    'solver' : ['lbfgs'],
    'random_state' : [4,6], 
    'power_t' : [0.5], 
    'max_iter' : [50, 100, 150], 
    'learning_rate_init' :  [0.83854545454545459, 0.7477272727272728, 0.62663636363636366, 0.33400000000000002, 
                         0.16245454545454546, 0.63672727272727281, 0.27345454545454545, 0.17254545454545456, 
                         0.8183636363636364, 0.80827272727272736], 
    'learning_rate' : ['adaptive', 'constant', 'invscaling'], 
    'hidden_layer_sizes' : [8585, 9900, 7369, 1800, 4415, (17, 16, 14, 5, 19, 14, 15, 13, 14, 9, 11, 11), 
                                           (97, 14, 30, 59), (27, 58, 30), 700, 24001], 
    'batch_size' : ['auto'], 
    'alpha' : [0.19192727272727272, 0.92929363636363627, 0.29293636363636366, 0.001,
                                                             1e-05, 1e-05, 0.0001, 0.28283545454545456], 
    'activation' : ['logistic', 'identity', 'relu']}

In [None]:
# performing GridSearchCV with the best results that I got from RandomizedSearchCV
# choosing classifier
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=3)

# running the search
from sklearn.model_selection import GridSearchCV



random_search = GridSearchCV(mlp, parameter_space, cv=3, scoring='accuracy')
random_search.fit(X, y)

# 
# Best paramete set
print('Best parameters found:\n', random_search.best_params_)

# All results
means = random_search.cv_results_['mean_test_score']
stds = random_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, random_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# performing GridSearchCV in the parameter spaces close the best parameters so far