In [1]:
import biovec
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [2]:
# TASK 1 - Generating corpus file from fasta file and train word2vec
'''model = biovec.ProtVec(corpus_fname="uniprot_sprot.fasta", 
                    n=3,
                    size=100,
                    window=5,
                    sg=1,
                    min_count=2)
    
model.save('./saved_model')'''

'model = biovec.ProtVec(corpus_fname="uniprot_sprot.fasta", \n                    n=3,\n                    size=100,\n                    window=5,\n                    sg=1,\n                    min_count=2)\n    \nmodel.save(\'./saved_model\')'

In [3]:
# Load saved model
pv = biovec.models.load_protvec('./saved_model')

In [5]:
# Prepare dataset for prediction
protein_names, sequences, labels = [], [], []


'''
    Labels:
        "+" stands for "binding protein" => 1
        "-" stands for "non-binding" => 0
'''
def convert_label(label_string):
 
    if label_string == "+":
        return 1
    elif label_string == "-":
        return 0
    else:
        return None

    
# Open file containing dataset    
with open('./ppi_data.fasta') as f:
    lines = f.read().splitlines()
    
    for i in range(len(lines)):
        
        if i % 3 == 0:
            protein_names.append(lines[i])
        elif i % 3 == 1:
            sequences.append(lines[i])
        elif i % 3 == 2:
            labels.append([convert_label(letter) for letter in lines[i]])
            
protein_names = np.array(protein_names)
sequences = np.array(sequences)
labels = np.array(labels)

assert(protein_names.shape[0] == sequences.shape[0] == labels.shape[0])

print(protein_names[0])
print(sequences[0])
print(labels[0])

>P0A8Q6
MGKTNDWLDFDQLAEEKVRDALKPPSMYKVILVNDDYTPMEFVIDVLQKFFSYDVERATQLMLAVHYQGKAICGVFTAEVAETKVAMVNKYARENEHPLLCTLEKA
[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


### Example for sequence   M(VWL|N|GEP)RPLEG ...
    - Window size 7
    - Center letter is N
    - Vec(VWLNGEP) = Vec(VWL) + Vec(WLN) + Vec(LNG) + Vec(NGE) + Vec(GEP)

In [1]:
window_size = 7
vector_size = 100

X, y = [], []

# Iterates over all proteins in dataset
for i in range(len(sequences)):

    # Loop over sequence
    for j in range(0,len(sequences[i]) - window_size + 1):
        sub_sequence = sequences[i][j:j+window_size]

        # Sum vectors of 3 grams
        tmp = np.zeros(vector_size)
        for k in range(0, window_size - 2):
            tmp = tmp + np.array(pv[sub_sequence[k:k+3]])
    
        X.append(tmp)
        y.append(labels[i][j+3])    
        
X, y = np.array(X), np.array(y)

print(X.shape)
print(y.shape)

NameError: name 'sequences' is not defined

In [7]:
# Cross validation in 10-Folds
skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
skf.get_n_splits(X, y)


hyperparameters = {
    'hidden_layer_sizes': [(25), (50), (100)],
    'learning_rate_init': [0.01, 0.001],
    'max_iter': [100, 200, 500]
}

accuracy, precision, recall, auc = [], [], [], []

for train_index, test_index in skf.split(X, y):
    print("TRAIN indeces:", train_index.shape, "TEST indeces:", test_index.shape)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    estimator = GridSearchCV(
        MLPClassifier(random_state=42),
        param_grid=hyperparameters, 
        n_jobs=-1,
        verbose=0)
    
    # Training
    estimator.fit(X_train, y_train)
    print(estimator.best_params_)
    
    # Predict on test data
    #predictions = estimator.predict(X_test)
    predictions = estimator.best_estimator_.predict(X_test)

    # Scores
    accuracy.append(accuracy_score(y_test, predictions))
    precision.append(precision_score(y_test, predictions))
    recall.append(recall_score(y_test, predictions, average='micro'))
    auc.append(roc_auc_score(y_test, predictions))
    
    print(accuracy)
    
    
print(np.mean(np.array(accuracy)))
print(np.mean(np.array(precision)))
print(np.mean(np.array(recall)))
print(np.mean(np.array(auc)))
    

TRAIN indeces: (80586,) TEST indeces: (8955,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768]
TRAIN indeces: (80586,) TEST indeces: (8955,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206]
TRAIN indeces: (80586,) TEST indeces: (8955,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238]
TRAIN indeces: (80586,) TEST indeces: (8955,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238, 0.721608040201005]
TRAIN indeces: (80587,) TEST indeces: (8954,)




{'hidden_layer_sizes': 50, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238, 0.721608040201005, 0.7204601295510387]
TRAIN indeces: (80587,) TEST indeces: (8954,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238, 0.721608040201005, 0.7204601295510387, 0.7216886307795398]
TRAIN indeces: (80587,) TEST indeces: (8954,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238, 0.721608040201005, 0.7204601295510387, 0.7216886307795398, 0.7225820862184499]
TRAIN indeces: (80588,) TEST indeces: (8953,)




{'hidden_layer_sizes': 50, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238, 0.721608040201005, 0.7204601295510387, 0.7216886307795398, 0.7225820862184499, 0.7195353512789009]
TRAIN indeces: (80588,) TEST indeces: (8953,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238, 0.721608040201005, 0.7204601295510387, 0.7216886307795398, 0.7225820862184499, 0.7195353512789009, 0.7195353512789009]
TRAIN indeces: (80588,) TEST indeces: (8953,)




{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 100}
[0.7206030150753768, 0.7221663874930206, 0.7222780569514238, 0.721608040201005, 0.7204601295510387, 0.7216886307795398, 0.7225820862184499, 0.7195353512789009, 0.7195353512789009, 0.7173014631966939]
0.720775851202435
0.5111729424570868
0.720775851202435
0.510207754657122
