In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from Pfeature.pfeature import *

In [5]:
# One-letter codes of the 20 amino acids, in alphabetical order.
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
               'R', 'S', 'T', 'V', 'W', 'Y']

# initialise empty lists for collecting feature arrays during composition feature generation.
# The generator yields a fresh list per code, so every name is bound to its own list object.
# NOTE(review): none of the functions visible in this part of the notebook read these lists;
# they are kept because other cells may still rely on them.
A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y = (
    [] for _ in amino_acids)

# Locations of the input CSV files. These are the single source of truth: the same
# variables are used for loading below and are passed to feature_engineering() in main().
train_data_path = 'train.csv'
test_data_path = 'test.csv'

# Load through the path variables (not repeated string literals) so the in-memory frames
# can never come from a different file than the one used for feature generation.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# extract the amino acid residue sequences from the training data
train_data_seq = train_data.iloc[:, 1]

# extract the class labels (0,1) corresponding to the amino acid residue sequences from the training data
train_data_labels = train_data.iloc[:, 0]

# extract the amino acid residue sequences from the testing data
test_data_seq = test_data.iloc[:, 1]

In [14]:
def feature_engineering(dataset):
    ''' Responsible for feature generation (engineering) for a dataset of peptide sequences.
    The work is delegated to dpc_wp (presumably provided by the Pfeature star-import), which is
    called with `dataset`, the fixed output file name 'a.csv' and 1 as its third argument.
    The file 'a.csv' is then loaded back with pandas.
    Input: the value forwarded unchanged to dpc_wp; the callers in this notebook pass a CSV file path.
    Returns: A dataframe read from 'a.csv' with the row labelled 0 removed.
    NOTE(review): the reason row 0 is discarded is not visible here (possibly the input's header
    line being processed as a sequence) - confirm against dpc_wp's behaviour.
    '''
    feature_file = 'a.csv'  # reused on every call, so a later call replaces an earlier result on disk
    dpc_wp(dataset, feature_file, 1)
    # Read the generated features and discard the row with index label 0.
    return pd.read_csv(feature_file).drop(0)


In [15]:
#feature_engineering(test_data_path)

Unnamed: 0,DPC1_AA,DPC1_AC,DPC1_AD,DPC1_AE,DPC1_AF,DPC1_AG,DPC1_AH,DPC1_AI,DPC1_AK,DPC1_AL,...,DPC1_YM,DPC1_YN,DPC1_YP,DPC1_YQ,DPC1_YR,DPC1_YS,DPC1_YT,DPC1_YV,DPC1_YW,DPC1_YY
1,0.54,0.54,0.27,0.54,0.00,0.27,0.00,0.00,1.35,0.54,...,0.27,0.27,0.00,0.00,0.27,0.54,0.54,0.27,0.00,0.54
2,0.00,0.00,0.00,0.00,0.26,0.00,0.00,0.53,0.00,0.26,...,0.00,0.00,0.00,0.26,0.00,0.26,0.00,0.26,0.00,0.00
3,2.66,0.00,0.66,0.33,0.66,0.66,0.33,0.66,0.00,1.66,...,0.00,0.00,0.00,0.00,0.66,0.00,0.00,0.00,0.33,0.00
4,0.71,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.35,0.35,...,0.00,0.00,0.71,0.35,0.00,0.71,0.35,0.35,0.00,0.71
5,0.00,0.00,0.00,0.66,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,1.32,0.00,0.00,0.00,0.66,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.36,0.72,...,0.00,1.08,0.00,0.36,0.36,0.72,0.36,0.36,0.00,0.36
1635,0.85,0.00,0.57,0.28,0.00,0.85,0.00,0.28,0.85,1.14,...,0.00,0.57,0.28,0.00,0.00,0.00,0.00,0.00,0.00,0.28
1636,0.17,0.52,0.69,1.55,0.17,0.17,0.17,0.35,1.04,0.35,...,0.00,0.17,0.00,0.00,0.00,0.17,0.00,0.00,0.00,0.00
1637,0.39,0.00,0.26,1.03,0.00,0.13,0.00,0.64,0.39,0.26,...,0.13,0.51,0.00,0.26,0.51,0.13,0.26,0.26,0.13,0.13


In [16]:
def machine_learning_model():
    ''' Uses Stacking Classifier to stack two base estimators: a Logistic Regression Classifier
    and a Random Forest Classifier.
    A second Logistic Regression Classifier is the final estimator that combines the base estimators.
    Return: the (unfitted) model variable.
    '''
    # base estimators
    level0 = [
        ('lr', LogisticRegression()),
        # oob_score has to be the boolean True: the string "True" is not a valid value and is
        # rejected by scikit-learn releases that validate parameter types when fitting.
        ('rf', RandomForestClassifier(n_estimators=700,
                                      oob_score=True, n_jobs=-1, max_features="sqrt")),
    ]
    # classifier which will be used to combine the base estimators.
    level1 = LogisticRegression()
    # 5-fold cross validation, passed explicitly to the stacking classifier
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model


def evaluate_and_fit_model(train_features, train_labels):
    ''' Builds the machine learning model, scores it with repeated stratified k-fold cross validation
    (accuracy as the scoring criterion), prints the individual and mean scores, and finally fits the
    model on the entire training data so that it can be used to make predictions on the test dataset.
    Input: the training features and the training labels, as separate objects of matching length.
    Return: the model fitted on the full training data.
    '''
    stacked_model = machine_learning_model()

    # 10 stratified folds, repeated 5 times; random_state=1 fixes the shuffling of each repetition.
    repeated_folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)

    # error_score='raise' makes a failing fold abort the evaluation instead of being scored as NaN.
    fold_accuracies = cross_val_score(
        stacked_model,
        train_features,
        train_labels,
        scoring='accuracy',
        cv=repeated_folds,
        n_jobs=-1,
        error_score='raise',
    )

    print("\nAccuracy scores for the model from k-fold cross validation: \n", fold_accuracies)
    print('\nMean accuracy score: %.3f' % (mean(fold_accuracies)))
    # print('\nStandard deviation of accuracy score: %.3f' % (std(fold_accuracies)))

    # Final fit on all training rows; this fitted instance is what the caller receives.
    stacked_model.fit(train_features, train_labels)
    return stacked_model

In [17]:
def main():
    ''' Runs the whole pipeline: feature generation for the training and testing files, cross validated
    evaluation and fitting of the model, and export of the test predictions.
    Reads the module-level train_data_path, test_data_path, train_data_labels and test_data.
    Writes 'Group_18_Predictions_Output.csv' with the columns 'ID' (first column of the test data)
    and 'Label' (predicted probability of the positive class).
    '''
    # Feature generation for the training file. This must complete before the next call, because
    # feature_engineering reuses the same intermediate file for every dataset.
    train_data_comp = feature_engineering(train_data_path)

    # Feature generation for the testing file.
    test_data_comp = feature_engineering(test_data_path)

    model = evaluate_and_fit_model(train_data_comp, train_data_labels)
    predictions = model.predict_proba(test_data_comp)

    # predict_proba orders its columns like model.classes_ (the sorted class labels), so with 0/1
    # labels column 0 is P(label == 0) and column 1 is P(label == 1). The exported 'Label' value is
    # the probability of the positive class, hence column 1.
    finalpredictions = pd.DataFrame(
        {'ID': np.array(test_data.iloc[:, 0]), 'Label': predictions[:, 1]})
    finalpredictions.to_csv('Group_18_Predictions_Output.csv', index=False)
