In [1]:
# Testing the machine learning algorithm
# The user inputs a protein sequence to assess whether it is an epitope (1) or a non-epitope (0)

In [4]:
import pandas as pd
import numpy as np
import joblib# Load heuristic CSV
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [5]:
# Per-residue heuristic table: the first CSV column becomes the row index,
# so rows can be looked up by one-letter amino-acid code.
heuristics_df = pd.read_csv(
    'peptide_features.csv',
    index_col=0,  # index = amino_acid
)

In [6]:
# Preview the first five rows of the heuristic table.
heuristics_df.head(n=5)

Unnamed: 0_level_0,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point
amino_acid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,0.66,0.49,1.064,2.1,6.11
C,1.19,0.26,1.412,1.4,5.15
D,1.46,0.81,0.866,10.0,2.98
E,0.74,0.84,0.851,7.8,3.08
F,0.6,0.42,1.091,-9.2,5.76


In [8]:
# Load the fitted ML models from joblib files; the relative paths are resolved
# against the current working directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only load
# model files that come from a trusted source.
# The logistic-regression model is currently disabled:
# model_logreg = joblib.load('model_logreg.pkl')
model_dt = joblib.load('model_dt.pkl')  # presumably a decision-tree classifier, per its name
model_rf = joblib.load('model_rf.pkl')  # presumably a random-forest classifier, per its name

In [15]:
# Validate sequence
def validate_sequence(seq):
    """Normalize a peptide sequence and check it uses only the 20 standard residues.

    Surrounding whitespace (for example a stray space or newline left over from
    pasting into ``input()``) is removed and the sequence is upper-cased before
    it is checked.

    Parameters
    ----------
    seq : str
        One-letter amino-acid sequence, in any letter case.

    Returns
    -------
    str
        The cleaned, upper-case sequence.

    Raises
    ------
    ValueError
        If the sequence is empty after cleaning, or if it contains any
        character outside ``ACDEFGHIKLMNPQRSTVWY``.
    """
    seq = seq.strip().upper()
    # An empty string is trivially a subset of the valid alphabet, so it must be
    # rejected explicitly; averaging features over zero residues is undefined.
    if not seq:
        raise ValueError("Sequence is empty")
    # Every character of the sequence must be one of these 20 one-letter codes.
    valid_amino_acid = set('ACDEFGHIKLMNPQRSTVWY')
    if not set(seq).issubset(valid_amino_acid):
        raise ValueError("Sequence contains invalid amino acids")
    return seq

In [16]:
# Compute heuristic averages
def compute_heuristics(seq):
    """Average every column of ``heuristics_df`` over the residues of ``seq``.

    Parameters
    ----------
    seq : str
        Sequence whose characters are row labels of the module-level
        ``heuristics_df`` table.

    Returns
    -------
    dict
        Maps each column name of ``heuristics_df`` to the mean of that
        column's values looked up for each residue in ``seq`` (repeated
        residues are counted once per occurrence).
    """
    return {
        column: np.mean([heuristics_df.at[residue, column] for residue in seq])
        for column in heuristics_df.columns
    }

In [17]:
# Compute biopython features
def compute_biopython_features(seq):
    """Compute whole-sequence descriptors with Biopython's ``ProteinAnalysis``.

    Parameters
    ----------
    seq : str
        Protein sequence passed directly to ``ProteinAnalysis``.

    Returns
    -------
    dict
        Four entries, in this order: ``'isoelectric_point'``,
        ``'aromaticity'``, ``'hydrophobicity'`` (the GRAVY score, i.e. the
        Kyte-Doolittle average) and ``'stability'`` (the value returned by
        ``instability_index()``).
    """
    analysis = ProteinAnalysis(seq)
    features = {}
    features['isoelectric_point'] = analysis.isoelectric_point()
    features['aromaticity'] = analysis.aromaticity()
    features['hydrophobicity'] = analysis.gravy()  # Kyte-Doolittle average
    # The key says 'stability' but the value is Biopython's instability index.
    features['stability'] = analysis.instability_index()
    return features

In [18]:
# Main prediction function
def predict_epitope(seq):
    """Classify a peptide sequence with each loaded model.

    The sequence is validated, turned into a single feature row (heuristic
    averages followed by the Biopython descriptors), and passed to the
    module-level ``model_dt`` and ``model_rf``.

    Parameters
    ----------
    seq : str
        Peptide sequence as entered by the user.

    Returns
    -------
    dict
        ``{'DecisionTree': <label>, 'RandomForest': <label>}`` where each
        label is the first element of the corresponding ``predict`` output.

    Raises
    ------
    ValueError
        Propagated from ``validate_sequence`` when the sequence is rejected.
    """
    cleaned = validate_sequence(seq)
    # On a key collision the Biopython value wins but keeps the heuristic key's
    # position; the table preview shows an 'isoelectric_point' column, so that
    # entry ends up holding the Biopython isoelectric point.
    # NOTE(review): the resulting column order presumably matches the order the
    # models were fitted with - confirm against the training notebook.
    feature_row = {
        **compute_heuristics(cleaned),
        **compute_biopython_features(cleaned),
    }
    X = pd.DataFrame([feature_row])  # one row, one column per feature
    # The logistic-regression model is currently disabled, so only two models vote.
    return {
        'DecisionTree': model_dt.predict(X)[0],
        'RandomForest': model_rf.predict(X)[0],
    }

In [None]:
# Interactive entry point: read one sequence from the user, run it through
# predict_epitope, and print the per-model labels.
# predict_epitope raises ValueError (via validate_sequence) when the sequence
# contains characters other than the 20 standard amino-acid letters.
sequence = input("Enter protein sequence: ")
result = predict_epitope(sequence)
print("Predictions:", result)