In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd

In [None]:
# Read the pre-split train/test tables from disk.
train = pd.read_csv('DF_Train_SHAP.csv')
test = pd.read_csv('DF_Test_SHAP.csv')

# 'Final_prediction' is the label column; every other column is a feature.
# Both parts are converted to plain NumPy arrays for Keras / scikit-learn.
y_train = train['Final_prediction'].values
x_train = train.drop(columns=['Final_prediction']).values

y_test = test['Final_prediction'].values
x_test = test.drop(columns=['Final_prediction']).values

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# Layer sizes: the input width follows the feature matrix, the encoding width is fixed.
encoding_dim = 122
input_dim = x_train.shape[1]
num_classes = len(set(y_train))  # number of distinct labels in the training set

# Single-hidden-layer autoencoder: input -> ReLU encoding -> sigmoid reconstruction.
input_layer = Input(shape=(input_dim,))
encoder = Dense(units=encoding_dim, activation='relu')(input_layer)
decoder = Dense(units=input_dim, activation='sigmoid')(encoder)

# Assemble the model and train it with Adam on mean-squared reconstruction error.
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(loss='mse', optimizer='adam')

# Print the layer-by-layer architecture.
autoencoder.summary()

In [None]:
# Fit the autoencoder to reproduce its own input. The test matrix is passed as
# validation data, so it is only used to report a per-epoch validation loss.
history = autoencoder.fit(
    x=x_train,
    y=x_train,
    validation_data=(x_test, x_test),
    batch_size=60,
    epochs=50,
    shuffle=True,
)


In [None]:
# Re-use the autoencoder's input and encoding layer as a stand-alone model whose
# output is the encoded representation.
encoder_model = Model(outputs=encoder, inputs=input_layer)

# Encode the training split first, then the test split.
X_train_encoded, X_test_encoded = [
    encoder_model.predict(features) for features in (x_train, x_test)
]

In [None]:
# Binary classification head: a single sigmoid unit fed directly with the
# encoded features (no hidden layer).
classifier_input = Input(shape=(X_train_encoded.shape[1],))
classifier = Dense(units=1, activation='sigmoid')(classifier_input)
classifier_model = Model(inputs=classifier_input, outputs=classifier)

# Binary cross-entropy loss, Adam optimizer, accuracy reported during training.
classifier_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Fit the sigmoid head on the encoded training features; the encoded test split
# is passed as validation data to report per-epoch validation loss/accuracy.
classifier_history = classifier_model.fit(
    x=X_train_encoded,
    y=y_train,
    validation_data=(X_test_encoded, y_test),
    batch_size=120,
    epochs=50,
    shuffle=True,
)

In [None]:
# NOTE(review): `keras.wrappers.scikit_learn` is not shipped with recent Keras
# releases — confirm the pinned Keras version still provides this import.
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model


def build_classifier():
    """Build and compile a fresh, untrained copy of the sigmoid classification head.

    The wrapper calls this each time it is fitted. Returning a new model every
    time (instead of the already-trained ``classifier_model``) keeps
    cross-validation folds independent: no fold starts from weights that were
    fitted on data including its own validation samples.

    Returns:
        A compiled Keras model with the same architecture and compile settings
        as ``classifier_model`` (one sigmoid unit on the encoded features).
    """
    head_input = Input(shape=(X_train_encoded.shape[1],))
    head_output = Dense(1, activation='sigmoid')(head_input)
    model = Model(inputs=head_input, outputs=head_output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# scikit-learn compatible wrapper so the Keras head can be used with cross_val_score.
classifier_wrapper = KerasClassifier(build_fn=build_classifier, epochs=10, batch_size=10, verbose=0)

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, log_loss

# Loss and accuracy of the Keras head on the encoded test split.
evaluation = classifier_model.evaluate(X_test_encoded, y_test)
print(f"Test Loss: {evaluation[0]}")
print(f"Test Accuracy: {evaluation[1]}")

# The head has a single sigmoid unit, so predict() returns one column of
# positive-class probabilities; flatten it to a 1-D vector.
predictions = classifier_model.predict(X_test_encoded).ravel()

# Convert probabilities to class labels by thresholding at 0.5.
# (argmax over the single output column would always yield class 0.)
predicted_classes = (predictions > 0.5).astype(int)
true_classes = y_test

# Print a few examples
print("Predicted classes: ", predicted_classes[:10])
print("True classes: ", true_classes[:10])

# Confusion matrix
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(true_classes, predicted_classes)
print("Classification Report:")
print(class_report)

# ROC curve and AUC are computed from the probabilities, not the hard labels.
fpr, tpr, thresholds = roc_curve(true_classes, predictions)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the chance diagonal
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Calculate log loss from the positive-class probabilities
log_loss_value = log_loss(true_classes, predictions)
print(f"Log Loss: {log_loss_value:.4f}")

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation of the wrapped Keras classifier on the
# encoded training features.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(
    estimator=classifier_wrapper,
    X=X_train_encoded,
    y=y_train,
    cv=stratifiedkf,
)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

In [None]:
import shap
from sklearn.ensemble import RandomForestRegressor

# Test features as a DataFrame so column names are available to SHAP and to the
# CSV exports. This name was used below without ever being defined, which
# raised NameError on a fresh kernel.
x_test_df = test.drop(columns='Final_prediction')

# Per-sample reconstruction error: mean squared difference between each test
# row and the autoencoder's reconstruction of it.
reconstructions = autoencoder.predict(x_test)
reconstruction_errors = np.mean((x_test - reconstructions) ** 2, axis=1)

# Surrogate model: a random forest regressor fitted to predict the
# reconstruction error from the original features, so that SHAP can attribute
# the error to individual features.
surrogate_model = RandomForestRegressor()
surrogate_model.fit(x_test_df, reconstruction_errors)

# SHAP explanation of the surrogate, using the test features as background data.
explainer = shap.Explainer(surrogate_model, x_test_df)
shap_values = explainer(x_test_df)

# Visualization
shap.summary_plot(shap_values, x_test_df)
shap.heatmap_plot(shap_values)


In [None]:
# Tabulate the SHAP values under the feature column names and write them to CSV
# without the row index.
shap_df = pd.DataFrame(data=shap_values.values, columns=x_test_df.columns)
shap_df.to_csv('shap_values.csv', index=False)

In [None]:
from matplotlib import pyplot as plt

# Subset of feature columns to plot and export separately.
desired_features = [
    'antigenicity_1',
    'b_cells_probability_score',
    'mhci_score',
    'mhci_rank',
    'mhcii_rank',
    'mhcii_score',
    'surface_probability',
    'signal_peptide_SP',
    'signal_peptide_LIPO',
    'signal_peptide_TAT',
    'signal_peptide_TATLIPO',
    'signal_peptide_PILIN',
    'signal_peptide_OTHER',
]

# Positional index of each requested feature among the test columns
# (raises ValueError if a name is not present).
column_names = list(x_test_df.columns)
feature_indices = list(map(column_names.index, desired_features))

# Restrict the SHAP explanation to those columns.
shap_values_desired = shap_values[:, feature_indices]

# Summary plot, then heatmap, for the selected features only.
shap.summary_plot(shap_values_desired, x_test_df.iloc[:, feature_indices])
plt.show()

shap.heatmap_plot(shap_values_desired)

# Export the selected SHAP values under the requested feature names.
shap_desired_df = pd.DataFrame(shap_values_desired.values, columns=desired_features)
shap_desired_df.to_csv('shap_values_biological.csv', index=False)

In [None]:
# Persist the autoencoder model to 'autoencoder_model.keras' in the working directory.
autoencoder.save('autoencoder_model.keras')

In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Fit a random forest (default hyper-parameters) on the encoded training features.
clf = RandomForestClassifier()
clf.fit(X_train_encoded, y_train)

# Hard class labels for the encoded test split.
y_pred = clf.predict(X_test_encoded)

# Label-based metrics, each printed right after it is computed.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {precision}')

Recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {Recall}')

WF1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
print(f'WF1 score: {WF1_score}')

# Probability of the second class (column 1) drives log loss and the ROC curve.
y_pred_proba = clf.predict_proba(X_test_encoded)[:, 1]

log_loss_value = metrics.log_loss(y_test, y_pred_proba)
print(f'Log_Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, color='darkorange', linewidth=2)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', linewidth=2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 5-fold stratified cross-validation on the encoded training features.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))


In [None]:
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Train a support-vector classifier on the encoded features.
# probability=True enables predict_proba, which log loss and ROC need below.
clf = SVC(probability=True)
clf.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_encoded)

# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')

Recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {Recall}')

WF1_score = metrics.f1_score(y_test, y_pred)
print(f'WF1 score: {WF1_score}')

# Probability of the second class (column 1).
y_pred_proba = clf.predict_proba(X_test_encoded)[:, 1]

log_loss_value = metrics.log_loss(y_test, y_pred_proba)
print(f'Log_Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 5-fold stratified cross-validation on the encoded training features.
# Run once only: the cell previously repeated this identical (and, for an SVC
# with probability estimates, expensive) computation before and after the plot.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(clf, X_train_encoded, y_train, cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))


In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier (default hyper-parameters) on the encoded
# training features.
clf = GradientBoostingClassifier()
clf.fit(X_train_encoded, y_train)

# Hard class labels for the encoded test split.
y_pred = clf.predict(X_test_encoded)

# Label-based metrics, each printed right after it is computed.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {precision}')

Recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {Recall}')

WF1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
print(f'WF1 score: {WF1_score}')

# Probability of the second class (column 1) drives log loss and the ROC curve.
y_pred_proba = clf.predict_proba(X_test_encoded)[:, 1]

log_loss_value = metrics.log_loss(y_test, y_pred_proba)
print(f'Log_Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, color='darkorange', linewidth=2)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', linewidth=2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 5-fold stratified cross-validation on the encoded training features.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))



In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (default hyper-parameters) on the encoded training
# features.
clf = LogisticRegression()
clf.fit(X_train_encoded, y_train)

# Hard class labels for the encoded test split.
y_pred = clf.predict(X_test_encoded)

# Label-based metrics, each printed right after it is computed.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {precision}')

Recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {Recall}')

WF1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
print(f'WF1 score: {WF1_score}')

# Probability of the second class (column 1) drives log loss and the ROC curve.
y_pred_proba = clf.predict_proba(X_test_encoded)[:, 1]

log_loss_value = metrics.log_loss(y_test, y_pred_proba)
print(f'Log_Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, color='darkorange', linewidth=2)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', linewidth=2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 5-fold stratified cross-validation on the encoded training features.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))



In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the encoded training features.
clf = GaussianNB()
clf.fit(X_train_encoded, y_train)

# Hard class labels for the encoded test split.
y_pred = clf.predict(X_test_encoded)

# Label-based metrics, each printed right after it is computed.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {precision}')

Recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {Recall}')

WF1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
print(f'WF1 score: {WF1_score}')

# Probability of the second class (column 1) drives log loss and the ROC curve.
y_pred_proba = clf.predict_proba(X_test_encoded)[:, 1]

log_loss_value = metrics.log_loss(y_test, y_pred_proba)
print(f'Log_Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, color='darkorange', linewidth=2)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', linewidth=2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 5-fold stratified cross-validation on the encoded training features.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))


In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (default hyper-parameters) on the encoded training features.
clf = DecisionTreeClassifier()
clf.fit(X_train_encoded, y_train)

# Hard class labels for the encoded test split.
y_pred = clf.predict(X_test_encoded)

# Label-based metrics, each printed right after it is computed.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {precision}')

Recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {Recall}')

WF1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
print(f'WF1 score: {WF1_score}')

# Probability of the second class (column 1) drives log loss and the ROC curve.
y_pred_proba = clf.predict_proba(X_test_encoded)[:, 1]

log_loss_value = metrics.log_loss(y_test, y_pred_proba)
print(f'Log_Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, color='darkorange', linewidth=2)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', linewidth=2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 5-fold stratified cross-validation on the encoded training features.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))


In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the encoded training features.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train_encoded, y_train)

# Hard class labels for the encoded test split.
y_pred = clf.predict(X_test_encoded)

# Label-based metrics, each printed right after it is computed.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {precision}')

Recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {Recall}')

WF1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
print(f'WF1 score: {WF1_score}')

# Probability of the second class (column 1) drives log loss and the ROC curve.
y_pred_proba = clf.predict_proba(X_test_encoded)[:, 1]

log_loss_value = metrics.log_loss(y_test, y_pred_proba)
print(f'Log_Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, color='darkorange', linewidth=2)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', linewidth=2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# 5-fold stratified cross-validation on the encoded training features.
stratifiedkf = StratifiedKFold(n_splits=5)
score = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from matplotlib import pyplot as plt
import numpy as np
import joblib

from sklearn.model_selection import StratifiedKFold, cross_val_score

# Define the k-fold cross-validation (shuffled, fixed seed) used for the ensemble below.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train RandomForestClassifier on the encoded features
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_encoded, y_train)
rf_pred_proba = rf_clf.predict_proba(X_test_encoded)

# Train LogisticRegression on the encoded features
lr_clf = LogisticRegression()
lr_clf.fit(X_train_encoded, y_train)
lr_pred_proba = lr_clf.predict_proba(X_test_encoded)

# Average the predicted probabilities (equal-weight soft vote of the two models).
# Both models were fitted on the same labels, so their probability columns
# follow the same class order.
avg_pred_proba = (rf_pred_proba + lr_pred_proba) / 2

# Get the final predicted class: map the winning column back to its class label
# instead of assuming that column indices and labels coincide.
avg_pred = rf_clf.classes_[np.argmax(avg_pred_proba, axis=1)]

# Evaluate the model
accuracy = metrics.accuracy_score(y_test, avg_pred)
print(f'Accuracy: {accuracy}')

precision = metrics.precision_score(y_test, avg_pred)
print(f'Precision: {precision}')

recall = metrics.recall_score(y_test, avg_pred)
print(f'Recall: {recall}')

f1_score = metrics.f1_score(y_test, avg_pred)
print(f'F1 score: {f1_score}')

log_loss_value = metrics.log_loss(y_test, avg_pred_proba)
print(f'Log Loss: {log_loss_value}')

fpr, tpr, _ = metrics.roc_curve(y_test, avg_pred_proba[:, 1])
roc_auc = metrics.auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Cross-validate the ensemble itself. The cell previously scored `clf`, a
# classifier left over from an earlier cell, so the reported numbers did not
# belong to this model. A soft-voting classifier over fresh RF + LR estimators
# averages their predicted probabilities, matching the manual averaging above.
ensemble_clf = VotingClassifier(
    estimators=[
        ('Random_Forest', RandomForestClassifier()),
        ('Logistic_Regression', LogisticRegression()),
    ],
    voting='soft',
)
score = cross_val_score(ensemble_clf, X_train_encoded, y_train, cv=kf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

# Persist the two fitted base models together with the test-set predictions
# produced by averaging them.
hybrid_model = {
    'Random_Forest': rf_clf,
    'Logistic_Regression': lr_clf,
    'averaged_predictions': avg_pred,
    'averaged_pred_proba': avg_pred_proba
}
model_filename = "eskape_ensemble_model.pkl"
joblib.dump(hybrid_model, model_filename)