In [None]:
# Importando
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the B-cell dataset and display it
data = pd.read_csv("../input/epitope-prediction/input_bcell.csv")
data

In [None]:
# Dataset size (number of rows)
data.shape[0]

In [None]:
# Count the missing values in each column of the dataset
data.isna().sum()

In [None]:
# One 40-bin histogram per numeric column, drawn without grid lines on a 20x20 figure
data.hist(grid=False, figsize=(20, 20), bins=40)

In [None]:
# Bar chart with the number of samples in each target class.
# value_counts() orders its result by frequency (most frequent first), so the
# counts are re-sorted by label before unpacking; otherwise `zero`/`um` would
# be swapped whenever class 1 outnumbers class 0.
zero, um = data['target'].value_counts().sort_index()

qtde = np.array([zero, um])
elementos = np.array(['0', '1'])

plt.bar(elementos, qtde, width=0.1)

In [None]:
# Shows the existing class imbalance
data['target'].value_counts()

In [None]:
# Column dtypes, non-null counts and memory usage
data.info()

In [None]:
# Summary statistics of the numeric columns
data.describe()

In [None]:
# Summary statistics of every column, including the non-numeric ones
data.describe(include='all')

In [None]:
def length(col):
    """Return a list with the length of each element of ``col``.

    Args:
        col: Iterable whose elements all support ``len()``.

    Returns:
        list: ``len(item)`` for every item of ``col``, in iteration order.
    """
    return [len(item) for item in col]

In [None]:
# Replace the non-numeric sequence columns that will be used by the model with
# the length of each sequence.
# The conversion is guarded by dtype so that re-running this cell is harmless:
# once a column already holds numbers, calling length() on it again would
# raise TypeError (len() of an int).
for coluna in ('peptide_seq', 'protein_seq'):
    if not pd.api.types.is_numeric_dtype(data[coluna]):
        data[coluna] = length(data[coluna])

In [None]:
# Inspect the dataset after the sequence columns were replaced by their lengths
data

In [None]:
# Frequency of each distinct value of peptide_seq (sequence lengths at this point)
data['peptide_seq'].value_counts()

___

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

In [None]:
# Columns used as model inputs
features = [
    "protein_seq",
    "peptide_seq",
    "chou_fasman",
    "emini",
    "kolaskar_tongaonkar",
    "parker",
    "isoelectric_point",
    "aromaticity",
    "hydrophobicity",
    "stability",
]

In [None]:
# Feature matrix (the columns listed in `features`) and target vector
x = data[features]
y = data['target']

In [None]:
# Preview the first rows of the feature matrix
x.head()

In [None]:
# Hold out 33% of the rows for evaluation. The target is imbalanced, so the
# split is stratified on y to keep the same class proportions in the train
# and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=1, stratify=y
)

In [None]:
# Report the shape of every part of the train/test split
for rotulo, conjunto in (
    ("Number instances in features_train dataset: ", x_train),
    ("Number instances in Output_train dataset: ", y_train),
    ("Number instances in features_test dataset: ", x_test),
    ("Number instances in Output_test dataset: ", y_test),
):
    print(rotulo, conjunto.shape)

In [None]:
# Class counts in the training split before resampling.
# The labels are reported as plain class ids: the previous "Recurrent" /
# "No-Recurrent" / "Non-Recurrent" wording did not describe this dataset's
# target column and was inconsistent between the two reports.
print("Before OverSampling, counts of class '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of class '0': {} \n".format((y_train == 0).sum()))

# Oversample with SMOTE. Only the training split is resampled; x_test/y_test
# are not passed to it.
sm = SMOTE(random_state=2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

print('After OverSampling, the shape of features_X: {}'.format(x_train_res.shape))
print('After OverSampling, the shape of Output_y: {} \n'.format(y_train_res.shape))

# Class counts after resampling
print("After OverSampling, counts of class '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of class '0': {}".format((y_train_res == 0).sum()))


In [None]:
# All three estimators get a fixed random_state so that the fitted models, and
# every metric and curve derived from them, are reproducible across kernel
# restarts.

# 1. Decision Tree Classifier
classifierTree = DecisionTreeClassifier(random_state=1)

# 2. Support Vector Classification
# probability=True is required for the predict_proba calls used by the ROC cells.
# NOTE(review): the features reach SVC unscaled (no scaler is applied in this
# notebook) - consider standardizing them before fitting; confirm the impact.
classifierSVC = SVC(probability=True, random_state=1)

# 3. Random Forest
classifierRandomForest = RandomForestClassifier(random_state=1)

In [None]:
# Train every model on the oversampled training data (same order as declared)
for modelo in (classifierTree, classifierSVC, classifierRandomForest):
    modelo.fit(x_train_res, y_train_res)

# Class predictions of each model on the untouched test split
(
    ModelPredictionsTree,
    ModelPredictionsSVC,
    ModelPredictionsRandomForest,
) = (
    modelo_treinado.predict(x_test)
    for modelo_treinado in (classifierTree, classifierSVC, classifierRandomForest)
)

In [None]:
# Accuracy, precision and recall of the decision tree on the test split
print("Decision Tree Classifier\n")
for rotulo, metrica in (
    ("Acurácia: ", accuracy_score),
    ("Precisão: ", precision_score),
    ("Recall:   ", recall_score),
):
    print(rotulo, metrica(y_test, ModelPredictionsTree))

In [None]:
# Accuracy, precision and recall of the SVC on the test split
print("Support Vector Classification\n")
for rotulo, metrica in (
    ("Acurácia: ", accuracy_score),
    ("Precisão: ", precision_score),
    ("Recall:   ", recall_score),
):
    print(rotulo, metrica(y_test, ModelPredictionsSVC))

In [None]:
# Accuracy, precision and recall of the random forest on the test split
print("Random Forest\n")
for rotulo, metrica in (
    ("Acurácia: ", accuracy_score),
    ("Precisão: ", precision_score),
    ("Recall:   ", recall_score),
):
    print(rotulo, metrica(y_test, ModelPredictionsRandomForest))

In [None]:
# ROC curves of the three models on the B-cell test split.
# For each model, column 1 of predict_proba (the second entry of classes_) is
# used as the score passed to roc_curve, and the area under the resulting
# curve is computed with auc().
ModelPredProbTree = classifierTree.predict_proba(x_test)[:, 1]
tree_fpr, tree_tpr, threshold = roc_curve(y_test, ModelPredProbTree)
auc_tree = auc(tree_fpr, tree_tpr)

ModelPredProbSVC = classifierSVC.predict_proba(x_test)[:, 1]
svc_fpr, svc_tpr, threshold = roc_curve(y_test, ModelPredProbSVC)
auc_svc = auc(svc_fpr, svc_tpr)

ModelPredProbRF = classifierRandomForest.predict_proba(x_test)[:, 1]
rf_fpr, rf_tpr, threshold = roc_curve(y_test, ModelPredProbRF)
auc_rf = auc(rf_fpr, rf_tpr)

plt.figure(figsize=(10, 10), dpi=100)

plt.plot(tree_fpr, tree_tpr, marker=".", label='Decision Tree (auc = %0.3f)' % auc_tree)
plt.plot(svc_fpr, svc_tpr, marker=".", label='Support Vector (auc = %0.3f)' % auc_svc)
plt.plot(rf_fpr, rf_tpr, marker=".", label='Random Forest (auc = %0.3f)' % auc_rf)

# Title the figure so it can be told apart from the SARS ROC plot further down
plt.title('ROC curves - B-cell test split')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend()
plt.show()

---

In [None]:
# Lendo o dataset sars
sars = pd.read_csv("../input/epitope-prediction/input_sars.csv")
sars

In [None]:
# Transformando as colunas não numéricas para serem usadas na predição
sars['peptide_seq'] = length(sars['peptide_seq'])
sars['protein_seq'] = length(sars['protein_seq'])

In [None]:
# Inspect the SARS dataset after the sequence columns were replaced by their lengths
sars

In [None]:
# SARS feature matrix (same `features` columns used for training) and SARS target vector
x_sars = sars[features]
y_sars = sars['target']

In [None]:
# Fazendo as predições do dataset sars com os modelos treinados com o dataset bcell
sars_ModelPredictionsTree = classifierTree.predict(x_sars)
sars_ModelPredictionsSVC = classifierSVC.predict(x_sars)
sars_ModelPredictionsRandomForest = classifierRandomForest.predict(x_sars)

In [None]:
# Accuracy, precision and recall of the decision tree on the SARS dataset
print("Decision Tree Classifier\n")
for rotulo, metrica in (
    ("Acurácia: ", accuracy_score),
    ("Precisão: ", precision_score),
    ("Recall:   ", recall_score),
):
    print(rotulo, metrica(y_sars, sars_ModelPredictionsTree))

In [None]:
# Accuracy, precision and recall of the SVC on the SARS dataset
print("Support Vector Classification\n")
for rotulo, metrica in (
    ("Acurácia: ", accuracy_score),
    ("Precisão: ", precision_score),
    ("Recall:   ", recall_score),
):
    print(rotulo, metrica(y_sars, sars_ModelPredictionsSVC))

In [None]:
# Accuracy, precision and recall of the random forest on the SARS dataset
print("Random Forest\n")
for rotulo, metrica in (
    ("Acurácia: ", accuracy_score),
    ("Precisão: ", precision_score),
    ("Recall:   ", recall_score),
):
    print(rotulo, metrica(y_sars, sars_ModelPredictionsRandomForest))

In [None]:
# ROC AUC of the random forest on the SARS dataset, scored with column 1 of
# predict_proba.
# NOTE(review): despite the `lr_` prefix, these values come from
# classifierRandomForest.
lr_probs = classifierRandomForest.predict_proba(x_sars)[:, 1]
lr_auc = roc_auc_score(y_sars, lr_probs)

print('ROC AUC = %.3f' % lr_auc)  # Printed for comparison purposes

In [None]:
# ROC curves of the three models on the SARS dataset.
# For each model, column 1 of predict_proba (the second entry of classes_) is
# used as the score passed to roc_curve, and the area under the resulting
# curve is computed with auc().
# Note: tree_fpr/tree_tpr/auc_tree (and the svc_/rf_ equivalents) are
# reassigned here, replacing the values computed for the B-cell test split.
sars_ModelPredProbTree = classifierTree.predict_proba(x_sars)[:, 1]
tree_fpr, tree_tpr, threshold = roc_curve(y_sars, sars_ModelPredProbTree)
auc_tree = auc(tree_fpr, tree_tpr)

sars_ModelPredProbSVC = classifierSVC.predict_proba(x_sars)[:, 1]
svc_fpr, svc_tpr, threshold = roc_curve(y_sars, sars_ModelPredProbSVC)
auc_svc = auc(svc_fpr, svc_tpr)

sars_ModelPredProbRF = classifierRandomForest.predict_proba(x_sars)[:, 1]
rf_fpr, rf_tpr, threshold = roc_curve(y_sars, sars_ModelPredProbRF)
auc_rf = auc(rf_fpr, rf_tpr)

plt.figure(figsize=(10, 10), dpi=100)

plt.plot(tree_fpr, tree_tpr, marker=".", label='Decision Tree (auc = %0.3f)' % auc_tree)
plt.plot(svc_fpr, svc_tpr, marker=".", label='Support Vector (auc = %0.3f)' % auc_svc)
plt.plot(rf_fpr, rf_tpr, marker=".", label='Random Forest (auc = %0.3f)' % auc_rf)

# Title the figure so it can be told apart from the B-cell ROC plot above
plt.title('ROC curves - SARS dataset (models trained on B-cell data)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend()
plt.show()

---