In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the B-cell epitope CSV from the Kaggle input directory into a DataFrame
# and preview its first rows.
bcell=pd.read_csv('/kaggle/input/epitope-prediction/input_bcell.csv')
bcell.head()

In [None]:
# Column names, dtypes and non-null counts.
bcell.info()

In [None]:
# Summary statistics, transposed so that each column of the data becomes a row.
bcell.describe().T

In [None]:
# Number of missing values per column.
bcell.isna().sum()

In [None]:
# Print the number of distinct (non-null) values in every column.
for column, n_unique in bcell.nunique().items():
    print(f'{column}:{n_unique}')

In [None]:
# Tally the rows per target class and chart the tallies.
count = (
    bcell['target']
    .value_counts()
    .rename_axis('target')
    .reset_index(name='count')
)
px.bar(count, x='target', y='count')

In [None]:
# The bar chart above shows that the target classes are imbalanced.

In [None]:

# Bar chart of the 20 most frequent parent_protein_id values.
count = (
    bcell['parent_protein_id']
    .value_counts()
    .head(20)
    .rename_axis('parent_protein_id')
    .reset_index(name='count')
)
px.bar(count, x='parent_protein_id', y='count')

In [None]:

# Bar chart of the 20 most frequent protein_seq values.
count = (
    bcell['protein_seq']
    .value_counts()
    .head(20)
    .rename_axis('protein_seq')
    .reset_index(name='count')
)
px.bar(count, x='protein_seq', y='count')

In [None]:
# Bar chart of the 20 most frequent peptide_seq values.
count = (
    bcell['peptide_seq']
    .value_counts()
    .head(20)
    .rename_axis('peptide_seq')
    .reset_index(name='count')
)
px.bar(count, x='peptide_seq', y='count')

In [None]:
# Names of every column whose dtype is not object (i.e. the non-text columns).
num_vars = [name for name, dtype in bcell.dtypes.items() if dtype != 'O']
num_vars

In [None]:
# One KDE panel per column in num_vars, laid out on a two-column grid.
# The row count is derived from len(num_vars) instead of being fixed at 6, so the
# cell no longer raises IndexError when there are more than 12 columns.
n_cols = 2
n_rows = max(1, -(-len(num_vars) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 20), squeeze=False)
panels = axes.ravel()
for ax, column in zip(panels, num_vars):
    sns.kdeplot(bcell[column], ax=ax)
# Hide grid slots that have no column to show.
for ax in panels[len(num_vars):]:
    ax.set_visible(False)
plt.show()

In [None]:
# Histogram of each column in num_vars, with bars coloured by target class.
for column in num_vars:
    px.histogram(bcell, x=column, color='target').show()

In [None]:
# Box plot of each column in num_vars, with one box per target class.
for column in num_vars:
    px.box(bcell, y=column, color='target').show()

In [None]:
# Preview the frame before columns are dropped.
bcell.head()

In [None]:
# Drop the parent_protein_id and protein_seq columns.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; errors='ignore' keeps the cell re-runnable once the columns are gone.
bcell = bcell.drop(columns=['parent_protein_id', 'protein_seq'], errors='ignore')

In [None]:
# Preview the frame again after the drop in the previous cell.
bcell.head()

In [None]:
# Drop the peptide_seq column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; errors='ignore' keeps the cell re-runnable once the column is gone.
bcell = bcell.drop(columns='peptide_seq', errors='ignore')

In [None]:
# Scatter every listed feature against isoelectric_point, coloured by target.
# One loop replaces nine copy-pasted single-plot cells; the explicit .show() is
# required because only a cell's last bare expression is rendered automatically.
scatter_features = [
    'start_position',
    'end_position',
    'chou_fasman',
    'emini',
    'kolaskar_tongaonkar',
    'parker',
    'aromaticity',
    'hydrophobicity',
    'stability',
]
for feature in scatter_features:
    px.scatter(
        bcell,
        x=feature,
        y='isoelectric_point',
        color='target',
        title=f'isoelectric_point vs {feature}',
    ).show()

In [None]:
# Preview the frame that the modelling cells below start from.
bcell.head()

In [None]:
# Separate the label column (y) from the feature columns (X).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
y = bcell['target']
X = bcell.drop(columns='target')

In [None]:
## train/test split, scaling and oversampling
#
# Order matters here. The test split is held out FIRST, then the scaler and SMOTE
# are fitted on the training split only. Oversampling or scaling before the split
# leaks information from the test rows into training and fills the test set with
# synthetic samples, which inflates every reported metric.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Stratify so that both splits keep the original class ratio.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Oversample the minority class in the training split only.
# `fit_resample` replaces `fit_sample`, which newer imbalanced-learn releases removed.
# The sampler is not named `os`, so the standard-library `os` module stays usable.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_scaled, y_train_raw)

# The model cells below read X_train / y_train / X_test / y_test.
X_train, y_train = X_res, y_res

In [None]:
# logistic regression

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Hold-out model: fitted on the prepared training split, evaluated on X_test below.
m1 = LogisticRegression()
m1.fit(X_train, y_train)
pred_m1 = m1.predict(X_test)

# Cross-validate on the raw X, y with scaling and oversampling done inside each
# fold. Passing the bare estimator would score it on unscaled, un-resampled data
# that does not match how m1 was trained.
cv_m1 = make_pipeline(StandardScaler(), SMOTE(random_state=42), LogisticRegression())
score_m1 = cross_val_score(cv_m1, X, y, cv=5)
print(score_m1)

In [None]:
# Per-class precision, recall and F1 for the logistic regression predictions.
report_m1 = classification_report(y_true=y_test, y_pred=pred_m1)
print(report_m1)

In [None]:
# decisiontree

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Hold-out model: fitted on the prepared training split, evaluated on X_test below.
# random_state is fixed so that a re-run reproduces the same tree.
m2 = DecisionTreeClassifier(random_state=42)
m2.fit(X_train, y_train)
pred_m2 = m2.predict(X_test)

# Cross-validate on the raw X, y with scaling and oversampling done inside each
# fold, so the CV scores use the same preprocessing as the fitted model.
cv_m2 = make_pipeline(
    StandardScaler(), SMOTE(random_state=42), DecisionTreeClassifier(random_state=42)
)
score_m2 = cross_val_score(cv_m2, X, y, cv=5)
print(score_m2)

In [None]:
# Per-class precision, recall and F1 for the decision tree predictions.
report_m2 = classification_report(y_true=y_test, y_pred=pred_m2)
print(report_m2)

In [None]:
# randomforest

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Hold-out model: fitted on the prepared training split, evaluated on X_test below.
# random_state is fixed so that a re-run reproduces the same forest.
m3 = RandomForestClassifier(random_state=42)
m3.fit(X_train, y_train)
pred_m3 = m3.predict(X_test)

# Cross-validate on the raw X, y with scaling and oversampling done inside each
# fold, so the CV scores use the same preprocessing as the fitted model.
cv_m3 = make_pipeline(
    StandardScaler(), SMOTE(random_state=42), RandomForestClassifier(random_state=42)
)
score_m3 = cross_val_score(cv_m3, X, y, cv=5)
print(score_m3)

In [None]:
# Per-class precision, recall and F1 for the random forest predictions.
report_m3 = classification_report(y_true=y_test, y_pred=pred_m3)
print(report_m3)

In [None]:
# Confusion matrix of the random forest on the test split.
cm = confusion_matrix(y_test, pred_m3)
# fmt='d' prints the cell counts as plain integers; seaborn's default format
# switches to scientific notation for larger counts.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Random forest confusion matrix')
plt.show()

In [None]:
### roc_auc score

# auc curve
from sklearn.metrics import auc, roc_curve

# Feed roc_curve the predicted probability of the positive class rather than the
# hard 0/1 predictions. Hard labels give a curve with a single operating point
# and an AUC that is not comparable to a real ROC AUC.
# NOTE(review): column 1 of predict_proba is taken as the positive class; this
# assumes a binary 0/1 target -- confirm against m3.classes_.
proba_m3 = m3.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, proba_m3)
score = auc(fpr, tpr)
print(score)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % score)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
plt.legend(loc='lower right')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


In [None]:
# naivebayes

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Hold-out model: fitted on the prepared training split, evaluated on X_test below.
m4 = GaussianNB()
m4.fit(X_train, y_train)
pred_m4 = m4.predict(X_test)

# Cross-validate on the raw X, y with scaling and oversampling done inside each
# fold, so the CV scores use the same preprocessing as the fitted model.
cv_m4 = make_pipeline(StandardScaler(), SMOTE(random_state=42), GaussianNB())
score_m4 = cross_val_score(cv_m4, X, y, cv=5)
print(score_m4)

In [None]:
# Per-class precision, recall and F1 for the naive Bayes predictions.
report_m4 = classification_report(y_true=y_test, y_pred=pred_m4)
print(report_m4)

In [None]:
# xgboost

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Hold-out model: fitted on the prepared training split, evaluated on X_test below.
# random_state is fixed so that a re-run reproduces the same booster.
m5 = XGBClassifier(random_state=42)
m5.fit(X_train, y_train)
pred_m5 = m5.predict(X_test)

# Cross-validate on the raw X, y with scaling and oversampling done inside each
# fold, so the CV scores use the same preprocessing as the fitted model.
cv_m5 = make_pipeline(
    StandardScaler(), SMOTE(random_state=42), XGBClassifier(random_state=42)
)
score_m5 = cross_val_score(cv_m5, X, y, cv=5)
print(score_m5)

In [None]:
# Per-class precision, recall and F1 for the XGBoost predictions.
report_m5 = classification_report(y_true=y_test, y_pred=pred_m5)
print(report_m5)

In [None]:
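# Conclusion: the random forest reached a ROC AUC of about 0.9 on the test split.
# The exact figure is the value printed by the ROC cell above and can vary between runs.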