In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two training tables from the Kaggle input directory:
# `b` holds input_bcell.csv and `s` holds input_sars.csv.
b=pd.read_csv('/kaggle/input/epitope-prediction/input_bcell.csv')
s=pd.read_csv('/kaggle/input/epitope-prediction/input_sars.csv')

In [None]:
# Stack the B-cell and SARS rows into one training frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# source frames keep their own default row labels, so labels would repeat
# in the combined frame.
c=pd.concat([b,s],ignore_index=True)
c.head()

In [None]:
# Print the column dtypes and non-null counts of the combined frame.
c.info()

**INSIGHTS**-

* 'parent_protein_id', 'protein_seq' and 'peptide_seq' are of object type because they contain character strings rather than numbers.
* Rest of the features are of float type.
* No categorical feature is present in the dataset.
* Target feature is binary i.e. containing only 0 and 1.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
c.describe()

**INSIGHTS-**

* The large gap between the 75th percentile and the maximum of 'start_position', 'end_position' and 'emini' suggests that these features may contain outliers.
* The minimum values of 'parker' and 'hydrophobicity' are negative.
* For 'hydrophobicity', the mean and the 25th, 50th and 75th percentiles are all negative.

In [None]:
# include='all' extends the summary to the object columns as well, which are
# described by count / unique / top / freq instead of numeric statistics.
c.describe(include='all')

**INSIGHTS-**
* 'parent_protein_id' has 761 unique values; the most common one occurs 560 times.
* 'protein_seq' has 758 unique values, and its most common value occurs the same number of times as that of 'parent_protein_id'.
* 'peptide_seq' has the largest number of unique values.
* Since 'parent_protein_id' and 'protein_seq' have almost the same number of unique values, the two columns carry largely the same information.

In [None]:
# Class balance of the target, expressed as a percentage of all rows.
c['target'].value_counts().div(len(c)).mul(100)

In [None]:
# Peptide length = end - start + 1 (the +1 counts both end points).
c['peptide_length'] = c['end_position'].sub(c['start_position']).add(1)

In [None]:
#function to convert every entry of a column into its length
def length(col):
    """Return the length of every element of ``col``.

    Parameters
    ----------
    col : pandas.Series or iterable of sized objects
        Values to measure, e.g. a column of sequence strings.

    Returns
    -------
    pandas.Series or list of int
        For a Series: a Series on the same index holding each element's
        length (missing values stay NaN). For any other iterable: a list
        with one length per element, empty when ``col`` is empty.
    """
    if isinstance(col, pd.Series):
        # Vectorised and index-preserving, so the result can be assigned
        # straight back to a DataFrame column row by row.
        return col.str.len()
    # Measure each element; returning inside a loop would stop at the first one.
    return [len(item) for item in col]

In [None]:
#converting all the three object type features
# Overwrite the 'parent_protein_id' column with whatever length() returns for it,
# so the column no longer holds the raw ID strings.
c['parent_protein_id']=length(c['parent_protein_id'])

In [None]:
# Overwrite the 'protein_seq' column with the value returned by length(),
# replacing the raw protein sequence strings.
c['protein_seq']=length(c['protein_seq'])

In [None]:
# Overwrite the 'peptide_seq' column with the value returned by length(),
# replacing the raw peptide sequence strings.
c['peptide_seq']=length(c['peptide_seq'])

In [None]:
# Feature matrix (every column except 'target') and label vector for the
# feature-importance model.
x, y = c.drop(columns='target'), c['target']

In [None]:
#feature importance
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees ensemble on all features; random_state fixes the result.
r = ExtraTreesClassifier(random_state=0)
r.fit(x,y)
feature_importance = r.feature_importances_
# Rescale the ensemble importances so they sum to 1. The guard avoids a
# division by zero if every importance happens to be 0.
importance_total = feature_importance.sum()
feature_importance_normalized = (feature_importance / importance_total
                                 if importance_total > 0 else feature_importance)
# Spread of each feature's importance across the individual trees. This is a
# measure of variability, not an importance, so it is kept under its own name.
feature_importance_std = np.std([tree.feature_importances_ for tree in
                                 r.estimators_],
                                axis = 0)

In [None]:
#importing libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
plt.figure(figsize=(10,10))
# Horizontal bars: importance values run along the x-axis and there is one
# feature name per bar on the y-axis. x/y are passed by keyword because
# current seaborn releases no longer accept them positionally.
sns.barplot(x=feature_importance_normalized,y=list(x.columns))
# Axis labels follow the orientation above (values on x, names on y).
plt.xlabel('Feature Importances')
plt.ylabel('Feature Labels')
plt.title('Comparison of different Feature Importances')
plt.show()

According to the ExtraTreesClassifier, the protein features **'hydrophobicity'**, **'aromaticity'** and **'isoelectric_point'** convey the most information about the target, which makes them the most important features in the dataset.

In [None]:
# Remove the ID / sequence columns so that only the remaining columns
# are carried forward into the following cells.
c = c.drop(columns=['parent_protein_id', 'protein_seq', 'peptide_seq'])

In [None]:
# Preview the first rows of the frame after the ID / sequence columns were dropped.
c.head()

In [None]:
# Share of rows (in percent) for each distinct peptide length.
c['peptide_length'].value_counts().div(len(c)).mul(100)

Most peptides have a length of 15 (32.3%), 10 (25.1%) or 8 (14.4%).

In [None]:
# Numeric features whose distributions are plotted.
features=["chou_fasman","emini","kolaskar_tongaonkar","parker","peptide_length","isoelectric_point","aromaticity",
            "hydrophobicity","stability"]
# Size the grid from the number of features (3 columns -> 3x3 for nine
# features) so that no space is wasted on empty subplot slots.
n_cols=3
n_rows=-(-len(features)//n_cols)  # ceiling division
fig,axes=plt.subplots(n_rows,n_cols,figsize=(20,20),squeeze=False)
fig.subplots_adjust(hspace=0.4)
for ax,name in zip(axes.ravel(),features):
    # histplot with a KDE overlay is the replacement for the deprecated
    # sns.distplot. Plain values are passed so the frame's index plays no role.
    sns.histplot(x=c[name].to_numpy(),kde=True,stat='density',ax=ax)
    ax.set_xlabel(name)
# Hide any grid cells left over when the feature count is not a multiple of n_cols.
for ax in axes.ravel()[len(features):]:
    ax.set_visible(False)
plt.show()

**INSIGHTS**-
* 'emini' and 'peptide_length' show right-skewed distributions.
* 'isoelectric_point', 'aromaticity', 'hydrophobicity' and 'stability' are not perfectly normal and contain outliers.
* 'chou_fasman', 'kolaskar_tongaonkar' and 'parker' show near-normal distributions.

## MODEL BUILDING

In [None]:
# Model inputs (every column except 'target') and the label vector.
X, Y = c.drop(columns='target'), c['target']

In [None]:
#train and test
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# Hold out 20% of the rows for validation; stratifying on Y keeps the class
# ratio the same in both parts, and random_state makes the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler that maps each feature to the [0, 1] range seen in the training split.
d=MinMaxScaler()
# NOTE(review): the scaled array returned here is only displayed, never
# assigned, so X_train itself stays unscaled for the cells that use it later;
# the lasting effect of this call is that `d` is fitted on X_train.
# MinMaxScaler ignores the Y_train argument.
d.fit_transform(X_train,Y_train)

In [None]:
# NOTE(review): the transformed array is displayed but not stored, so X_valid
# remains unscaled for the cells that use it afterwards.
d.transform(X_valid)

In [None]:
#fitting the LightGBM model
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# Search space for the randomized hyper-parameter search: entries that are
# frozen scipy distributions are sampled from, plain lists are chosen from.
params = dict(
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(100, 500),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
)

In [None]:
# Base LightGBM classifier with default settings; it is the estimator handed
# to the randomized search in the next cell.
l=LGBMClassifier()

In [None]:
# Randomized hyper-parameter search over `params`: 5-fold cross-validation
# scored by ROC AUC, run on all available cores (n_jobs=-1).
# random_state pins the sampled candidates so that a re-run repeats the same
# search, in line with the random_state=0 used for the split and the extra-trees model.
w=RandomizedSearchCV(l,param_distributions=params,n_jobs=-1,cv=5,scoring='roc_auc',random_state=0)
w.fit(X_train,Y_train)

In [None]:
# Hard class predictions of the fitted search object for the validation split.
lg_pred=w.predict(X_valid)
lg_pred

In [None]:
# ROC AUC on the validation split. roc_auc_score expects (y_true, y_score),
# and the score passed is the predicted probability of the positive class so
# that the ranking quality is measured rather than a single 0/1 threshold.
roc_auc_score(Y_valid, w.predict_proba(X_valid)[:, 1])

The ROC AUC score is 0.872 on the training data and 0.834 on the validation data; the small gap between the two shows that the model generalizes well to the validation data.

In [None]:
# Hard class predictions of the fitted search object for the training split,
# kept for comparison with the validation predictions.
lg_train=w.predict(X_train)

In [None]:
# ROC AUC on the training split. roc_auc_score expects (y_true, y_score),
# and the score passed is the predicted probability of the positive class.
roc_auc_score(Y_train, w.predict_proba(X_train)[:, 1])

In [None]:
#predictions of validation dataset
# Wrap the validation predictions in a one-column frame for inspection.
predictions = pd.DataFrame({'validation_pred': lg_pred})
predictions.head()

In [None]:
# Percentage of validation rows per predicted class. The denominator is the
# number of validation predictions; dividing by len(c), the full dataset,
# would make the percentages add up to far less than 100.
predictions.value_counts()/len(predictions)*100

In [None]:
#predicting on covid dataset
# Load input_covid.csv from the Kaggle input directory and preview its first rows.
co=pd.read_csv('/kaggle/input/epitope-prediction/input_covid.csv')
co.head()

In [None]:
# Print the column dtypes and non-null counts of the covid frame.
co.info()

In [None]:
# Remove the same ID / sequence columns that were dropped from the training frame.
co = co.drop(columns=['parent_protein_id', 'protein_seq', 'peptide_seq'])

In [None]:
# Preview the covid frame after the ID / sequence columns were dropped.
co.head()

In [None]:
# Count the missing values in each column of the covid frame.
co.isna().sum()

In [None]:
# Peptide length = end - start + 1. The column must be called 'peptide_length',
# the name used in the training frame, so that the covid frame's feature names
# match what the scaler and the model were fitted on.
co['peptide_length']=co['end_position']-co['start_position'] + 1

In [None]:
# NOTE(review): the transformed array is displayed but not stored, so `co`
# is still unscaled when it is passed to the model in the next cell.
d.transform(co)

In [None]:
# Hard class predictions of the fitted search object for the covid frame.
y_pred=w.predict(co)
y_pred

In [None]:
# Wrap the covid predictions in a one-column frame for inspection.
y_pred = pd.DataFrame({'test_pred': y_pred})
y_pred.head()

In [None]:
# Percentage of covid rows per predicted class.
y_pred.value_counts().div(len(co)).mul(100)

The test predictions show that the antibody valence is predicted to be negative for around 56.3% of the samples and positive for around 43.6%, which means that the majority of antibodies will resist binding of a virus like SARS-CoV-2, which will reduce the number of cases.

**Task 1 prediction with only B-cell data was 55.6% negative cases and 44.3% positive cases.**

**Task 2 with both B-cell and SARS data has predictions of 52.8% negative cases and 47.1% positive cases.**

**If you like this notebook do upvote it.**

Do provide your valuable feedback.

Do check out my other notebooks at https://www.kaggle.com/tmchls