In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#reading the dataset for task 1
# Load the B-cell CSV into the working frame `b` and preview its first rows.
b=pd.read_csv('/kaggle/input/epitope-prediction/input_bcell.csv')
b.head()

In [None]:
# Show the column labels of the training frame.
b.columns

In [None]:
# Count the missing values in each column.
b.isnull().sum()

There are no missing values in the dataset.

In [None]:
# Share of each target class, expressed as a percentage of all rows in b.
b['target'].value_counts()/len(b)*100.0

About 72.8% of the target values (i.e. the antibody valence) in this dataset are negative and about 27.1% are positive, so the two classes are imbalanced. It means that most of the antibodies can resist binding with the SARS-CoV virus.

In [None]:
# Print the dtype and non-null count of every column.
b.info()

**INSIGHTS-**
* 'parent_protein_id','protein_seq',and 'peptide_seq' are of object type as they contain characters not numbers.
*  Rest of the features are of float type.
*  No categorical feature is present in the dataset.
*  Target feature is binary i.e. containing only 0 and 1.


In [None]:
#for statistical analysis of continuous variables
# With default arguments describe() summarises the numeric columns only.
b.describe()

**INSIGHTS-**
* By looking at the huge difference between 75% quartile value and maximum value of 'start_position','emini' and 'end_position' features there can be a possibility of outliers in these features.
* Minimum values in 'parker' and 'hydrophobicity' features are negative.
* Negative mean,25th,50th and 75th quartiles in 'hydrophobicity' feature.

In [None]:
#for statistical analysis of object variables
# include='all' summarises every column, numeric as well as object; the object
# columns are the ones that get the unique/top/freq rows.
b.describe(include='all')

**INSIGHTS-**
* 'parent_protein_id' has 760 unique values; its most frequent value occurs 560 times.
* 'protein_seq' has 757 unique values, and its most frequent value occurs the same number of times as that of 'parent_protein_id'.
* 'peptide_seq' has the largest number of unique values.
*  'parent_protein_id' and 'protein_seq' have almost the same number of distinct values, so they carry largely the same information.

In [None]:
#to calculate peptide length
# Inclusive span of the peptide: (end - start) + 1, computed row by row.
b['peptide_length'] = b['end_position'].sub(b['start_position']).add(1)

In [None]:
# Preview the frame again; it now carries the derived 'peptide_length' column.
b.head()

In [None]:
#function to convert characters into their lengths
def length(col):
    """Return the length of every element of ``col``.

    Parameters
    ----------
    col : iterable of sized objects
        For example a pandas Series (or a list) of strings.

    Returns
    -------
    list of int
        ``len(item)`` for each item, in iteration order. An empty ``col``
        gives an empty list.
    """
    # Collect one length per element. A ``return`` placed inside a loop would
    # exit on the first element and yield a single scalar for the whole column.
    return [len(item) for item in col]

In [None]:
#converting all the three object type features
# Vectorised per-row string length, so each row carries the length of its own
# identifier rather than one value shared by the whole column.
b['parent_protein_id']=b['parent_protein_id'].str.len()

In [None]:
# Vectorised per-row string length, so each row carries the length of its own
# protein sequence rather than one value shared by the whole column.
b['protein_seq']=b['protein_seq'].str.len()

In [None]:
# Vectorised per-row string length, so each row carries the length of its own
# peptide sequence rather than one value shared by the whole column.
b['peptide_seq']=b['peptide_seq'].str.len()

In [None]:
# Separate the label from the predictors used for the feature-importance fit.
y = b['target']
x = b.drop('target', axis=1)

In [None]:
#feature importance
from sklearn.ensemble import ExtraTreesClassifier
# Fit a randomised-tree ensemble on every column of x to rank the features.
r = ExtraTreesClassifier(random_state=0)
r.fit(x,y)
# Impurity-based importance of each feature, aggregated over the ensemble.
feature_importance = r.feature_importances_
# Rescale so the importances sum to 1. The standard deviation of the per-tree
# importances (np.std over r.estimators_) measures how much the trees disagree;
# it is an error-bar quantity, not an importance, so it is not used here.
importance_total = feature_importance.sum()
feature_importance_normalized = (feature_importance / importance_total
                                 if importance_total else feature_importance)

In [None]:
#importing libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#dropping unnecessary columns
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
b.drop(columns=['parent_protein_id','protein_seq','peptide_seq'],inplace=True,errors='ignore')

# Horizontal bar chart: importance values on the x-axis, feature names on the
# y-axis. x and y are passed by keyword because recent seaborn releases no
# longer accept them positionally.
plt.figure(figsize=(10,10))
sns.barplot(x=feature_importance_normalized,y=list(x.columns))
plt.xlabel('Feature Importances')
plt.ylabel('Feature Labels')
plt.title('Comparison of different Feature Importances')
plt.show()

According to the ExtraTreesClassifier, the protein features **'stability'**, **'hydrophobicity'**, **'aromaticity'** and **'isoelectric_point'** convey the most information about the target feature, which makes them the most important features in the dataset.

In [None]:
# Preview the working frame after the identifier/sequence columns were dropped from b.
b.head()

In [None]:
# Percentage of rows for each distinct peptide length.
b['peptide_length'].value_counts()/len(b)*100

Most of the peptides are of length 15(32%),10(26%) and 8(15%) respectively.


## EXPLORATORY DATA ANALYSIS

In [None]:
# Numeric features whose distributions are drawn below.
features = [
    "chou_fasman",
    "emini",
    "kolaskar_tongaonkar",
    "parker",
    "peptide_length",
    "isoelectric_point",
    "aromaticity",
    "hydrophobicity",
    "stability",
]

# One panel per feature on a 4x5 grid; hspace leaves vertical room between rows.
plt.figure(figsize=(20,20))
plt.subplots_adjust(hspace=2.0)
for panel, feature_name in enumerate(features, start=1):
    plt.subplot(4, 5, panel)
    sns.distplot(b[feature_name])

**INSIGHTS**-
* 'Emini' feature shows right skewed distribution.
* 'peptide_length','isoelectric_point','aromaticity','hydrophobicity','stability' are not perfectly normal and contains outliers.
* 'chou_fasman','kolaskar_tongaonkar','parker' shows near-to-perfect normal distribution.

In [None]:
sns.set_style('whitegrid')
# Isoelectric point against peptide length, one panel per target class.
# catplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it (that would only emit an extra blank figure).
sns.catplot(y='isoelectric_point',x='peptide_length',data=b,ci=None,col='target',sharey=False)

**INSIGHTS-**
* For the negative target, peptides of length 8, 10 and 15 are strongly concentrated at isoelectric-point values above 7.
* In the other figure (positive target), most peptide lengths have the greater part of their values below 7, i.e. a low isoelectric point.

In [None]:
# Aromaticity against peptide length, one panel per target class.
# catplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it (that would only emit an extra blank figure).
sns.catplot(y='aromaticity',x='peptide_length',data=b,ci=None,col='target',sharey=False)

**INSIGHTS-**
* Most of the peptides have range of aromaticity within range of 0.05 to 0.10 at negative target.
* Most of the peptides have range of aromaticity within range of 0.04 to 0.12 at positive target.

In [None]:
# Hydrophobicity against peptide length, one panel per target class.
# catplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it (that would only emit an extra blank figure).
sns.catplot(y='hydrophobicity',x='peptide_length',data=b,ci=None,col='target',sharey=False)

Here, more of the peptides showing a strong hydrophobicity property belong to the positive target than to the negative target.

In [None]:
# Stability against peptide length, one panel per target class.
# catplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it (that would only emit an extra blank figure).
sns.catplot(y='stability',x='peptide_length',data=b,ci=None,col='target',sharey=False)

**INSIGHTS-**
* Most of the peptides show stability within range 20 to 60 at positive target.
* Most of the peptides show stability within range 20 to 80 at negative target

In [None]:
# NOTE(review): x was built from b before the identifier/sequence columns were
# dropped, so this preview still shows those three columns; the model below is
# fitted on X (built from the reduced b), not on x.
x.head()

## MODEL BUILDING

In [None]:
# Label vector and feature matrix for modelling, taken from the reduced frame b.
Y = b['target']
X = b.drop(columns=['target'])

In [None]:
#train and test
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# Hold out 20% of the rows for validation; stratifying on Y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler fitted on the training features only.
d=MinMaxScaler()
# NOTE(review): the scaled array returned here is only displayed, never assigned,
# so X_train itself stays unscaled and the model cell below is fitted on the raw
# values. Y_train is passed as the second argument; presumably the scaler ignores
# it — TODO confirm intent.
d.fit_transform(X_train,Y_train)

In [None]:
# NOTE(review): the transformed validation array is displayed but not stored;
# X_valid is left unchanged.
d.transform(X_valid)

In [None]:
#fitting the lightbgm model 
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Train a LightGBM classifier (only the seed is set explicitly). fit() returns
# the estimator itself, so the fitted model is bound to `l` and then echoed as
# the cell output.
l = LGBMClassifier(random_state=10).fit(X_train, Y_train)
l

In [None]:
# Class predictions of the fitted model on the training rows.
lg_train=l.predict(X_train)

In [None]:
# Training ROC AUC. roc_auc_score expects (y_true, y_score): the ground truth
# comes first. The score is the predicted probability of the positive class, so
# the metric reflects ranking quality rather than one fixed decision threshold.
roc_auc_score(Y_train,l.predict_proba(X_train)[:,1])

In [None]:
# Class predictions of the fitted model on the held-out validation rows.
lg_pred=l.predict(X_valid)
lg_pred

In [None]:
# Validation ROC AUC. roc_auc_score expects (y_true, y_score): the ground truth
# comes first. The score is the predicted probability of the positive class, so
# the metric reflects ranking quality rather than one fixed decision threshold.
roc_auc_score(Y_valid,l.predict_proba(X_valid)[:,1])

The model reached a training ROC AUC score of 0.9022, whereas the validation score is 0.8420, which means the model has fitted pretty well and has made good predictions on the validation set.

In [None]:
#predictions of validation dataset
# Wrap the validation predictions in a one-column frame for display.
predictions = pd.DataFrame({'validation_pred': lg_pred})
predictions.head()

In [None]:
#predicting on covid dataset
# Load the COVID CSV into the frame `c` and preview its first rows.
c=pd.read_csv('/kaggle/input/epitope-prediction/input_covid.csv')
c.head()

In [None]:
# Print the dtype and non-null count of every column of the COVID frame.
c.info()

In [None]:
# Remove the identifier/sequence columns, which were also dropped from the
# training frame before modelling. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the columns are already gone.
c.drop(columns=['parent_protein_id','protein_seq','peptide_seq'],inplace=True,errors='ignore')

In [None]:
# Preview the COVID frame after the column drop.
c.head()

In [None]:
# Count the missing values in each column of the COVID frame.
c.isnull().sum()

In [None]:
# Inclusive peptide length. The column is named 'peptide_length', exactly as in
# the training frame, so the fitted scaler and model receive the same feature
# names they were fitted with.
c['peptide_length']=c['end_position']-c['start_position'] + 1

In [None]:
# NOTE(review): the transformed array is displayed but not stored; c is left
# unchanged, so the prediction cell below receives the untransformed frame.
d.transform(c)

In [None]:
# Class predictions of the fitted model for the COVID rows, using c as it stands.
y_pred=l.predict(c)
y_pred

In [None]:
# Wrap the predictions in a one-column frame. Note that this rebinds y_pred
# from the value returned by l.predict to a DataFrame.
y_pred=pd.DataFrame(y_pred,columns=['test_pred'])
y_pred.head()

In [None]:
# Percentage of COVID rows assigned to each predicted class.
y_pred.value_counts()/len(c)*100

The test predictions show that the antibody valence is predicted to be negative for around 55.6% of the peptides and positive for around 44.3%, which means that the majority of antibodies will resist binding of a virus like SARS-CoV, which would reduce the number of cases.

**If you like this notebook do upvote it.**

Do provide your valuable feedback.

Do checkout my other notebooks at https://www.kaggle.com/tmchls