# Introduction
Sample code for Tasks 1 and 2 is shown below.

# Preparation

## Load libraries

In [None]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.decomposition import PCA
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

## Load datasets and Quick Look

In [None]:
# Directory that holds the three input CSV files.
INPUT_DIR = '../input/epitope-prediction'

# One frame per input file.
bcell = pd.read_csv(f'{INPUT_DIR}/input_bcell.csv')
sars = pd.read_csv(f'{INPUT_DIR}/input_sars.csv')
covid = pd.read_csv(f'{INPUT_DIR}/input_covid.csv')

# Stack the B-cell and SARS rows into a single frame with a fresh 0..n-1 index
# (row-wise concatenation is the pandas default).
bcell_sars = pd.concat((bcell, sars), ignore_index=True)
bcell_sars.head()

In [None]:
# Number of missing values per column.
# Uses the DataFrame method directly: np.sum(DataFrame) forwards axis=None to
# DataFrame.sum, and pandas has deprecated the per-column result for that case.
bcell_sars.isnull().sum()

 We find:

<!-- - There is little duplication in column `peptide_seq`, but are in `parent_protein_id` and `protein_seq`. -->

- Each row contains information about the parent protein, the candidate peptide, and the antibody titer.
- The protein ID is linked to the protein sequence.
- The length of the peptide sequence differs from one peptide to another.
- There are no missing values in any column.
- The target value is binary: 0 or 1.

This data set has too many columns to display them all:

```
# tuple(map(len, [bcell_sars.columns, bcell_sars]))
# (columns, rows)
# (14, 14907)
```

This data also contains sequence information for both peptides and proteins. The numbers of unique sequences are:

```
# bcell_sars.peptide_seq.nunique(), bcell_sars.protein_seq.nunique()
# (the number of unique peptide , the number of unique protein)
# (14841, 758)
```

# Task1: SARS Prediction with B-cell Data 

## Employed features

- peptide's length: each peptide has a different sequence, which is a part of its parent protein sequence. Because using the raw sequence for prediction is not straightforward, we use the sequence length instead.
- chou_fasman: one of peptide features.
- emini: one of peptide features.
- kolaskar_tongaonkar: one of peptide features.
- parker: one of peptide features.
- isoelectric_point: one of protein features.
- aromaticity: one of protein features.
- hydrophobicity: one of protein features.
- stability: one of protein features.

## Used model

- Gradient Boosting (LightGBM)

In [None]:
# create length columns
# Add a "length" column to every frame: the number of positions covered by the
# peptide, counting both the start and the end position.
# bcell_sars is listed separately because it is its own concatenated frame.
for df in (bcell, sars, covid, bcell_sars):
    df["length"] = df["end_position"].sub(df["start_position"]).add(1)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

In [None]:
# LightGBM settings for Task 1: gradient-boosted trees on a binary target,
# evaluated with AUC, with a fixed seed for repeatable runs.
# NOTE(review): 0.632 for both sampling fractions looks like the bootstrap
# constant -- confirm the intent with the author.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    verbosity=-1,
    seed=0,
    bagging_freq=3,
    min_child_samples=5,
    bagging_fraction=0.632,
    feature_fraction=0.632,
)

## Train model

In [None]:
# Task 1: 5-fold cross-validation on the B-cell rows; the SARS rows are the test set.
# Folds are grouped by parent_protein_id, so all peptides of one protein fall on
# the same side of every train/validation split.
kf = GroupKFold(n_splits = 5)
oof = np.zeros(len(bcell))    # out-of-fold prediction for every B-cell row
preds = np.zeros(len(sars))   # SARS predictions, averaged over the folds
feature_columns = ["chou_fasman","emini","kolaskar_tongaonkar","parker","length","isoelectric_point","aromaticity","hydrophobicity","stability"]
target = "target"
# One row per feature; every fold adds its own "importance<i>" column below.
feature_importance = pd.DataFrame({"feature": feature_columns})
for i,(train_index,valid_index) in enumerate(kf.split(bcell[feature_columns],bcell[target],bcell["parent_protein_id"])):
    # GroupKFold.split yields positional indices, so rows are taken with .iloc
    # (.loc only matches while the frame keeps its default 0..n-1 index).
    train_x = bcell.iloc[train_index][feature_columns].values
    train_y = bcell.iloc[train_index][target].values
    valid_x = bcell.iloc[valid_index][feature_columns].values
    valid_y = bcell.iloc[valid_index][target].values
    train_data = lgb.Dataset(train_x, label = train_y)
    valid_data = lgb.Dataset(valid_x, label = valid_y)
    # LightGBM 4 removed the early_stopping_rounds / verbose_eval keywords of
    # lgb.train; the callbacks below request the same behaviour: stop after 50
    # rounds without improvement on the validation set, log every 20 rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=20),
        ],
    )
    oof[valid_index] = model.predict(valid_x)
    preds += model.predict(sars[feature_columns].values)/kf.n_splits
    feature_importance["importance"+str(i)] = model.feature_importance()
# Cell output: AUC of the out-of-fold predictions over all B-cell rows.
roc_auc_score(bcell["target"],oof)

## Look feature importance

In [None]:
# Turn each fold's importances into shares that sum to 1, average the shares
# across folds, and plot the features from most to least important.
select = feature_importance.columns[feature_importance.columns != "feature"]
fold_totals = feature_importance[select].sum()
feature_importance[select] = feature_importance[select].div(fold_totals, axis="columns")
numeric_part = feature_importance.select_dtypes(include=[np.number])
feature_importance["importance"] = numeric_part.mean(axis=1)
ranked = feature_importance.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=ranked);
plt.title('GBDT Features (avg over folds)');

## Look classification results

In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Convert the fold-averaged probabilities into hard labels with a 0.5 cut-off,
# then score them against the SARS targets.
sars_true = sars["target"].values
sars_pred = (preds >= 0.5).astype(np.int32)
print("Accuracy:", accuracy_score(sars_true, sars_pred))
print(classification_report(sars_true, sars_pred))

## Visualization with histogram

In [None]:
# Side-by-side distributions: true SARS labels (left) and predicted scores (right).
idx_train = bcell_sars['target'].astype("bool").values  # defined here but not used in this cell
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14,4))
name = ["label", "predict"]
for ax, xlabel, tar in zip(axes, name, (sars["target"], preds)):
    sns.distplot(tar, ax=ax)
    ax.set_xlabel(xlabel, fontsize=12)

# Task2: Covid-19 prediction with B-cell and SARS data

## Employed features

- peptide's length: each peptide has a different sequence, which is a part of its parent protein sequence. Because using the raw sequence for prediction is not straightforward, we use the sequence length instead.
- chou_fasman: one of peptide features.
- emini: one of peptide features.
- kolaskar_tongaonkar: one of peptide features.
- parker: one of peptide features.
- isoelectric_point: one of protein features.
- aromaticity: one of protein features.
- hydrophobicity: one of protein features.
- stability: one of protein features.

## Used model

- Gradient Boosting (LightGBM)

In [None]:
# LightGBM settings for Task 2: gradient-boosted trees on a binary target,
# evaluated with AUC, with a fixed seed for repeatable runs.
# NOTE(review): 0.632 for both sampling fractions looks like the bootstrap
# constant -- confirm the intent with the author.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    verbosity=-1,
    seed=0,
    bagging_freq=3,
    min_child_samples=5,
    bagging_fraction=0.632,
    feature_fraction=0.632,
)

## Train model

In [None]:
# Task 2: 5-fold cross-validation on the combined B-cell + SARS rows; the COVID
# rows are the prediction target.
# Folds are grouped by parent_protein_id, so all peptides of one protein fall on
# the same side of every train/validation split.
kf = GroupKFold(n_splits = 5)
oof = np.zeros(len(bcell_sars))   # out-of-fold prediction for every training row
preds = np.zeros(len(covid))      # COVID predictions, averaged over the folds
feature_columns = ["chou_fasman","emini","kolaskar_tongaonkar","parker","length","isoelectric_point","aromaticity","hydrophobicity","stability"]
target = "target"
# One row per feature; every fold adds its own "importance<i>" column below.
feature_importance = pd.DataFrame({"feature": feature_columns})
for i,(train_index,valid_index) in enumerate(kf.split(bcell_sars[feature_columns],bcell_sars[target],bcell_sars["parent_protein_id"])):
    # GroupKFold.split yields positional indices, so rows are taken with .iloc
    # (.loc only matches while the frame keeps its default 0..n-1 index).
    train_x = bcell_sars.iloc[train_index][feature_columns].values
    train_y = bcell_sars.iloc[train_index][target].values
    valid_x = bcell_sars.iloc[valid_index][feature_columns].values
    valid_y = bcell_sars.iloc[valid_index][target].values
    train_data = lgb.Dataset(train_x, label = train_y)
    valid_data = lgb.Dataset(valid_x, label = valid_y)
    # LightGBM 4 removed the early_stopping_rounds / verbose_eval keywords of
    # lgb.train; the callbacks below request the same behaviour: stop after 50
    # rounds without improvement on the validation set, log every 20 rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=20),
        ],
    )
    oof[valid_index] = model.predict(valid_x)
    preds += model.predict(covid[feature_columns].values)/kf.n_splits
    feature_importance["importance"+str(i)] = model.feature_importance()
# Cell output: AUC of the out-of-fold predictions over all training rows.
roc_auc_score(bcell_sars["target"],oof)

## Look feature importance

In [None]:
# Turn each fold's importances into shares that sum to 1, average the shares
# across folds, and plot the features from most to least important.
select = feature_importance.columns[feature_importance.columns != "feature"]
fold_totals = feature_importance[select].sum()
feature_importance[select] = feature_importance[select].div(fold_totals, axis="columns")
numeric_part = feature_importance.select_dtypes(include=[np.number])
feature_importance["importance"] = numeric_part.mean(axis=1)
ranked = feature_importance.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=ranked);
plt.title('GBDT Features (avg over folds)');

In [None]:
# Store the fold-averaged scores as the "target" column of the COVID frame and
# write the whole frame to disk without the row index.
submission_path = "sub.csv"
covid["target"] = preds
covid.to_csv(submission_path, index=False)

In [None]:
# Distribution of the values stored in the COVID frame's "target" column.
covid_scores = covid["target"]
sns.distplot(covid_scores)
plt.show()

# Exploratory Data Analysis (EDA)

## Peptide feature

In [None]:
# Per-class distributions of the four peptide-level features, one panel each.
idx_train = bcell_sars['target'].astype("bool").values  # True where target == 1
fig, axes = plt.subplots(2, 2,figsize=(16,8))
axes = list(axes.flat)  # 2x2 grid -> flat list, row by row
for i,name in enumerate(["chou_fasman","emini","kolaskar_tongaonkar","parker"]):
    value = bcell_sars[name]
    # Label each layer so the legend entries are tied to the right artists.
    sns.distplot(value[~idx_train],ax = axes[i],label = "target 0")
    sns.distplot(value[idx_train],ax = axes[i],label = "target 1")
    axes[i].set_xlabel(name,fontsize=12)
# A single figure-level legend, built once from the first panel's labelled
# artists (previously a new legend was stacked onto the figure in every
# iteration, with handles chosen implicitly by artist order).
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="right", fontsize=12)

We find:

- The peptide feature values largely follow a normal distribution, except for `emini`, which contains outliers.
- There is no significant difference in distribution between `target: 0` and `target: 1` for any of the 4 peptide features.

Next, we reduce the feature dimensionality with PCA, as shown below:

In [None]:
# Project the four peptide-level features onto two principal components and
# scatter the rows, target == 1 first and target == 0 second.
# NOTE(review): the features are not standardised before PCA -- confirm that
# this is intended.
peptide_features = ["chou_fasman","emini","kolaskar_tongaonkar","parker"]
clf = PCA(n_components=2)
z = clf.fit_transform(bcell_sars[peptide_features])
plt.figure(figsize=(8, 6))
for mask in (idx_train, ~idx_train):
    plt.scatter(z[mask, 0], z[mask, 1], s=3)
plt.legend(labels=["target_1","target_0"],fontsize=12)
plt.show()

## Protein feature

In [None]:
# Per-class distributions of the four protein-level features, one panel each.
idx_train = bcell_sars['target'].astype("bool").values  # True where target == 1
fig, axes = plt.subplots(2, 2,figsize=(16,8))
axes = list(axes.flat)  # 2x2 grid -> flat list, row by row
for i,name in enumerate(["isoelectric_point", "aromaticity", "hydrophobicity", "stability"]):
    value = bcell_sars[name]
    # Label each layer so the legend entries are tied to the right artists.
    sns.distplot(value[~idx_train],ax = axes[i],label = "target 0")
    sns.distplot(value[idx_train],ax = axes[i],label = "target 1")
    axes[i].set_xlabel(name,fontsize=12)
# A single figure-level legend, built once from the first panel's labelled
# artists (previously a new legend was stacked onto the figure in every
# iteration, with handles chosen implicitly by artist order).
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="right", fontsize=12)

We find:

- There is no significant difference in distribution between `target 0` and `target 1` for any of the 4 protein features.

Next, we also reduce the dimensionality of the protein features with PCA, as shown below:


In [None]:
# Project the four protein-level features onto two principal components and
# scatter the rows, target == 1 first and target == 0 second.
# NOTE(review): the features are not standardised before PCA -- confirm that
# this is intended.
protein_features = ["isoelectric_point", "aromaticity", "hydrophobicity", "stability"]
clf = PCA(n_components=2)
z = clf.fit_transform(bcell_sars[protein_features])
plt.figure(figsize=(8, 6))
for mask in (idx_train, ~idx_train):
    plt.scatter(z[mask, 0], z[mask, 1], s=3)
plt.legend(labels=["target_1","target_0"],fontsize=12)
plt.show()

## Length

In [None]:
# Peptide-length counts: all rows in light blue with the target == 1 rows drawn
# on top in coral, so the light-blue part left visible corresponds to target == 0.
fig, ax = plt.subplots(figsize=(12,6))
# Both layers must share one category order. Otherwise each countplot places
# its bars according to its own set of lengths, and the overlay is misaligned
# whenever the target == 1 subset lacks some length values.
length_order = sorted(bcell_sars["length"].unique())
# The data is passed as x= explicitly; the meaning of the first positional
# argument differs between seaborn versions.
sns.countplot(x=bcell_sars["length"], order=length_order, ax=ax, color="lightblue")
sns.countplot(x=bcell_sars.query("target == 1")["length"], order=length_order, ax=ax, color="coral")
# Explicit handles: the first bar belongs to the first layer and the last bar
# to the second, so each legend entry shows the colour of its own layer.
ax.legend(handles=[ax.patches[0], ax.patches[-1]], labels=["target 0","target 1"], fontsize=12)
plt.show()

We find:

- Almost all peptides have a length between 5 and 20.
- The longest peptide has length 393, and the shortest has length 5.
- The number of `target: 1` is larger than that of `target: 0`.