# Covid-19 prediction with pycaret

## Default kaggle get data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, one per line, so the available datasets are visible in the output.
INPUT_ROOT = '/kaggle/input'
for folder, _subdirs, files in os.walk(INPUT_ROOT):
    full_paths = (os.path.join(folder, entry) for entry in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Get data

In [None]:
import pandas as pd

# Load the B-cell table from the Kaggle input directory into a DataFrame.
bcell_csv = '/kaggle/input/epitope-prediction/input_bcell.csv'
bcell = pd.read_csv(bcell_csv)

# Last expression of the cell: the notebook renders the first rows as a preview.
bcell.head()

In [None]:
# Load the SARS table from the same Kaggle dataset folder into a DataFrame.
sars_csv = '/kaggle/input/epitope-prediction/input_sars.csv'
sars = pd.read_csv(sars_csv)

# Last expression of the cell: the notebook renders the first rows as a preview.
sars.head()

In [None]:
# Stack the B-cell and SARS rows into a single frame; ignore_index=True gives
# the combined frame a fresh 0..n-1 index instead of keeping the source indices.
source_frames = [bcell, sars]
bs = pd.concat(source_frames, ignore_index=True)

# Last expression of the cell: the notebook renders the combined frame.
bs

## Split train and test data sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the combined rows as `test`; the remaining 90% (`train`) is
# what the pycaret experiment is fitted on. The split is seeded so that the
# hold-out set, and therefore every score computed on it, is the same on each
# run of the notebook.
SPLIT_SEED = 42
train, test = train_test_split(bs, test_size=0.1, random_state=SPLIT_SEED)

## Install pycaret

In [None]:
!pip install pycaret

## Setup pycaret

In [None]:
from pycaret.classification import *

# Configure the pycaret classification experiment on the training split.
# 'target' is the label column, the three listed columns are excluded from the
# feature set, and normalize=True switches on pycaret's feature normalization.
excluded_columns = ['parent_protein_id', 'protein_seq', 'peptide_seq']

experiment = setup(
    data=train,
    target='target',
    ignore_features=excluded_columns,
    normalize=True,
)

## Train and compare several ML models

In [None]:
# Train and compare the classifiers available in the configured pycaret
# experiment. The return value is not stored; as the last expression of the
# cell, whatever the call returns is rendered by the notebook.
compare_models()

## Check ROC AUC for the best 5 models

### Test all models

In [None]:
from sklearn.metrics import roc_auc_score

# Short pycaret IDs of the five estimators to benchmark on the hold-out split.
best_models = ['et','catboost','xgboost','lightgbm','rf']

# One row per (estimator ID, variant). 'AUC' is always numeric: NaN marks a
# score that could not be computed, so the column can still be sorted later.
df_Results = pd.DataFrame(columns=['Classification', 'Dataset', 'Model', 'AUC'])

SEPARATOR = '-----------------------------------------------------'


def _banner(message, model_id):
    """Print ``message`` followed by ``model_id`` between two separator rules."""
    print(SEPARATOR)
    print(message, model_id)
    print(SEPARATOR)


for m in best_models:

    _banner('[START] - Processing model: ', m)
    mo = create_model(m)

    _banner('[START] - Tunning model: ', m)
    tu = tune_model(mo)

    # The ensemble is built from the base model, not from the tuned one.
    _banner('[START] - Ensemble model: ', m)
    en = ensemble_model(mo)

    # Predict with all three variants first (base, tuned, ensembled), dropping
    # rows that contain missing values, then score them in the same order.
    variants = [('Model', mo), ('Tunned', tu), ('Ensembled', en)]
    predictions = [
        (variant, predict_model(estimator, test).dropna())
        for variant, estimator in variants
    ]

    for variant, pred in predictions:
        # NOTE(review): the score is computed from the 'Label' column. If that
        # column holds hard class predictions rather than probabilities, this
        # is the AUC of thresholded predictions - confirm that is intended.
        try:
            auc = roc_auc_score(pred['target'], pred['Label'])
        except Exception as exc:
            # Best effort: keep benchmarking the remaining variants, but say
            # why this score is missing instead of silently hiding the error.
            print(f'[WARN] - AUC unavailable for {m} ({variant}): {exc}')
            auc = float('nan')
        # Rows are labelled 'Valid' although they are scored on `test`.
        df_Results.loc[len(df_Results)] = [m, 'Valid', variant, auc]

    _banner('[FINISHED] - Model: ', m)

### Check best ROC AUC

In [None]:
# Rank the benchmark rows by AUC, best first. The AUC column is coerced to
# numeric beforehand: a non-numeric placeholder (such as the string 'NA') mixed
# with floats makes the ordering unreliable, whereas NaN sorts to the bottom.
ranked_results = df_Results.assign(
    AUC=pd.to_numeric(df_Results['AUC'], errors='coerce')
).sort_values(by=['Dataset', 'AUC'], ascending=False)
print(ranked_results)

## Create the best model

In [None]:
# Build the model that the rest of the notebook treats as the best one, using
# the pycaret model ID 'rf'.
# NOTE(review): the ID is hard-coded rather than read from df_Results - confirm
# it still matches the top of the ranking after re-running the benchmark.
rf = create_model('rf')

In [None]:
# Tune the model created above; the result (`rf_tunned`) is the estimator used
# by all following plot and prediction cells.
rf_tunned = tune_model(rf)

## Check ROC AUC for the best model

In [None]:
# No plot type is passed, so pycaret's default plot is drawn for the tuned
# model (presumably the ROC/AUC curve, per the section heading - confirm).
plot_model(rf_tunned)

## Check confusion matrix for the best model

In [None]:
# Draw the confusion matrix of the tuned model.
plot_model(rf_tunned, plot='confusion_matrix')

## Discrimination Threshold

In [None]:
# Draw the discrimination-threshold plot of the tuned model.
plot_model(rf_tunned, plot='threshold')

## Class prediction error

In [None]:
# Draw the class prediction error plot of the tuned model.
plot_model(rf_tunned, plot='error')

## Classification report

In [None]:
# Draw the classification report of the tuned model.
plot_model(rf_tunned, plot='class_report')

## Feature importance

In [None]:
# Draw the feature importance plot of the tuned model.
plot_model(rf_tunned, plot='feature')

## Get Covid-19 data

In [None]:
# Load the Covid-19 table that the tuned model will be applied to.
covid_csv = '/kaggle/input/epitope-prediction/input_covid.csv'
covid = pd.read_csv(covid_csv)

# Last expression of the cell: the notebook renders the first rows as a preview.
covid.head()

## Run the model to make predictions over Covid-19 dataset

In [None]:
# Apply the tuned model to the Covid-19 rows and keep the resulting frame,
# which carries the model's predictions.
pred_covid = predict_model(rf_tunned, data=covid)

# Last expression of the cell: the notebook renders the prediction frame.
pred_covid

## Submit task

In [None]:
# Write the Covid-19 predictions to the working directory as the submission
# file; index=False leaves the DataFrame index out of the CSV.
submission_path = 'submit.csv'
pred_covid.to_csv(submission_path, index=False)