# Classification with PYCARET

Using the PyCaret machine learning library to compare classification models and select the best one for predicting the target of this dataset.

## Standard Kaggle boilerplate to list the input data paths

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data preprocessing

### Get data

In [None]:
# Load the B-cell input table from the Kaggle epitope-prediction dataset.
raw = pd.read_csv('/kaggle/input/epitope-prediction/input_bcell.csv')

# Show the first rows as the cell output for a quick visual check.
raw.head()

### Check the columns in the dataset

In [None]:
# Display the column labels of the loaded table.
raw.columns

### Search for null values

In [None]:
# Per-column count of missing entries (isna is the same check as isnull).
raw.isna().sum()

There are no missing values in the dataset.

### Check whether there are any categorical columns

In [None]:
# Print every column that has fewer than 20 distinct values, together with
# those values, so that category-like columns stand out.
for column_name in raw.columns:
    distinct_values = raw[column_name].unique()
    if len(distinct_values) >= 20:
        continue
    print(f"{column_name}: {distinct_values}")

There are no categorical columns. We are good to go with this dataset.

## Install PYCARET

In [None]:
# Install PyCaret into the notebook environment (IPython shell escape).
# NOTE(review): later cells read a 'Label' column from predict_model() output;
# confirm the installed PyCaret version still emits it, or pin the version.
!pip install pycaret

## Model analysis and selection

### Split dataset to make model validation

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the rows for a final check of the selected model.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across notebook runs.
train, test = train_test_split(raw, test_size=0.05, random_state=42)

### Setup experiment

* Use train data
* Ignore the columns parent_protein_id, protein_seq and peptide_seq because they are not considered relevant (this choice is empirical)
* Apply standard normalization

In [None]:
from pycaret.classification import *

# Configure the PyCaret classification experiment on the training split:
#   * 'target' is the column to predict,
#   * the identifier/sequence columns are excluded from the features,
#   * features are normalized,
#   * session_id fixes the random seed so repeated runs give the same results.
experiment = setup(
    data=train,
    target='target',
    ignore_features=['parent_protein_id', 'protein_seq', 'peptide_seq'],
    normalize=True,
    session_id=42,
)

### Train several models and select the best by accuracy

Train models on the training data using 10-fold cross-validation (the PyCaret default) and choose the best one.

In [None]:
# Run PyCaret's model comparison on the experiment configured above and keep
# the returned model as `best` for the analysis cells that follow.
best = compare_models()

### Save the best model as output

In [None]:
# Persist the selected model under the name 'model' so it is available as notebook output.
save_model(best, 'model')

## Best model Analysis

### AUC

In [None]:
# ROC / AUC plot for the selected model ('auc' is plot_model's default plot).
plot_model(best, plot='auc')

### Confusion matrix

In [None]:
# Confusion matrix of the selected model.
plot_model(best, plot='confusion_matrix')

### Discrimination Threshold

In [None]:
# Discrimination-threshold plot of the selected model.
plot_model(best, plot='threshold')

### Precision Recall Curve

In [None]:
# Precision-recall curve of the selected model.
plot_model(best, plot='pr')

### Class Prediction Error

In [None]:
# Class prediction error plot of the selected model.
plot_model(best, plot='error')

### Classification Report

In [None]:
# Classification report plot of the selected model.
plot_model(best, plot='class_report')

### Learning Curve

In [None]:
# Learning curve of the selected model.
plot_model(best, plot='learning')

### Manifold Learning

In [None]:
# Manifold-learning projection plot for the selected model.
plot_model(best, plot='manifold')

### Calibration Curve

In [None]:
# Calibration curve of the selected model.
plot_model(best, plot='calibration')

### Validation Curve

In [None]:
# Validation curve of the selected model.
plot_model(best, plot='vc')

### Feature importance

In [None]:
# Feature-importance plot of the selected model.
plot_model(best, plot='feature')

## Use model to predict over validation data

### Generate an output with actual, predicted and score values

In [None]:
# Score the held-out rows with the selected model and discard rows containing NaN.
test_prediction = predict_model(best, data=test).dropna()

# Write the scored rows to disk without the index column.
test_prediction.to_csv('test_prediction.csv', index=False)

# Display the scored rows as the cell output.
test_prediction

### Correct and incorrect predictions

In [None]:
# Make sure the predicted labels are numeric so they compare cleanly with
# `target`. pd.to_numeric converts the whole column in one vectorised call
# instead of being applied element by element.
test_prediction['Label'] = pd.to_numeric(test_prediction['Label'])

# Tag every row as a hit or a miss.
test_prediction['comp'] = np.where(test_prediction['target'] == test_prediction['Label'], 'Correct', 'Incorrect')

# Count rows per outcome; selecting 'Label' before aggregating avoids
# counting every other column only to throw the results away.
test_prediction.groupby('comp')['Label'].count()

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Actual and predicted labels of the held-out rows.
y_actu, y_pred = test_prediction['target'], test_prediction['Label']

# Confusion matrix of actual vs. predicted labels, displayed as the cell output.
cm = confusion_matrix(y_true=y_actu, y_pred=y_pred)
cm

In [None]:
import seaborn as sn
# Draw the confusion matrix as an annotated heatmap.
# fmt="d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation once a count has three digits.
sn.heatmap(cm, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16})

## Validation accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label matches the actual one.
validation_accuracy = accuracy_score(y_actu, y_pred)
print('VALIDATION ACCURACY', validation_accuracy)

## Use model to predict SARS

### Get data and do preprocessing

In [None]:
# Load the SARS input table from the Kaggle epitope-prediction dataset.
sars = pd.read_csv('/kaggle/input/epitope-prediction/input_sars.csv')

# Show the first rows as the cell output for a quick visual check.
sars.head()

In [None]:
# Display the column labels of the SARS table.
sars.columns

In [None]:
# Per-column count of missing entries in the SARS table (isna is the same check as isnull).
sars.isna().sum()

### Prediction

In [None]:
# Score the SARS rows with the selected model and discard rows containing NaN.
sars_prediction = predict_model(best, data=sars).dropna()

# Write the scored rows to disk without the index column.
sars_prediction.to_csv('sars_prediction.csv', index=False)

# Display the scored rows as the cell output.
sars_prediction

### Correct and incorrect predictions

In [None]:
# Make sure the predicted labels are numeric so they compare cleanly with
# `target`. pd.to_numeric converts the whole column in one vectorised call
# instead of being applied element by element.
sars_prediction['Label'] = pd.to_numeric(sars_prediction['Label'])

# Tag every row as a hit or a miss.
sars_prediction['comp'] = np.where(sars_prediction['target'] == sars_prediction['Label'], 'Correct', 'Incorrect')

# Count rows per outcome; selecting 'Label' before aggregating avoids
# counting every other column only to throw the results away.
sars_prediction.groupby('comp')['Label'].count()

## Confusion matrix

In [None]:
# Actual and predicted labels of the SARS rows.
y_sars_actu, y_sars_pred = sars_prediction['target'], sars_prediction['Label']

# Confusion matrix of actual vs. predicted labels, displayed as the cell output.
cm_sars = confusion_matrix(y_true=y_sars_actu, y_pred=y_sars_pred)
cm_sars

In [None]:
import seaborn as sn
# Draw the SARS confusion matrix as an annotated heatmap.
# fmt="d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation once a count has three digits.
# The annotation size matches the heatmap drawn for the held-out split.
sn.heatmap(cm_sars, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16})

## Validation accuracy

In [None]:
# Share of SARS rows whose predicted label matches the actual one.
sars_accuracy = accuracy_score(y_sars_actu, y_sars_pred)
print('VALIDATION SARS ACCURACY', sars_accuracy)