In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h2 style=color:blue align="left"> Table of Contents </h2>

#### 1) What is PyCaret and Why Should you Use it?
#### 2) Installing PyCaret on your Machine
#### 3) Accessing Data
>    3.1) Loading a Dataframe with Pandas

>    3.2) Using the Data Repository

>    3.3) Experiment Setup

#### 4) Compare Baseline Models

#### 5) Train and tune specific models

#### 6) Combine Models ( Optional )

#### 7) AutoML ( Optional ) 

#### 8) Classification Example
>    8.1) Dataset: Diabetes

#### 9) Regression Example

>    9.1) Dataset: Boston

#### 10) Import Dataset: juice

>    10.1) Setting up Environment

>    10.2) Compare Models

>         10.2.1) Default

>         10.2.2) Sorted Method 

>         10.2.3) n_select parameter

>         10.2.4) Whitelist parameter

>    10.3) Create Model

>    10.4) Tune Model

>    10.5) Building Ensemble Models using PyCaret

>    10.6) Blend Models

>    10.7) Analyze Model 

>    10.8) Evaluate our Model

>    10.9) Interpret Model

>    10.10) Make Predictions

>    10.11) Save and load the model

>    10.12) Deploy Model

<h2 style=color:blue align="left"> Reference </h2>

> https://pycaret.org/compare-models/

> https://www.kaggle.com/discussion/234790

> https://www.youtube.com/watch?v=jlW5kRBwcb0

> https://www.youtube.com/watch?v=BjcpOVQhNlc&t=74s

> https://www.youtube.com/watch?v=TXOLlgzAdxM&t=4s

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:center; border-radius: 15px 50px;"> 1) What is PyCaret and Why Should you Use it? </h1>

- PyCaret is an open-source, machine learning library in Python that helps you from data preparation to model deployment. It is easy to use and you can do almost every data science project task with just one line of code.

- PyCaret, being a low-code library, makes you more productive. You can spend less time on coding and can do more experiments

- It is an easy to use machine learning library that will help you perform end-to-end machine learning experiments, whether that’s imputing missing values, encoding categorical data, feature engineering, hyperparameter tuning, or building ensemble models

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:center; border-radius: 15px 50px;"> 2) Installing PyCaret on your Machine </h1>

In [None]:
# run this cell to install pycaret in Google Colab
# !pip install pycaret 

In [None]:
# If you are using jupyter notebook, you can pip install pycaret using jupyter notebook or command line
# pip install pycaret

In [None]:
!pip install pycaret

In [None]:
# Display the installed PyCaret version via the helper in pycaret.utils.
from pycaret.utils import version
version()

In [None]:
import pycaret

# Report which PyCaret version is active and the file it was imported from.
for label, value in (
    ('Using PyCaret Version', pycaret.__version__),
    ('Path to PyCaret: ', pycaret.__file__),
):
    print(label, value)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 3) Accessing Data </h1>

#### There are two ways to register your data into PyCaret:

> Loading a Dataframe with Pandas

> Using the Data Repository

<h3 style=color:green align="left"> 3.1) Loading a Dataframe with Pandas </h3>
- The first way to get data into PyCaret is simply to load up a Pandas dataframe and then pass it to PyCaret.

        data = pd.read_csv(data_path)
        data.head()
        
<h3 style=color:green align="left"> 3.2) Using the Data Repository </h3>
- The second way of getting data, which is used in the PyCaret tutorials, is to pull in a curated dataset from the PyCaret Data Repository. The repository helpfully includes popular sample datasets for classification, regression, clustering, NLP, etc.

        all_datasets = pycaret.datasets.get_data('index')

In [None]:
# List the datasets in the PyCaret data repository (56 at the time of writing).
from pycaret.datasets import get_data

# Call the imported function directly, so this cell does not rely on a
# separate `import pycaret` having been executed earlier.
all_datasets = get_data('index')

In [None]:
# Load one dataset from the PyCaret data repository by name, using the
# get_data function imported in the previous cell.
dataset_name = 'heart_disease'  # Replace with your desired dataset.
data = get_data(dataset_name)

<h3 style=color:green align="left"> 3.3) Experiment Setup </h3>

- Many often-tedious preprocessing steps are taken care of automatically in PyCaret, which standardizes and conveniently packages fundamental data preparation steps into repeatable time-saving workflows.  Users are able to **automate cleaning (e.g. handling missing values with various imputation methods available), splitting into train and test sets, as well as some aspects of feature engineering and training.**  While many of the objects created in this process aren’t explicitly shown to the user (such as train and test sets, or label vectors), they are accessible if needed or desired by more experienced practitioners. 

In [None]:
# Import the classification functions used in sections 3.3-7 by name rather
# than with `import *`, so it is clear where each name comes from.
from pycaret.classification import (
    automl,
    blend_models,
    compare_models,
    create_model,
    ensemble_model,
    models,
    setup,
    stack_models,
    tune_model,
)

# Initialise the experiment on the dataframe loaded above.
clf1 = setup(
    data=data,
    target='Disease',               # Use your target variable.
    session_id=123,
    log_experiment=True,
    experiment_name='experiment1',  # Use any experiment name.
    silent=True,                    # Runs the command without user input.
)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 4) Compare Baseline Models </h1>

- In a single line of code, we can train and compare baseline versions of all available models on our dataset:

       best_model = compare_models()
       

- This trains a baseline version of each available model type and yields a detailed comparison of metrics for the trained models, and highlights the best results across models.

- Note that we did not have to do any data preparation by hand — we just needed to make the data available as a CSV, and run the setup function.  Behind the scenes of those two setup steps, the data was passed into PyCaret and transformed to the extent necessary to train and evaluate the available models.  To see what models PyCaret knows about, we can run `models()` (shown below), which returns a dataframe of all available models, their proper names, the reference package that they’re drawn from (e.g. sklearn.linear_model._logistic.LogisticRegression), and whether Turbo is supported (a mode that limits the model training time, which may be desirable for rapid comparisons).

      models()

In [None]:
# Show the table of models available in the imported PyCaret module.
models()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 5) Train and tune specific models </h1>

- From **compare_models**, we were easily able to see the **best baseline models for each metric**, and select those for further investigation.

- For example, if we were looking for the model with the **highest AUC** above, we would have elected to continue with **random forest**.  We can then save and fine tune our model using the **create_model and tune_model** functions. 

In [None]:
# Train the random forest ('rf') model using 5 folds.
rf = create_model('rf', fold=5)

In [None]:
# Tune the hyperparameters of the random forest created in the previous cell.
Tuned_rf = tune_model(rf)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 6) Combine Models ( Optional ) </h1>

- We can combine our trained models in various ways.  First, we can create ensemble models with methods such as **bagging (bootstrap aggregating) and boosting.**  Both bagging and boosting are invoked with the ensemble_model function.  We can further apply **blending and stacking methods** to combine diverse models, or estimators — a list of estimators can be passed to blend_models or stack_models.  If desired, one could create ensemble models and combine them via blending or stacking, all in a single line of code.  For clarity, we’ll show an example in which each of these four methods is shown sequentially in its own cell, which also allows us to see the default output from PyCaret when each of these methods is used.  

In [None]:
# Train the decision tree ('dt') model using 5 folds.
dt = create_model('dt', fold=5)

In [None]:
# Create a bagged decision tree ensemble from the `dt` model trained above.
# No `method` argument is passed, so ensemble_model uses its default method.
bagged_dt = ensemble_model(dt)

In [None]:
# Create a boosted decision tree ensemble from the same `dt` base model.
boosted_dt = ensemble_model(dt, method='Boosting')

In [None]:
# Blend the boosted, bagged, and tuned estimators using method='soft'.
blender = blend_models(
    estimator_list=[boosted_dt, bagged_dt, Tuned_rf],
    method='soft',
)

In [None]:
# Stack the boosted, bagged, and tuned estimators, with the untuned random
# forest `rf` passed as the meta model.
stacker = stack_models(
    estimator_list=[boosted_dt, bagged_dt, Tuned_rf],
    meta_model=rf,
)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 7) AutoML ( Optional ) </h1>

- Quick and painless tuning for a particular metric can be accomplished using the AutoML feature.

- While AutoML techniques generally reduce the human oversight of the model selection process, which may not be ideal or appropriate in many contexts, they can be a useful tool to quickly identify the highest performing option for a particular purpose. 

In [None]:
# Let automl select the best model for the chosen metric (AUC), then show it.
best = automl(optimize='AUC')
best

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 8) Classification Example </h1>

<h1 style="background-color:yellow; font-family:newtimeroman; font-size:180%; text-align:left;"> 8.1) Dataset: Diabetes </h1>

        # Importing dataset
          from pycaret.datasets import get_data
          diabetes = get_data('diabetes')

        # Importing module and initializing setup
          from pycaret.classification import *
          clf1 = setup(data = diabetes, target = 'Class variable')

        # return best model
          best = compare_models()

        # return top 3 models based on 'Accuracy'
          top3 = compare_models(n_select = 3)

        # return best model based on AUC
          best = compare_models(sort = 'AUC') #default is 'Accuracy'

        # compare specific models
          best_specific = compare_models(include = ['dt','rf','xgboost'])

        # blacklist certain models
          best_specific = compare_models(exclude = ['catboost', 'svm'])

In [None]:
# Load the 'diabetes' dataset from the PyCaret data repository.
diabetes = get_data('diabetes')

In [None]:
# Import the two classification functions this example uses by name
# (instead of `import *`) and initialise the experiment.
from pycaret.classification import compare_models, setup

# session_id is set, as in the other classification setups in this notebook,
# so that re-running the cell gives the same results.
clf1 = setup(data=diabetes, target='Class variable', session_id=123)

In [None]:
# Compare the available models and keep the best one that is returned.
best = compare_models()

In [None]:
# Keep the top 3 models, ranked by the default metric ('Accuracy').
top3 = compare_models(n_select=3)

In [None]:
# Display the models returned by compare_models in the previous cell.
top3

In [None]:
# Rank by AUC instead of the default 'Accuracy' and keep the best model.
best = compare_models(sort='AUC')

In [None]:
# Restrict the comparison to the models listed in `include`.
best_specific = compare_models(include=['dt', 'rf', 'xgboost'])

In [None]:
# Leave the models listed in `exclude` out of the comparison.
best_specific = compare_models(exclude=['catboost', 'svm'])

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 9) Regression Example </h1>

<h1 style="background-color:yellow; font-family:newtimeroman; font-size:180%; text-align:left;"> 9.1) Dataset: Boston </h1>

        # Importing dataset
          from pycaret.datasets import get_data
          boston = get_data('boston')

        # Importing module and initializing setup
          from pycaret.regression import *
          reg1 = setup(data = boston, target = 'medv')

        # return best model
          best = compare_models()

        # return top 3 models based on 'R2'
          top3 = compare_models(n_select = 3)

        # return best model based on MAPE
          best = compare_models(sort = 'MAPE') #default is 'R2'

        # compare specific models
          best_specific = compare_models(include = ['dt','rf','xgboost'])

        # blacklist certain models
          best_specific = compare_models(exclude = ['catboost', 'svm'])

In [None]:
# Load the 'boston' dataset from the PyCaret data repository.
from pycaret.datasets import get_data
boston = get_data('boston')

In [None]:
# Import the two regression functions this example uses by name (instead of
# `import *`). From here on, `setup` and `compare_models` refer to the
# regression versions until they are imported from another module again.
from pycaret.regression import compare_models, setup

# session_id is set, as in the seeded classification setups in this notebook,
# so that re-running the cell gives the same results.
reg1 = setup(data=boston, target='medv', session_id=123)

In [None]:
# Compare the available regression models and keep the best one returned.
best = compare_models()

In [None]:
# Display the best model returned by compare_models in the previous cell.
best

In [None]:
# Keep the top 3 models, ranked by the default metric ('R2').
top3 = compare_models(n_select=3)

In [None]:
# Display the models returned by compare_models in the previous cell.
top3

In [None]:
# Rank by MAPE instead of the default 'R2' and keep the best model.
best = compare_models(sort='MAPE')

In [None]:
# Restrict the comparison to the models listed in `include`.
best_specific = compare_models(include=['dt', 'rf', 'xgboost'])

In [None]:
# Leave the models listed in `exclude` out of the comparison.
best_specific = compare_models(exclude=['catboost', 'svm'])

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:center; border-radius: 15px 50px;"> 10) Import Dataset: juice </h1>

In [None]:
# Load the 'juice' dataset from the PyCaret data repository.
# NOTE(review): this rebinds `data`, which held the heart_disease dataset earlier in the notebook.
from pycaret.datasets import get_data
data = get_data('juice')

In [None]:
# Show the index of all datasets available in the PyCaret data repository.
get_data('index')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.1) Setting up Environment </h1>

### Importing a Module:
- Depending upon the type of problem you are going to solve, you first need to import the module.
- In the first version of PyCaret, 6 different modules are available:

> 1) Regression

> 2) Classification

> 3) Clustering

> 4) Natural language processing (NLP)

> 5) Anomaly detection

> 6) Association rule mining.
- In this article, we will solve a **classification problem** and hence we will import the classification module

### Initializing the Setup:
- In this step, PyCaret performs some basic **preprocessing** tasks:

> Ignoring the IDs and Date Columns

> Imputing the missing values

> Encoding the categorical variables

> Splitting the dataset into the train-test split for the rest of the modeling steps.
- When you run the setup function, it will first confirm the data types, and then if you press enter, it will create the environment for you to go ahead

In [None]:
# Import the classification functions used in section 10 by name rather than
# with `import *`. This also rebinds `setup` and `compare_models` to their
# classification versions after the regression example.
from pycaret.classification import (
    blend_models,
    compare_models,
    create_model,
    deploy_model,
    ensemble_model,
    evaluate_model,
    interpret_model,
    load_model,
    plot_model,
    predict_model,
    save_model,
    setup,
    tune_model,
)

# Initialise the experiment on the juice data with 'Purchase' as the target.
clf1 = setup(data, target='Purchase', session_id=786)

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.2) Compare Models </h1>

##### This is another useful function of the PyCaret library. If you do not want to try the different models one by one, you can use the compare models function and it will train and compare common evaluation metrics for all the available models in the library of the module you have imported.

#### This function is only available in:
> pycaret.classification

> pycaret.regression

- This function trains **all the models in the model library** using **default hyperparameters** and evaluates performance metrics using cross-validation. It returns the trained model object. The evaluation metrics used are:

 - **Classification:** Accuracy, AUC, Recall, Precision, F1, Kappa, MCC

 - **Regression:** MAE, MSE, RMSE, R2, RMSLE, MAPE
 
- The output of the function is a table showing the averaged score of all models across the folds. The number of folds can be defined using the **fold** parameter within the **compare_models** function. By default, the **fold is set to 10.** The table is sorted (highest to lowest) by the metric of choice and can be defined using the **sort** parameter. By default, the table is sorted by **Accuracy** for classification experiments and R2 for regression experiments. Certain models are prevented from the comparison because of their longer run-time. In order to bypass this prevention, the turbo parameter can be set to False.

<h1 style="background-color:DeepSkyBlue; font-family:newtimeroman; font-size:170%; text-align:left;"> 10.2.1) Default </h1>

In [None]:
# Compare the performance of the different classification models.
# The return value is not stored, so it is displayed as the cell output.
compare_models()

In [None]:
# Run the comparison again, this time keeping the returned best model.
best_model = compare_models()

In [None]:
# Display the best model from the default comparison.
best_model

##### In the default compare_models output, the model with the most yellow-highlighted cells is the best model.

<h1 style="background-color:DeepSkyBlue; font-family:newtimeroman; font-size:170%; text-align:left;"> 10.2.2) Sorted Method </h1>

In [None]:
# Sort the comparison by Recall; this overwrites the earlier `best_model`.
best_model = compare_models(sort='Recall')

In [None]:
# Display the best model from the Recall-sorted comparison.
best_model

#### Recall is now the priority, so the model with the highest Recall value becomes the best model.

<h1 style="background-color:DeepSkyBlue; font-family:newtimeroman; font-size:170%; text-align:left;"> 10.2.3) n_select parameter </h1>

In [None]:
# Keep the top 5 models from the comparison instead of only the best one.
top5 = compare_models(n_select=5)

In [None]:
# Display the models returned by compare_models in the previous cell.
top5

<h1 style="background-color:DeepSkyBlue; font-family:newtimeroman; font-size:170%; text-align:left;"> 10.2.4) Whitelist parameter (include) </h1>

In [None]:
# Train and compare only the 4 models named in the `include` list.
w = compare_models(include=['dt', 'rf', 'xgboost', 'lightgbm'])

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.3) Create Model </h1>

### Training a Model
- Training a model in PyCaret is quite simple. You just need to use the create_model function that takes just the one parameter – the model abbreviation as a string. Here, we are going to first train a **decision tree model** for which we have to pass **“dt”** and it will return a **table with k-fold cross-validated scores** of common evaluation metrics used for classification models.

- Here’s a quick reminder of the evaluation metrics used for supervised learning:

 - **Classification:** Accuracy, AUC, Recall, Precision, F1, Kappa
 - **Regression:** MAE, MSE, RMSE, R2, RMSLE, MAPE

In [None]:
# Build the Logistic Regression model ('lr').
# No `fold` argument is given, so the default number of folds (10) is used.
lr = create_model('lr')

In [None]:
# Build the Logistic Regression model again with 5 folds; this overwrites `lr`.
lr = create_model('lr', fold=5)

In [None]:
# Build the decision tree model ('dt') on the juice experiment.
dt = create_model('dt')

In [None]:
# Build the Naive Bayes model ('nb').
nb = create_model('nb')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.4) Tune Model </h1>

### Hyperparameter Tuning
- We can tune the hyperparameters of a machine learning model by just using the **tune_model** function, which takes the trained model object returned by **create_model** (for example, `tune_model(dt)`).

- PyCaret provides us a lot of flexibility. For example, we can define the number of folds using the **fold** parameter within the **tune_model** function. Or we can change the number of iterations using the **n_iter** parameter. Increasing the **n_iter** parameter increases the training time but may give better performance.

In [None]:
# Tune the hyperparameters of the decision tree model `dt` built earlier.
tuned_dt = tune_model(dt)

### We can compare the two models below: the one with default parameters and the one with tuned hyperparameters

In [None]:
# Display the decision tree as created, before tuning.
dt

In [None]:
# Display the tuned decision tree for comparison with `dt`.
tuned_dt

In [None]:
# Tune the Naive Bayes model, optimizing for AUC.
tuned_nb = tune_model(nb, optimize='AUC')

In [None]:
# Display the tuned Naive Bayes model.
tuned_nb

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.5) Building Ensemble Models using PyCaret </h1>

#### Ensemble models in machine learning combine the decisions from multiple models to improve the overall performance.

### In PyCaret, we can create bagging, boosting, blending, and stacking ensemble models with just one line of code.

In [None]:
# Build an ensemble from the decision tree with the default n_estimators (10).
bagged_dt = ensemble_model(dt)

In [None]:
# Rebuild the ensemble with n_estimators=25; this overwrites `bagged_dt`.
bagged_dt = ensemble_model(dt, n_estimators=25)

In [None]:
# Build a boosting ensemble from the same decision tree.
boosted_dt = ensemble_model(dt, method='Boosting')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.6) Blend Models </h1>

In [None]:
# Create the three base models for blending, in order, with verbose=False.
lr, lda, gbc = (
    create_model(model_id, verbose=False)
    for model_id in ('lr', 'lda', 'gbc')
)

In [None]:
# Ensemble: blend the three base models created above using method='soft'.
blender = blend_models(estimator_list=[lr, lda, gbc], method='soft')

In [None]:
# Inspect the `estimators_` attribute of the blended model.
blender.estimators_

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.7) Analyze Model </h1>

##### Now, after training the model, the next step is to analyze the results. This is especially useful from a business perspective, right? Analyzing a model in PyCaret is again very simple. Just a single line of code and you can do the following:

### Plot Model Results:
- Analyzing model performance in PyCaret is as simple as writing **plot_model.**

- You can plot **decision boundaries, precision-recall curve, validation curve, residual plots, etc..**

- For **clustering** models, you can plot the **elbow plot and silhouette plot.**

- For **text data**, you can plot **word clouds, bigram and trigram frequency plots,** etc.

### Interpret Results:
- Interpreting model results helps in debugging the model by analyzing the important features. This is a crucial step in industry-grade machine learning projects. In PyCaret, we can interpret the model by **SHAP values and correlation plot** with just one line of code.

In [None]:
# AUC-ROC plot for the blended model.
# plot_model(blender) with no `plot` argument gives the same result.
plot_model(blender, plot='auc')

In [None]:
# Confusion matrix for the blended model.
plot_model(blender, plot='confusion_matrix')

In [None]:
# Threshold plot for the blended model.
plot_model(blender, plot='threshold')

In [None]:
# Precision-Recall curve for the blended model.
plot_model(blender, plot='pr')

In [None]:
# Validation curve for the tuned decision tree.
plot_model(tuned_dt, plot='vc')

In [None]:
# Decision boundary for the untuned decision tree.
plot_model(dt, plot='boundary')

In [None]:
# Decision boundary for the tuned Naive Bayes model.
plot_model(tuned_nb, plot='boundary')

In [None]:
# Decision boundary for the blended model.
plot_model(blender, plot='boundary')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.8) Evaluate our Model </h1>

- If you do not want to plot all these visualizations individually, then the PyCaret library has another amazing function: **evaluate_model**. In this function, you just need to pass the model object and PyCaret will create an interactive window for you to see and analyze the model in all the possible ways:

In [None]:
# Open the evaluate_model view for the boosted decision tree.
evaluate_model(boosted_dt)

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.9) Interpret Model </h1>

- Interpreting complex models is very important in most machine learning projects. It helps in debugging the model by analyzing what the model thinks is important. In PyCaret, this step is as simple as writing **interpret_model** to get the Shapley values.

In [None]:
# Train the 'xgboost' model used by the interpretation cells that follow.
xgboost = create_model('xgboost')

In [None]:
# interpret_model with no `plot` argument: the SHAP-based view.
interpret_model(xgboost)

In [None]:
# interpret_model: correlation plot.
interpret_model(xgboost, plot='correlation')

In [None]:
# interpret_model: reason plot for a single observation (observation=1).
interpret_model(xgboost, plot='reason', observation=1)

In [None]:
# interpret_model: reason plot with no `observation` specified.
interpret_model(xgboost, plot='reason')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.10) Make Predictions </h1>

#### read the test data
test_data_classification = pd.read_csv('datasets/loan_test_data.csv')

#### make predictions
predictions = classification.predict_model(classification_dt, data=test_data_classification)

#### view the predictions
predictions

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.11) Save and Load the Model </h1>

- Now, once the model is built and tested, we can save this in the pickle file using the save_model function. Pass the model to be saved and the file name and that’s it:

<h2 style=color:green align="left"> save the model </h2>
classification.save_model(classification_dt, 'decision_tree_1')

- We can load this model later on and predict labels on the unseen data:

In [None]:
# Save the xgboost model under the name 'abc'.
save_model(xgboost, 'abc')

<h2 style=color:green align="left"> load model </h2>
dt_model = classification.load_model(model_name='decision_tree_1')

In [None]:
# Load the model saved under the name 'abc' back into memory.
# NOTE(review): `l` is easy to misread as `1`; consider a descriptive name if nothing else reads it.
l = load_model('abc')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 10.12) Deploy Model </h1>

deploy_model(xgboost, model_name='xgboost-for-aws', authentication={'bucket':'pycaret-test'})