In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the read-only Kaggle input directory and echo the full
# path of every file found, so the available datasets are visible in the output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**PyCaret** is a low-code machine learning library that automates the modelling workflow, replacing many lines of code with just a few. It also makes model training and execution faster and more efficient.

<h1 id='library'>
Libraries
<a class="anchor-link" href="https://www.kaggle.com/jagunn/pycaret-tbs-july/notebook#library">¶</a>
</h1>

In [None]:
#pip install pycaret

In [None]:
import pandas as pd
import numpy as np

from pycaret.regression import setup, compare_models, create_model, blend_models, \
tune_model, finalize_model, predict_model, plot_model

import warnings
warnings.filterwarnings('ignore')

<h1 id='dataload'>
Data Loading
<a class="anchor-link" href="https://www.kaggle.com/jagunn/pycaret-tbs-july/notebook#dataload">¶</a>
</h1>

In [None]:
# Single place to change if the competition data is mounted somewhere else.
DATA_DIR = '/kaggle/input/tabular-playground-series-jul-2021'

# Training data (features + the three target columns), the unlabeled test
# set, and the submission template whose target columns are filled in later.
dataset = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sub_sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

<h1 id='process'>
Processing
<a class="anchor-link" href="https://www.kaggle.com/jagunn/pycaret-tbs-july/notebook#process">¶</a>
</h1>

* _No null values found in dataset._
* _Date time variable - New features added_

In [None]:
def add_datetime_features(frame):
    """Return a copy of ``frame`` with calendar features derived from ``date_time``.

    The ``date_time`` column is parsed with ``pd.to_datetime``. The columns
    ``year``, ``month``, ``day``, ``dayofweek`` and ``hour`` are appended in
    that order, and ``date_time`` itself is replaced (in its original column
    position) by the integer number of nanoseconds since the Unix epoch so it
    can be fed to the models as a numeric feature.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``date_time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    stamps = pd.to_datetime(frame['date_time'])
    return frame.assign(
        year=stamps.dt.year,
        month=stamps.dt.month,
        day=stamps.dt.day,
        dayofweek=stamps.dt.dayofweek,
        hour=stamps.dt.hour,
        # Vectorised replacement for a per-row ``pd.Timestamp(x).value``:
        # force nanosecond resolution first so the integers are always
        # nanoseconds regardless of the unit pandas inferred while parsing.
        date_time=stamps.to_numpy(dtype='datetime64[ns]').astype('int64'),
    )


# Apply exactly the same transformation to train and test so the feature
# columns stay in sync between the two frames.
dataset = add_datetime_features(dataset)
test = add_datetime_features(test)

In [None]:
# Sanity check: (rows, columns) of the training and test frames after the
# date/time features were added.
print(dataset.shape, test.shape)

_Three separate feature sets, one per target variable, for the initial setup of PyCaret._

In [None]:
# The three pollutant columns to predict, in the order used throughout the notebook.
t3 = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# For each target, the *other* two targets are the columns to remove, so a
# model never sees a sibling target as an input feature.
target_drop_4_cm = [name for name in t3 if name != t3[0]]
target_drop_4_b = [name for name in t3 if name != t3[1]]
target_drop_4_no = [name for name in t3 if name != t3[2]]

# One training frame per target: all features plus that single target column.
X_cm = dataset.drop(columns=target_drop_4_cm)
X_b = dataset.drop(columns=target_drop_4_b)
X_no = dataset.drop(columns=target_drop_4_no)

<h1 id='ms'>
Pycaret - model selection
<a class="anchor-link" href="https://www.kaggle.com/jagunn/pycaret-tbs-july/notebook#ms">¶</a>
</h1>

_The individual PyCaret steps are combined into a single utility function. Verbosity is set to `False` to suppress the metric tables that each step would otherwise print, which keeps the notebook simpler._

In [None]:
def pycaret_predict(x, y, seed, test, fold=7, n_select=3):
    """Run the whole PyCaret regression workflow for one target and predict ``test``.

    The steps are executed in this order: ``setup`` -> ``compare_models`` ->
    ``blend_models`` -> ``create_model`` -> ``tune_model`` ->
    ``finalize_model`` -> ``predict_model``.

    Parameters
    ----------
    x : pandas.DataFrame
        Training data passed to ``setup`` as ``data``; it must include the
        target column named by ``y``.
    y : str
        Name of the target column inside ``x``.
    seed : int
        Passed to ``setup`` as ``session_id`` for reproducibility.
    test : pandas.DataFrame
        Unseen data to predict with the finalized model.
    fold : int, default 7
        Number of cross-validation folds passed to ``setup``.
    n_select : int, default 3
        How many of the best models from ``compare_models`` are blended.

    Returns
    -------
    tuple
        ``(final_model, predictions)`` where ``predictions`` is the value
        returned by ``predict_model(final_model, data=test)``.
    """
    # setup() - Initializes the PyCaret environment and creates the
    # transformation pipeline used for modelling and deployment.
    # NOTE(review): PyCaret documents ``fold_shuffle`` as applicable only to
    # the 'kfold' / 'stratifiedkfold' strategies, so it presumably has no
    # effect together with fold_strategy='timeseries' - confirm before relying
    # on it. The arguments are kept exactly as they were.
    setup(data=x, target=y, session_id=seed, silent=True, verbose=False,
          profile=True, normalize=True, normalize_method='robust',
          fold_strategy='timeseries', fold=fold, fold_shuffle=True)

    # compare_models() - Trains and scores all available models using
    # cross-validation (MAE, MSE, RMSE, R2, RMSLE, MAPE). ``n_select`` makes
    # it return a list of the best models instead of only the single best,
    # which is what blending needs.
    top_models = compare_models(n_select=n_select, verbose=False)

    # blend_models() - Ensembles the selected models; the printed
    # ``estimators_`` attribute shows which fitted estimators were combined.
    blended_model = blend_models(top_models, verbose=False)
    print(blended_model.estimators_)

    # create_model() - Trains a model given either a model ID from PyCaret's
    # model library (e.g. 'catboost'; see models()) or, as here, an
    # estimator object.
    # NOTE(review): blend_models() above already returned a trained ensemble,
    # so this call looks redundant - confirm before removing it.
    model = create_model(blended_model, verbose=False)

    # tune_model() - create_model() trains with default hyperparameters;
    # tune_model() searches for better ones.
    tuned_model = tune_model(model, verbose=False)

    # finalize_model() - Last step: refits the model on the entire dataset
    # passed to setup(), including the hold-out split.
    final_model = finalize_model(tuned_model)

    # predict_model() without ``data`` scores PyCaret's internal hold-out
    # split (not the full training data). The return value is intentionally
    # discarded; the call is kept only for the metrics it displays.
    # NOTE(review): the finalized model has already been fit on that hold-out,
    # so these metrics are presumably optimistic.
    predict_model(final_model)

    # Predict the unseen test data and hand back both model and predictions.
    return final_model, predict_model(final_model, data=test)

<h1 id='p'>
Prediction
<a class="anchor-link" href="https://www.kaggle.com/jagunn/pycaret-tbs-july/notebook#p">¶</a>
</h1>


_Prediction for target - **Carbon Monoxide**_

In [None]:
# Fit the full pipeline on the carbon-monoxide frame and predict the test set.
cm_model, cm_pred = pycaret_predict(x=X_cm, y=t3[0], seed=123, test=test)

In [None]:
# Residual plot for Carbon Monoxide.
# No `plot` argument is passed, so plot_model draws PyCaret's default plot type.

plot_model(cm_model)

_Prediction for target - **Benzene**_

In [None]:
# Fit the full pipeline on the benzene frame and predict the test set.
b_model, b_pred = pycaret_predict(x=X_b, y=t3[1], seed=123, test=test)

In [None]:
# Residual plot for Benzene.
# No `plot` argument is passed, so plot_model draws PyCaret's default plot type.

plot_model(b_model)

_Prediction for target - **Nitrogen Oxides**_

In [None]:
# Fit the full pipeline on the nitrogen-oxides frame and predict the test set.
no_model, no_pred = pycaret_predict(x=X_no, y=t3[2], seed=123, test=test)

In [None]:
# Residual plot for Nitrogen Oxides.
# No `plot` argument is passed, so plot_model draws PyCaret's default plot type.

plot_model(no_model)

<h1 id='s'>
Submission
<a class="anchor-link" href="https://www.kaggle.com/jagunn/pycaret-tbs-july/notebook#s">¶</a>
</h1>

_Target variable creation and submission by combining all the predictions._

In [None]:
# Put the three 'Label' prediction columns side by side, in the same order
# as the targets were modelled (carbon monoxide, benzene, nitrogen oxides).
target = pd.DataFrame({
    'CM': cm_pred['Label'],
    'B': b_pred['Label'],
    'NO': no_pred['Label'],
})

# Fail loudly instead of writing a silently misaligned submission.
assert len(target) == len(sub_sample), 'prediction / submission row count mismatch'

# Assign the raw values by position. Handing a DataFrame to .iloc lets pandas
# apply label alignment in some versions, and the column names here
# ('CM', 'B', 'NO') do not match the submission's column names.
sub_sample.iloc[:, 1:] = target.to_numpy()

sub_sample.to_csv('pycaret_blend.csv', index=False)

# Last expression of the cell, so the preview is actually rendered.
sub_sample.head()

### **Reference**

Installation : https://pycaret.readthedocs.io/en/latest/installation.html

Tutorial : https://pycaret.readthedocs.io/en/latest/tutorials.html