# AutoML with PyCaret

In [None]:
# Turn off Jedi-based completion in IPython's completer; done here so that
# tab autocomplete works in the notebook.
%config Completer.use_jedi = False

In [None]:
!pip install -q pycaret[full]

In [None]:
from pycaret.regression import *
import pandas as pd
import featuretools as ft
import numpy as np # linear algebra
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
# Load the competition files. `date_time` is parsed into datetimes for the
# train and test tables; the sample submission is read as-is and is filled in
# with predictions later on.
df_train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
    parse_dates=['date_time'],
)
df_test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
    parse_dates=['date_time'],
)
df_submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv'
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

# Feature Engineering

In [None]:
# Let's extract all useful information from the date_time column
# https://medium.com/@swethalakshmanan14/simple-ways-to-extract-features-from-date-variable-using-python-60c33e3b0501
def process_date(df):
    """Return a copy of ``df`` with ``date_time`` replaced by calendar features.

    Adds ``year``, ``month``, ``week`` (ISO week number), ``day``,
    ``dayofweek`` and ``hour`` columns derived from the ``date_time`` column,
    then drops ``date_time`` itself.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely. (Modifying ``df`` in place would make a second call
    fail, because ``date_time`` would already have been dropped.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime-typed ``date_time`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the six feature columns appended and ``date_time``
        removed.
    """
    stamps = df['date_time']

    # `Series.dt.week` is deprecated and no longer exists in pandas 2.0;
    # `isocalendar().week` is its replacement. It returns a nullable UInt32
    # column, so cast back to a plain NumPy dtype: int64 normally, float64
    # when missing timestamps produce missing week numbers.
    iso_week = stamps.dt.isocalendar().week
    week_dtype = 'float64' if iso_week.isna().any() else 'int64'

    # `assign` builds a new frame, so the caller's `df` is not modified.
    features = df.assign(
        year=stamps.dt.year,
        month=stamps.dt.month,
        week=iso_week.astype(week_dtype),
        day=stamps.dt.day,
        dayofweek=stamps.dt.dayofweek,
        hour=stamps.dt.hour,
    )
    return features.drop(columns='date_time')

In [None]:
df_train_processed = process_date(df_train)
df_test_processed = process_date(df_test)

In [None]:
# Display the training frame after the date features have been extracted.
df_train_processed

# Prepare for Model building

In [None]:
# Let's use PyCaret to build the models for us
# https://github.com/pycaret/pycaret/blob/master/tutorials/Regression%20Tutorial%20Level%20Intermediate%20-%20REG102.ipynb
def build_my_models(train, target_col, test_set, fold=5, session_id=123):
    """Train a blended PyCaret regressor for one target and predict on ``test_set``.

    Steps, in order:

    1. ``setup`` the experiment on ``train`` with normalization, feature and
       target transformation, rare-level combining and multicollinearity
       removal switched on.
    2. ``compare_models`` (excluding ``'ransac'``) and keep three models
       (``n_select=3``).
    3. ``tune_model`` each selected model, then ``blend_models`` them.
    4. ``predict_model`` on the blended model with no data argument, which
       scores the hold-out split and displays its metrics.
    5. ``finalize_model`` and predict on ``test_set``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the feature columns and ``target_col``.
    target_col : str
        Name of the column in ``train`` to predict.
    test_set : pandas.DataFrame
        Frame to generate predictions for.
    fold : int, default 5
        Number of cross-validation folds passed to ``compare_models`` and
        ``blend_models``. ``tune_model`` is called with its own defaults.
    session_id : int, default 123
        Seed passed to ``setup``.

    Returns
    -------
    pandas.Series
        The ``'Label'`` column of ``predict_model``'s output for ``test_set``.
    """
    setup(data=train, target=target_col, session_id=session_id,
          normalize=True, transformation=True, transform_target=True,
          combine_rare_levels=True, rare_level_threshold=0.05,
          remove_multicollinearity=True, multicollinearity_threshold=0.95,  # fold_strategy='timeseries',
          log_experiment=False, silent=True)
    base_models = compare_models(exclude=['ransac'], fold=fold, n_select=3)

    # Tune and blend the models for better performance.
    # NOTE(review): tune_model runs with its default metric, while blend_models
    # is given optimize='RMSLE'. Per the PyCaret docs that argument appears to
    # matter only when choose_better=True - confirm whether RMSLE should also
    # drive compare_models/tune_model.
    tunned_model = [tune_model(candidate) for candidate in base_models]
    print("tunning completed.")
    blended_model = blend_models(estimator_list=tunned_model, fold=fold, optimize='RMSLE')
    print("Blending completed.")

    # Predict against the hold-out set - https://pycaret.org/predict-model/
    # Called only for the metrics it displays; the returned frame is not needed.
    predict_model(blended_model)

    # Finalize the model, then predict for the test set.
    final_model = finalize_model(blended_model)
    predictions = predict_model(final_model, data=test_set)
    return predictions['Label']

## First model - Predict target_carbon_monoxide

In [None]:
# Keep only the carbon monoxide target in the training frame (the other two
# target columns are dropped), fit the models, and store the test predictions
# in the submission frame.
df_train_target_carbon_monoxide = df_train_processed.drop(
    columns=['target_benzene', 'target_nitrogen_oxides']
)
a = build_my_models(
    train=df_train_target_carbon_monoxide,
    target_col='target_carbon_monoxide',
    test_set=df_test_processed,
)
df_submission['target_carbon_monoxide'] = a

## Second model - Predict target_benzene

In [None]:
# Keep only the benzene target in the training frame (the other two target
# columns are dropped), fit the models, and store the test predictions in the
# submission frame.
df_train_target_benzene = df_train_processed.drop(
    columns=['target_carbon_monoxide', 'target_nitrogen_oxides']
)
b = build_my_models(
    train=df_train_target_benzene,
    target_col='target_benzene',
    test_set=df_test_processed,
)
df_submission['target_benzene'] = b

## Third model - Predict target_nitrogen_oxides

In [None]:
# Keep only the nitrogen oxides target in the training frame (the other two
# target columns are dropped), fit the models, and store the test predictions
# in the submission frame.
df_train_target_nitrogen_oxides = df_train_processed.drop(
    columns=['target_carbon_monoxide', 'target_benzene']
)
c = build_my_models(
    train=df_train_target_nitrogen_oxides,
    target_col='target_nitrogen_oxides',
    test_set=df_test_processed,
)
df_submission['target_nitrogen_oxides'] = c

In [None]:
# Write the submission frame to CSV in the working directory, without the
# DataFrame index column.
df_submission.to_csv('second_submission.csv', index=False)

In [None]:
# Confirmation message printed once the cells above have run.
print("All Saved.....")