# FLAML - Fast and Lightweight AutoML

FLAML is an AutoML framework from Microsoft.

Github Link: [FLAML](https://github.com/microsoft/FLAML)

In this notebook, I follow FLAML's example notebook and apply it to the Tabular Playground Series (July 2021) competition dataset.

## Importing Packages

In [None]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: seaborn defaults with the "darkgrid" background.
sns.set()
sns.set_style("darkgrid")
# Render figures inline in the notebook, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Default figure size as (width, height).
plt.rcParams["figure.figsize"] = 8, 5

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

In [None]:
!pip install flaml[notebook];

## Defining File Paths

In [None]:
# Directory holding the competition files, and the full path of each CSV inside it.
DATASET_DIR = '/kaggle/input/tabular-playground-series-jul-2021/'
TRAIN_CSV, TEST_CSV, SAMPLE_SUBMISSION_CSV = (
    os.path.join(DATASET_DIR, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## Preparing Dataset

In [None]:
# Load the training CSV and preview its first rows.
dataset = pd.read_csv(TRAIN_CSV)
dataset.head()

In [None]:
def get_processed_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``dataset`` with calendar features derived from ``date_time``.

    Other feature-extraction code can be added to this function.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame with a ``date_time`` column that ``pd.to_datetime`` can parse.
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        A copy in which ``date_time`` is converted to datetime and these
        columns are added: ``month``, ``day``, ``day_of_week`` (Monday=0 ...
        Sunday=6) and the 0/1 indicators ``winter_season``, ``spring_season``,
        ``summer_season`` and ``autumn_season``.

    Raises
    ------
    KeyError
        If ``dataset`` has no ``date_time`` column.
    """
    processed = dataset.copy()
    processed['date_time'] = pd.to_datetime(processed['date_time'])

    timestamps = processed['date_time'].dt
    processed['month'] = timestamps.month
    processed['day'] = timestamps.day
    processed['day_of_week'] = timestamps.dayofweek

    # Winter – December, January and February.
    # Spring – March, April and May.
    # Summer – June, July and August.
    # Autumn – September, October and November.
    season_months = {
        'winter_season': [12, 1, 2],
        'spring_season': [3, 4, 5],
        'summer_season': [6, 7, 8],
        'autumn_season': [9, 10, 11],
    }
    for season_column, months in season_months.items():
        # Vectorised membership test instead of a per-row Python lambda; the explicit
        # int64 cast keeps the indicator dtype the same even when the frame is empty.
        processed[season_column] = processed['month'].isin(months).astype('int64')

    return processed

In [None]:
# Add the calendar features; get_processed_dataset works on a copy, so `dataset` stays as loaded.
dataset_copy = get_processed_dataset(dataset)

In [None]:
# Columns the models are trained to predict.
target_columns = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

# The raw timestamp and the targets themselves are never used as model inputs.
excluded_feature_columns = ['date_time', *target_columns]

# Every remaining column, in its original order, is a feature.
feature_columns = [name for name in dataset_copy.columns if name not in excluded_feature_columns]

In [None]:
# Splitting dataset into train-test
# Hold out 20% of the rows for evaluation; random_state fixes the split so reruns get the same rows.
# NOTE(review): this is a random split of timestamped rows — confirm a chronological split is not more appropriate.
X_train, X_test, y_train, y_test = train_test_split(dataset_copy[feature_columns], 
                                                    dataset_copy[target_columns], 
                                                    test_size=0.20, 
                                                    random_state=42)

## Helper Functions

In [None]:
def root_mean_squared_log_error(y_true: np.array, y_pred: np.array) -> float:
    """Return the root mean squared log error between ``y_true`` and ``y_pred``.

    Predictions are clipped at 0 from below before scoring; ``y_true`` is
    passed to ``mean_squared_log_error`` unchanged.
    """
    non_negative_pred = np.clip(y_pred, 0, None)
    msle = mean_squared_log_error(y_true, non_negative_pred)
    return np.sqrt(msle)

## Train with FLAML

> NOTE: I train a separate predictor for each target variable.

In [None]:
# Importing AutoML from flaml

from flaml import AutoML

### Training Model for `target_carbon_monoxide`

In [None]:
carbon_monoxide_predictor = AutoML()

# Keyword arguments forwarded to AutoML.fit() for this target.
settings = {
    "metric": 'mse',  # primary metric for regression, e.g. 'mae', 'mse', 'r2'
    "estimator_list": ['lgbm', 'xgboost', 'catboost', 'rf'],  # learners FLAML is allowed to try
    "task": 'regression',  # task type
    "log_file_name": 'carbon_monoxide_predictor.log',  # flaml log file
    "time_budget": 60,  # search budget in seconds, set explicitly so it does not depend on the installed FLAML version's default
    "seed": 42,  # fixed seed for the hyperparameter search so reruns are comparable
}

In [None]:
# Run the FLAML search on the training split for the carbon monoxide target; all other options come from `settings`.
carbon_monoxide_predictor.fit(X_train=X_train, y_train=y_train['target_carbon_monoxide'], **settings)

#### Retrieve Best Config

In [None]:
# Report the selected learner, its hyperparameter configuration and the training time of that best run.
print('Best estimator:', carbon_monoxide_predictor.best_estimator)
print('Best hyperparmeter config:', carbon_monoxide_predictor.best_config)
print('Training duration of best run: {0:.4g} s'.format(carbon_monoxide_predictor.best_config_train_time))

#### Compute Predictions of Testing Dataset

In [None]:
# Predict the held-out split for this target.
y_pred = carbon_monoxide_predictor.predict(X_test)

# RMSLE is computed on predictions clipped at 0 (inside root_mean_squared_log_error); MSE uses them as-is.
rmsle = root_mean_squared_log_error(y_test['target_carbon_monoxide'], y_pred)
mse = mean_squared_error(y_test['target_carbon_monoxide'], y_pred)

print(f"MSE: {mse} || RMSLE: {rmsle}")

### Training Model for `target_benzene`

In [None]:
benzene_predictor = AutoML()

# Keyword arguments forwarded to AutoML.fit() for this target.
settings = {
    "metric": 'mse',  # primary metric for regression, e.g. 'mae', 'mse', 'r2'
    "estimator_list": ['lgbm', 'xgboost', 'catboost', 'rf'],  # learners FLAML is allowed to try
    "task": 'regression',  # task type
    "log_file_name": 'benzene_predictor.log',  # flaml log file
    "time_budget": 60,  # search budget in seconds, set explicitly so it does not depend on the installed FLAML version's default
    "seed": 42,  # fixed seed for the hyperparameter search so reruns are comparable
}

In [None]:
# Run the FLAML search on the training split for the benzene target; all other options come from `settings`.
benzene_predictor.fit(X_train=X_train, y_train=y_train['target_benzene'], **settings)

#### Retrieve Best Config

In [None]:
# Report the selected learner, its hyperparameter configuration and the training time of that best run.
print('Best estimator:', benzene_predictor.best_estimator)
print('Best hyperparmeter config:', benzene_predictor.best_config)
print('Training duration of best run: {0:.4g} s'.format(benzene_predictor.best_config_train_time))

#### Compute Predictions of Testing Dataset

In [None]:
# Predict the held-out split for this target.
y_pred = benzene_predictor.predict(X_test)

# RMSLE is computed on predictions clipped at 0 (inside root_mean_squared_log_error); MSE uses them as-is.
rmsle = root_mean_squared_log_error(y_test['target_benzene'], y_pred)
mse = mean_squared_error(y_test['target_benzene'], y_pred)

print(f"MSE: {mse} || RMSLE: {rmsle}")

### Training Model for `target_nitrogen_oxides`

In [None]:
nitrogen_oxides_predictor = AutoML()

# Keyword arguments forwarded to AutoML.fit() for this target.
settings = {
    "metric": 'mse',  # primary metric for regression, e.g. 'mae', 'mse', 'r2'
    "estimator_list": ['lgbm', 'xgboost', 'catboost', 'rf'],  # learners FLAML is allowed to try
    "task": 'regression',  # task type
    "log_file_name": 'nitrogen_oxides_predictor.log',  # flaml log file
    "time_budget": 60,  # search budget in seconds, set explicitly so it does not depend on the installed FLAML version's default
    "seed": 42,  # fixed seed for the hyperparameter search so reruns are comparable
}

In [None]:
# Run the FLAML search on the training split for the nitrogen oxides target; all other options come from `settings`.
nitrogen_oxides_predictor.fit(X_train=X_train, y_train=y_train['target_nitrogen_oxides'], **settings)

#### Retrieve Best Config

In [None]:
# Report the selected learner, its hyperparameter configuration and the training time of that best run.
print('Best estimator:', nitrogen_oxides_predictor.best_estimator)
print('Best hyperparmeter config:', nitrogen_oxides_predictor.best_config)
print('Training duration of best run: {0:.4g} s'.format(nitrogen_oxides_predictor.best_config_train_time))

#### Compute Predictions of Testing Dataset

In [None]:
# Predict the held-out split for this target.
y_pred = nitrogen_oxides_predictor.predict(X_test)

# RMSLE is computed on predictions clipped at 0 (inside root_mean_squared_log_error); MSE uses them as-is.
rmsle = root_mean_squared_log_error(y_test['target_nitrogen_oxides'], y_pred)
mse = mean_squared_error(y_test['target_nitrogen_oxides'], y_pred)

print(f"MSE: {mse} || RMSLE: {rmsle}")

## Preparing Submission file

In [None]:
# Load the test CSV and preview its first rows.
test_dataset = pd.read_csv(TEST_CSV)
test_dataset.head()

In [None]:
# Apply the same feature extraction that was used for the training data, so the feature columns exist here too.
test_dataset = get_processed_dataset(test_dataset)

test_dataset.head()

### Getting predictions from Trained Models

In [None]:
# Predict every target for the test set from the same feature columns used in training.
# Predictions are clipped at 0, mirroring the clipping root_mean_squared_log_error applies
# when the hold-out RMSLE is reported, so no negative value ends up in the submission.
test_features = test_dataset[feature_columns]
carbon_monoxide_pred = np.clip(carbon_monoxide_predictor.predict(test_features), 0, None)
benzene_pred = np.clip(benzene_predictor.predict(test_features), 0, None)
nitrogen_oxides_pred = np.clip(nitrogen_oxides_predictor.predict(test_features), 0, None)

In [None]:
# Assemble the submission frame: the test timestamps plus one prediction column per target.
submission_df = pd.DataFrame({"date_time": test_dataset['date_time'],
                             "target_carbon_monoxide": carbon_monoxide_pred,
                             "target_benzene": benzene_pred,
                             "target_nitrogen_oxides": nitrogen_oxides_pred})
submission_df.head()

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission_df.to_csv("submission.csv",index=False)