In [None]:
import warnings

import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Matplotlib settings
# Hide the right and top axes borders on every plot.
matplotlib.rc("axes.spines", right=False, top=False)
# Default figure size (width, height), large enough for the multi-panel plots below.
matplotlib.rc("figure", figsize=(12, 12))
matplotlib.rc("font", family="serif")

# Ignore non-critical warnings
# NOTE: this filter silences *every* warning category for the rest of the
# session, including deprecation notices from the libraries used below.
warnings.simplefilter("ignore")

In [None]:
# Directory holding the competition CSV files.
file_path = "/kaggle/input/tabular-playground-series-jul-2021/"

# Read the three CSV files the same way, each indexed by its "date_time" column.
train, test, sample_submission = (
    pd.read_csv(file_path + csv_name, index_col="date_time")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

# 1. Basic Exploratory Data Analysis

### 1.1 Check for missing values

In [None]:
# Number of missing values in each column of the training data.
train.isnull().sum()

In [None]:
# Number of missing values in each column of the test data.
test.isnull().sum()

### 1.2 Histograms

In [None]:
# Histograms of the training columns, 20 bins each.
# The result is assigned to `_` so the returned axes are not echoed as cell output.
_ = train.hist(bins=20)

Most of the *independent variables (predictors)* are **relatively normally distributed**, which is good.

The *target variables (dependent variables)* appear **skewed to the right**. This can be counteracted using *transformations*, e.g. taking their *logarithms* or *square roots*.

### 1.3 Box-plots

In [None]:
# One box-plot per training column, each in its own subplot on a 3 x 4 grid.
# The result is assigned to `_` so the returned axes are not echoed as cell output.
_ = train.plot(kind="box", layout=(3, 4), subplots=True)

The majority of the *sensor data* and the *target variables* have **numerous outliers**. This is not surprising, considering that these variables exhibit skewness.

### 1.4 Correlation matrix

In [None]:
# Annotated heatmap of the pairwise correlations between the training columns;
# the diverging colour map is centred on zero correlation.
_ = sns.heatmap(train.corr(), annot=True, center=0, cmap="coolwarm", square=True)

The sensors exhibit relatively high correlation among themselves.

# 2. Modelling & Prediction

In [None]:
# Every column except the last three is a predictor; the last three are the targets.
X, y = train.iloc[:, :-3], train.iloc[:, -3:]

# Hold out part of the labelled data for validation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

In [None]:
def fit_and_score_model(estimator, params):
    """Grid-search the hyper-parameters of ``estimator`` on log1p-transformed targets.

    The search is fitted on the notebook-level ``X_train``/``y_train`` split and
    then scored once on the held-out ``X_test``/``y_test`` split. The best
    cross-validation score, the hold-out score and the best parameters are
    printed.

    Parameters
    ----------
    estimator : estimator object
        An estimator object implementing "fit" and "predict".
    params : dict
        Parameter grid: maps each parameter name to the values to try.

    Returns
    -------
    TransformedTargetRegressor
        The fitted model. Its ``regressor_`` attribute is the fitted
        ``GridSearchCV`` (exposing ``best_score_`` and ``best_params_``).
    """
    # The search runs *inside* TransformedTargetRegressor, so it only ever sees
    # log1p(y). On that scale the mean squared error equals the mean squared log
    # error of the back-transformed predictions, so plain MSE is the right
    # scorer; scoring with MSLE here would apply the logarithm a second time
    # and select hyper-parameters on the wrong metric.
    model_search = GridSearchCV(
        estimator,
        param_grid=params,
        scoring="neg_mean_squared_error",
        cv=4,
        n_jobs=4
    )
    # Apply a log transformation on target variables to counteract skewness
    model = TransformedTargetRegressor(
        regressor=model_search, func=np.log1p, inverse_func=np.expm1
    )
    model.fit(X_train, y_train)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    test_score = mean_squared_log_error(y_test, model.predict(X_test))

    # best_score_ comes from a "neg_" scorer: it is the negated error, so
    # values closer to zero are better.
    print(f"Best Score: {model.regressor_.best_score_}")
    print(f"Test Score: {test_score}")
    print(f"Best Params: {model.regressor_.best_params_}")

    return model


# # Un-comment the below code to perform a grid-search for hyper-parameters.
# # Be warned: it can take quite a long time.
#
# et = ExtraTreesRegressor(random_state=3)
# params = dict(
#     n_estimators=range(100, 1001, 100),
#     criterion=["absolute_error", "squared_error"],  # "mae"/"mse" before scikit-learn 1.0
#     max_depth=[None, 3, 5, 7],
#     min_samples_split=[2, 5, 0.001]
# )
# et_model = fit_and_score_model(et, params)
# et_predictions = et_model.predict(test)
# et_predictions[:5]

In [None]:
# Apply a log transformation when fitting to counteract skewness: the regressor
# is trained on log1p(y) and its predictions are mapped back with expm1.
et_model = TransformedTargetRegressor(
    regressor=ExtraTreesRegressor(
        n_estimators=1000,
        # "mae" was renamed to "absolute_error" in scikit-learn 1.0 and the old
        # spelling is rejected from 1.2 onwards.
        criterion="absolute_error",
        min_samples_split=0.001,
        n_jobs=4,
        # Seed the randomised tree construction so the scores and the
        # submission are reproducible from run to run.
        random_state=3,
    ),
    func=np.log1p,
    inverse_func=np.expm1
)
et_model.fit(X_train, y_train)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"""\
Train score: {mean_squared_log_error(y_train, et_model.predict(X_train))}
Test score: {mean_squared_log_error(y_test, et_model.predict(X_test))}
""")
et_predictions = et_model.predict(test)
et_predictions[:5]

In [None]:
# Create submission file
# Overwrite every value of the sample submission with the model's predictions,
# keeping its "date_time" index and column headers, then write it to disk.
# NOTE(review): assumes `et_predictions` has the same shape and column order as
# `sample_submission` — confirm against the competition files.
sample_submission.iloc[:, :] = et_predictions
sample_submission.to_csv("submission.csv")