In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.metrics import mean_squared_log_error

# Widen pandas' display limits so long/wide frames are printed in full,
# and render every float with five decimal places.
for _option, _value in (
    ('display.max_rows', 100),
    ('display.max_columns', 500),
    ('display.max_colwidth', None),
    ('display.float_format', lambda x: '%.5f' % x),
):
    pd.set_option(_option, _value)

## **Data import**

In [None]:
# Read both competition files first (pass nrows=10000 to read_csv for a quick dry run) ...
train = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/train.csv", low_memory=False)
test = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/test.csv", low_memory=False)
# ... then parse the timestamp column of each frame with an explicit format
for frame in (train, test):
    frame["date_time"] = pd.to_datetime(frame["date_time"], format="%Y-%m-%d %H:%M:%S")
# Dtypes, non-null counts and deep memory usage of the training frame
train.info(memory_usage="deep")

In [None]:
# Dtypes, non-null counts and deep memory usage of the test frame
test.info(memory_usage="deep")

In [None]:
# Preview the first ten training rows
train.head(10)

# **EDA**
Thanks to: @Maxim Kazantsev https://www.kaggle.com/maximkazantsev/tps-07-21-eda-catboost

In [None]:
# Target column names, and their display titles in the same order (used in plot labels)
targets = [
    "target_carbon_monoxide",
    "target_benzene",
    "target_nitrogen_oxides",
]
target_names = [
    "Carbon monoxide",
    "Benzene",
    "Nitrogen oxides",
]

In [None]:
def make_new_features(df):
    """Add calendar features derived from ``df["date_time"]``.

    The columns ``month``, ``day_of_week``, ``day_of_year``, ``hour``,
    ``quarter``, ``week_of_year``, ``working_hours`` and ``is_weekend`` are
    written into ``df`` itself (the frame is modified in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date_time`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    stamp = df["date_time"].dt
    df["month"] = stamp.month
    df["day_of_week"] = stamp.dayofweek  # Monday = 0 ... Sunday = 6
    df["day_of_year"] = stamp.dayofyear
    df["hour"] = stamp.hour
    df["quarter"] = stamp.quarter
    df["week_of_year"] = stamp.isocalendar().week.astype("int")  # ISO week number
    # 1 for hours 08..20 inclusive, 0 otherwise
    df["working_hours"] = df["hour"].isin(np.arange(8, 21, 1)).astype("int")
    # Must be computed from `df` itself. Reading the global `train` here (as the
    # previous version did) gave every other frame train's weekend flags,
    # matched by row index instead of by date.
    df["is_weekend"] = (stamp.dayofweek >= 5).astype("int")
    return df

In [None]:
# Keep pristine copies of the loaded frames. They must be taken BEFORE the calls
# below, because make_new_features writes its new columns into the frame it receives.
train_copy = train.copy()
test_copy = test.copy()
# Add the calendar features to the working frames used by the EDA plots
train = make_new_features(train)
test = make_new_features(test)

The datasets have timestamps. Let's compare which dates are in each dataset.

In [None]:
# Horizontal bar comparing the row counts (and date span) of the train and test sets.
# Sizes and tick dates are read from the frames instead of being typed in by hand,
# so the figure stays truthful if the data are reloaded (e.g. with `nrows`).
n_train, n_test = len(train), len(test)
date_ticks = [train["date_time"].min(), train["date_time"].max(), test["date_time"].max()]

fig, ax = plt.subplots(figsize=(16, 1.5))
# The full-length (train + test) bar is drawn first; the train bar then covers its left part
bar1 = ax.barh(0, n_train + n_test, color="salmon", height=0.2)
bar2 = ax.barh(0, n_train, color="teal", height=0.2)
ax.set_title("Train and test datasets size comparison", fontsize=20, pad=5)
# Negative padding pulls the "edge" label back inside the visible test segment
ax.bar_label(bar1, ["Test dataset"], label_type="edge", padding=-170,
             fontsize=20, color="white", weight="bold")
ax.bar_label(bar2, ["Train dataset"], label_type="center",
             fontsize=20, color="white", weight="bold")
ax.set_xticks([0, n_train, n_train + n_test])
ax.set_xticklabels([stamp.strftime("%Y-%m-%d") for stamp in date_ticks])
ax.set_yticks([])
plt.show();

## Target plots

The datasets also have three target columns that the model has to predict. Let's see how each target changes over time.

In [None]:
# One panel per target: the raw readings over the whole training period
fig, axs = plt.subplots(figsize=(16, 18), ncols=1, nrows=3, sharex=False)

plt.subplots_adjust(hspace = 0.3)

colors = ["palevioletred", "deepskyblue", "mediumseagreen"]

for i, (panel, column, title, color) in enumerate(zip(axs, targets, target_names, colors)):
    panel.plot(train["date_time"], train[column], color=color)
    panel.set_title(f"{title} (target #{i+1}) levels across time", fontsize=20, pad=5)
    panel.set_ylabel(f"{title} level", fontsize=14, labelpad=5)
    panel.set_xlabel("Date", fontsize=14, labelpad=5)
    panel.grid(axis="both")

plt.show();

Let's see mean target values per day of year.

In [None]:
# Working copy without row 7110 (the last row, the only one that falls in January)
df = train.drop([7110], axis=0).copy()
df["day"] = df["date_time"].dt.dayofyear
df["weekday"] = df["date_time"].dt.dayofweek

colors = ["palevioletred", "deepskyblue", "mediumseagreen"]

# Day-of-year numbers that are Mondays; used as tick positions marking week starts
mondays = df.loc[df["weekday"] == 0, "day"].value_counts(sort=False).index
# Week-of-year numbers used as tick labels (the first week found is skipped)
weeks = df["date_time"].dt.isocalendar().week.unique()[1:]

fig, axs = plt.subplots(figsize=(16, 18), ncols=1, nrows=3, sharex=False)

plt.subplots_adjust(hspace = 0.3)

for i, panel in enumerate(axs):
    # Mean of the target over all readings taken on the same day of the year
    daily_mean = df.groupby("day")[targets[i]].mean()
    panel.plot(daily_mean.index, daily_mean.values, color=colors[i])
    panel.set_title(f"{target_names[i]} (target #{i+1}) mean levels across time", fontsize=20, pad=5)
    panel.set_ylabel(f"{target_names[i]} level", fontsize=14, labelpad=5)
    panel.set_xlabel("Week starts", fontsize=14, labelpad=5)
    panel.set_xticks(mondays)
    panel.set_xticklabels(weeks)
    panel.grid(axis="both")

plt.show();

As you can see, all target values usually go down at the end of each week (i.e. during weekends). 

Let's check the distribution of each target within each month.

In [None]:
# First target, one panel per calendar month (panels are numbered 3 through 12)
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.subplots_adjust(hspace = 0.3)
fig.suptitle(target_names[0], fontsize=20)

# axs.flat walks the 5x2 grid row by row, i.e. in the same order as nested row/column loops
for i, panel in enumerate(axs.flat, start=3):
    # The series is drawn against its row index, so x is the row position within `train`
    panel.plot(train.loc[train["month"]==i, targets[0]], color="steelblue", label=target_names[0])
    panel.set_title(f"Month #{i}", fontsize=15)
    # The line needs a label: legend() on an axes without labelled artists draws
    # nothing and only emits a warning
    panel.legend(fontsize=13)

In [None]:
# Second target, one panel per calendar month (panels are numbered 3 through 12)
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.subplots_adjust(hspace = 0.3)
fig.suptitle(target_names[1], fontsize=20)

# axs.flat walks the 5x2 grid row by row, i.e. in the same order as nested row/column loops
for i, panel in enumerate(axs.flat, start=3):
    # The series is drawn against its row index, so x is the row position within `train`
    panel.plot(train.loc[train["month"]==i, targets[1]], color="palevioletred", label=target_names[1])
    panel.set_title(f"Month #{i}", fontsize=15)
    # The line needs a label: legend() on an axes without labelled artists draws
    # nothing and only emits a warning
    panel.legend(fontsize=13)

There are some near-zero flat areas in the plots for the 4th, 6th, 8th and 12th months. We need to figure out what is so special about these days. It may also be garbage data which should be deleted before machine learning.

In [None]:
# Third target, one panel per calendar month (panels are numbered 3 through 12)
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
# NOTE(review): this changes matplotlib's default colormap; the lines below use an
# explicit color, so it does not appear to influence this figure — confirm before removing
plt.set_cmap("Set2")
plt.subplots_adjust(hspace = 0.3)
fig.suptitle(target_names[2], fontsize=20)

# axs.flat walks the 5x2 grid row by row, i.e. in the same order as nested row/column loops
for i, panel in enumerate(axs.flat, start=3):
    # The series is drawn against its row index, so x is the row position within `train`
    panel.plot(train.loc[train["month"]==i, targets[2]], color="goldenrod", label=target_names[2])
    panel.set_title(f"Month #{i}", fontsize=15)
    # The line needs a label: legend() on an axes without labelled artists draws
    # nothing and only emits a warning
    panel.legend(fontsize=13)

Let's check the distribution of each target's values.

In [None]:
# 60-bin histogram of every target, side by side, each with its own y-axis
fig, axs = plt.subplots(figsize=(15, 6), ncols=3, nrows=1, sharey=False)

fig.suptitle("Target values distribution", fontsize=20)

colors = ["mediumorchid", "lightseagreen", "cornflowerblue"]

for i, panel in enumerate(axs):
    panel.hist(train[targets[i]], bins=60, edgecolor="black", color=colors[i])
    panel.set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=5)
    panel.set_ylabel("Amount of values", fontsize=13, labelpad=5)
    panel.set_xlabel(f"{target_names[i]} level", fontsize=13, labelpad=5)
    panel.grid(axis="y")

plt.show();

Let's check how each target value changes depending on the time of day, day of week, and month.

In [None]:
# Mean and median of every target per hour of the day, drawn as paired bars
fig, axs = plt.subplots(figsize=(16, 18), ncols=1, nrows=3, sharex=False)

plt.subplots_adjust(hspace = 0.3)
width=0.35
by_hour = train.groupby("hour")
# Hour values present in the data; they serve as the bar-pair centres
x = by_hour["target_carbon_monoxide"].mean().index

for i, panel in enumerate(axs):
    hourly = by_hour[targets[i]]
    # Mean bar sits half a width to the left of the hour, median bar half a width to the right
    bars1 = panel.bar(x-width/2, hourly.mean(),
                      width=width, edgecolor="black", label="Mean", color="cornflowerblue")
    bars2 = panel.bar(x+width/2, hourly.median(),
                      width=width, edgecolor="black", label="Median", color="palevioletred")
    panel.set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=10)
    panel.set_ylabel("Target value", fontsize=13, labelpad=5)
    panel.set_xlabel("Day hours", fontsize=13, labelpad=5)
    panel.set_xticks(x)
    panel.grid(axis="y")
    panel.legend(fontsize=13)


In [None]:
# Working copy without row 7110 (the last row, the only one that falls in January)
df = train.drop([7110], axis=0).copy()

fig, axs = plt.subplots(figsize=(16, 19), ncols=2, nrows=3, sharex=False,
                        gridspec_kw={'width_ratios': [1, 1.5]})

fig.suptitle("Target values distribution per month and day of week", fontsize=20)

plt.subplots_adjust(hspace = 0.25)
width=0.35

# Left grid column: grouped by day of week; right grid column: grouped by month.
# Each entry is (grid column, grouping key, x-axis label).
for col, key, axis_label in [(0, "day_of_week", "Day of week"), (1, "month", "Month")]:
    grouped = df.groupby(key)
    x = grouped["target_carbon_monoxide"].mean().index
    if key == "day_of_week":
        # Shift the 0-based weekday numbers so the ticks read 1..7
        x = x + 1
    for i in np.arange(3):
        panel = axs[i, col]
        bars1 = panel.bar(x-width/2, grouped[targets[i]].mean(),
                          width=width, edgecolor="black", label="Mean", color="salmon")
        bars2 = panel.bar(x+width/2, grouped[targets[i]].median(),
                          width=width, edgecolor="black", label="Median", color="teal")
        panel.set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=10)
        panel.set_ylabel("Target value", fontsize=13, labelpad=5)
        panel.set_xlabel(axis_label, fontsize=13, labelpad=5)
        panel.set_xticks(x)
        panel.grid(axis="y")
        panel.legend(fontsize=13)


In [None]:
# Day hours which will be used for plotting data, and the bar colour used for each of them
hours = [0, 5, 8, 14, 19]
hour_colors = ["salmon", "sandybrown", "teal", "palevioletred", "mediumslateblue"]
# Rows at the selected hours, without row 7110 (the last row, the only one in January)
df = train.loc[train["hour"].isin(hours)].drop([7110], axis=0).copy()

fig, axs = plt.subplots(figsize=(16, 18), ncols=2, nrows=3, sharex=False,
                        gridspec_kw={'width_ratios': [1, 1.5]})

fig.suptitle("Target values distribution per month and day of week at given hours", fontsize=20)

plt.subplots_adjust(hspace = 0.3)
width=0.15

# Left grid column: grouped by day of week; right grid column: grouped by month.
# Each entry is (grid column, grouping key, x-axis label).
for col, key, axis_label in [(0, "day_of_week", "Day of week"), (1, "month", "Month")]:
    # Sorted for BOTH keys: groupby returns its groups in sorted key order, so the bar
    # positions must be sorted too or heights could be paired with the wrong tick.
    x = np.sort(df[key].unique())
    if key == "day_of_week":
        # Shift the 0-based weekday numbers so the ticks read 1..7
        x = x + 1
    for i in np.arange(3):
        panel = axs[i, col]
        # One bar per selected hour, spread symmetrically around each tick
        for k, (hour, color) in enumerate(zip(hours, hour_colors)):
            offset = (k - (len(hours) - 1) / 2) * width
            heights = df.loc[df["hour"] == hour].groupby(key)[targets[i]].mean()
            panel.bar(x + offset, heights, width=width, edgecolor="black",
                      label=f"{hour:02d}:00", color=color)
        panel.set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=10)
        panel.set_ylabel("Target value", fontsize=13, labelpad=5)
        panel.set_xlabel(axis_label, fontsize=13, labelpad=5)
        panel.set_xticks(x)
        panel.grid(axis="y")
        panel.legend(fontsize=10)

## Feature plots

In [None]:
# Lists of feature names to be used for plots below.
# Each "*_names" list holds the display titles of the matching column list, in the same order.
weather_features = ["deg_C", "relative_humidity", "absolute_humidity"]
weather_feature_names = ["Temperature (deg. C)", "Relative humidity", "Absolute humidity"]

sensor_features = ["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"]
# "Sensor 2" was previously spelled "Sensor_2", unlike the other four labels
sensor_feature_names = ["Sensor 1", "Sensor 2", "Sensor 3", "Sensor 4", "Sensor 5"]

# The combined lists are built from the two groups so the three lists cannot drift apart
all_features = weather_features + sensor_features
all_feature_names = weather_feature_names + sensor_feature_names

Let's compare our train and test feature data.

In [None]:
# One panel per feature: train readings in colour, test readings in black, on a common date axis
fig, axs = plt.subplots(figsize=(16, 30), ncols=1, nrows=8, sharex=False)

plt.subplots_adjust(hspace = 0.4)

colors = ["lightcoral", "sandybrown", "darkorange", "mediumseagreen",
          "lightseagreen", "cornflowerblue", "mediumpurple", "palevioletred",
          "lightskyblue", "sandybrown", "yellowgreen", "indianred",
          "lightsteelblue", "mediumorchid", "deepskyblue"]

for i, (column, title) in enumerate(zip(all_features, all_feature_names)):
    panel = axs[i]
    # Thick proxy lines so the legend swatches are easy to see
    legend_lines = [Line2D([0], [0], color=colors[i], lw=10),
                    Line2D([0], [0], color="black", lw=10)]
    panel.plot(train["date_time"], train[column], color=colors[i], label="Train data")
    panel.plot(test["date_time"], test[column], color="black", label="Test data")
    panel.set_title(f"{title} levels across time", fontsize=20, pad=5)
    panel.set_ylabel(f"{title} level", fontsize=14, labelpad=5)
    panel.set_xlabel("Date", fontsize=14, labelpad=5)
    panel.legend(legend_lines, ["Train data", "Test data"], fontsize=12, loc=1)
    panel.grid(axis="both")

In [None]:
# Plot dataframe creation: the untouched train and test copies stacked on a fresh 0..n-1 index
df = pd.concat([train_copy, test_copy], axis=0, ignore_index=True)
df["week_of_year"] = df["date_time"].dt.isocalendar().week.astype("int")
df["day_of_year"] = df["date_time"].dt.dayofyear

fig, axs = plt.subplots(figsize=(16, 18), ncols=2, nrows=3, sharex=False)

plt.subplots_adjust(hspace = 0.4)

colors = ["palevioletred", "deepskyblue", "mediumseagreen"]

# (grid column, grouping key, first row of the black "test" line, title word, x-axis label)
# Days:  new year days start from the 7110th row.
# Weeks: new year weeks start from the 7159th row. Because Jan 1st and 2nd from the test
#        dataset are counted as the 52nd week of 2010, the coloured line contains some test data.
panels = [
    (0, "day_of_year", 7110, "daily", "Day of year"),
    (1, "week_of_year", 7159, "weekly", "Week of year"),
]

for col, key, split, period, axis_label in panels:
    for i in [0, 1, 2]:
        panel = axs[i, col]
        data = df.iloc[:split].groupby(key)[weather_features[i]].mean()
        panel.plot(data.index, data.values, color=colors[i], label="Train data")
        data = df.iloc[split:].groupby(key)[weather_features[i]].mean()
        panel.plot(data.index, data.values, color="black", alpha=0.7, label="Test data")
        panel.set_title(f"Mean {period} {weather_feature_names[i]} levels", fontsize=20, pad=5)
        panel.set_ylabel(f"{weather_feature_names[i]} level", fontsize=14, labelpad=5)
        panel.set_xlabel(axis_label, fontsize=14, labelpad=5)
        panel.grid(axis="both")
        panel.legend(fontsize=12)

plt.show();

In [None]:
# Plot dataframe creation: the untouched train and test copies stacked on a fresh 0..n-1 index
df = pd.concat([train_copy, test_copy], axis=0, ignore_index=True)
df["week_of_year"] = df["date_time"].dt.isocalendar().week.astype("int")
df["day_of_year"] = df["date_time"].dt.dayofyear

fig, axs = plt.subplots(figsize=(16, 30), ncols=2, nrows=5, sharex=False)

plt.subplots_adjust(hspace = 0.4)

colors = ["palevioletred", "deepskyblue", "mediumseagreen", "goldenrod", "indianred"]

# (grid column, grouping key, first row of the black "test" line, title word, x-axis label).
# The right-hand column groups by week, so its titles say "weekly"; they were
# previously a copy of the daily title.
panels = [
    (0, "day_of_year", 7110, "daily", "Day of year"),
    (1, "week_of_year", 7159, "weekly", "Week of year"),
]

for col, key, split, period, axis_label in panels:
    for i in np.arange(5):
        panel = axs[i, col]
        data = df.iloc[:split].groupby(key)[sensor_features[i]].mean()
        panel.plot(data.index, data.values, color=colors[i], label="Train data")
        data = df.iloc[split:].groupby(key)[sensor_features[i]].mean()
        panel.plot(data.index, data.values, color="black", alpha=0.7, label="Test data")
        panel.set_title(f"Mean {period} {sensor_feature_names[i]} levels", fontsize=20, pad=5)
        panel.set_ylabel(f"{sensor_feature_names[i]} level", fontsize=14, labelpad=5)
        panel.set_xlabel(axis_label, fontsize=14, labelpad=5)
        panel.grid(axis="both")
        panel.legend(fontsize=12)

plt.show();

#### ===To be continued===

# **Machine learning**

The datetime conversion shown below was found in this [notebook](https://www.kaggle.com/jarupula/eda-rf-model-tps-july-21).

In [None]:
# Feature engineering: the same four columns are added to the train copy, then to the test copy
for frame in (train_copy, test_copy):
    stamp = frame.date_time.dt
    # Minutes elapsed since midnight
    frame['hr'] = stamp.hour*60+stamp.minute
    # 1 when weekday is 5 or 6, 0 for weekdays 0..4
    frame['day'] = stamp.weekday//5
    # True only when weekday == 5
    frame['satday'] = stamp.weekday==5
    # Same expression as 'hr'
    frame['hr1'] = stamp.hour*60+stamp.minute

In [None]:
# Keep the original test timestamps in a separate frame
preds = pd.DataFrame()
preds["date_time"] = test["date_time"].copy()

# Replace the timestamps of the modelling frames by float seconds since the Unix epoch.
# Each frame is converted from its own column (test_copy used to be filled from `test`).
for frame in (train_copy, test_copy):
    # Guard against re-running the cell: once converted, the column holds floats, and
    # casting those back to datetime64 would no longer yield the original timestamps.
    if pd.api.types.is_datetime64_any_dtype(frame['date_time']):
        frame['date_time'] = frame['date_time'].astype('datetime64[ns]').astype(np.int64)/10**9

In [None]:
# Dropping the last row as noise
X = train_copy.drop([7110], axis=0)

# The column list gets its own name. Re-binding the notebook-level `targets` list to a
# DataFrame (as before) breaks every EDA cell that indexes `targets[i]` when it is re-run.
target_columns = ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]
# The model is trained on log1p(target); predictions must be mapped back afterwards
y = np.log1p(X[target_columns])
X = X.drop(columns=target_columns)

# date_time is kept as a feature in both matrices
X_test = test_copy.copy()

print('X_train shape:', X.shape)
print('y_train shape:', y.shape)
print('X_test shape:', X_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler learns its statistics from the training matrix only;
# the test matrix is then transformed with those same statistics.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

## XGBoost(回歸器)

In [None]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor

# Build the XGBoost regressor used as the base estimator
xgbrModel = xgb.XGBRegressor(n_estimators=100)
# Wrap it so it can be fitted against all columns of y, then train on the training data
model = MultiOutputRegressor(xgbrModel)
model = model.fit(X, y)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# In-sample check: predictions are made for the same rows the model was fitted on,
# and both y and pred are on the log1p scale.
pred = model.predict(X)
train_mse = mean_squared_error(y, pred)
print("RMSE: ", sqrt(train_mse))
print("Score: ", model.score(X, y))

## 預測輸出
輸出 Y log1p()對數去偏 - 使用自然對數去除偏態，先加1再取對數，還原時先取指數後再減1。對於可能出現等於零的資料使用。

In [None]:
sub = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')
# The model was trained on log1p(target). expm1 is the exact inverse of log1p and,
# unlike exp(x) - 1, does not lose precision for predictions close to zero.
y_pred = np.expm1(model.predict(X_test))
# Prediction columns follow the column order of y used for training
submission_columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
for position, column in enumerate(submission_columns):
    sub[column] = y_pred[:, position]
sub.head()

In [None]:
# Write the submission file to the working directory, without the index column
sub.to_csv('submission.csv', index=False)