# TPS March 2022  
- Make baseline model using LightGBM  
- Feature engineering: lag features, rolling features, one-hot encoding  

## Import Library  

In [None]:
import os
import gc
import warnings
from typing import List
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, RobustScaler
import lightgbm as lgb
import optuna

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
optuna.logging.disable_default_handler()  # Don't display optuna log
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Read Data  

In [None]:
# Directory that holds the competition CSV files.
DIR = "../input/tabular-playground-series-mar-2022"
train_df = pd.read_csv(
    os.path.join(DIR, "train.csv")
)

# Eight unseeded random integers in [0, 10000), drawn once per session.
random_state_list = np.random.randint(low=0, high=1e+4, size=8)

## Feature Engineering   
- make new features  
- lag features  
- moving average/std  
- one-hot-encoding  

### Functions for feature engineering

In [None]:
def make_new_columns(df):
    """Add the region identifier columns to ``df`` in place and return it.

    ``region_xy`` is the string concatenation of ``x`` and ``y``;
    ``region`` is that same string followed by ``direction``.
    """
    xy_label = df['x'].astype(str) + df['y'].astype(str)
    df['region_xy'] = xy_label
    df['region'] = xy_label + df['direction']
    return df

In [None]:
def _add_lag_columns(df, windows=(2, 3, 6, 72)):
    """Add per-region lag and trailing-window columns of ``congestion`` to ``df``.

    ``df`` must have a unique index; rows are assumed to be in time order
    within each region.
    """
    congestion = df.groupby("region", sort=False)["congestion"]
    df["lag_1"] = congestion.shift(1)
    for i in windows:
        df[f"lag_{i}"] = congestion.shift(i)
        # Window over the current and previous rows, then shift by one so the
        # feature at row t only sees values up to t-1.
        df[f"lag_avg{i}"] = congestion.transform(
            lambda s, w=i: s.rolling(window=w, min_periods=1).mean().shift(1))
        df[f"lag_std{i}"] = congestion.transform(
            lambda s, w=i: s.rolling(window=w, min_periods=1).std().shift(1))
    return df


def make_lag_features(train_df, test_df=None):
    """Build lag / rolling-mean / rolling-std features of ``congestion`` per region.

    For every region the following columns are created:

    - ``lag_1`` and ``lag_{i}``: the value 1 / ``i`` rows earlier,
    - ``lag_avg{i}``: mean of up to ``i`` preceding values,
    - ``lag_std{i}``: standard deviation of up to ``i`` preceding values,

    for ``i`` in 2, 3, 6 and 72.  Rows without enough history hold NaN.

    Parameters
    ----------
    train_df : DataFrame
        Needs ``row_id``, ``region`` and ``congestion`` columns.
    test_df : DataFrame, optional
        Needs ``row_id`` and ``region``.  When given, it is stacked below
        ``train_df`` so that the first test row of each region sees the last
        training values; a missing ``congestion`` column becomes NaN.

    Returns
    -------
    DataFrame or (DataFrame, DataFrame)
        The frame(s) sorted by ``row_id``.  With ``test_df`` the combined
        frame is split positionally after ``len(train_df)`` rows, which
        assumes every test ``row_id`` is larger than every train ``row_id``.
        The input frames are not modified.
    """
    if test_df is None:
        combined = train_df.reset_index(drop=True)
    else:
        combined = pd.concat((train_df, test_df)).reset_index(drop=True)

    combined = _add_lag_columns(combined)
    combined = combined.sort_values(by="row_id").reset_index(drop=True)

    if test_df is None:
        return combined

    train_size = train_df.shape[0]
    return combined.iloc[:train_size], combined.iloc[train_size:]

In [None]:
def transform_times(df):
    """Derive calendar and cyclical time features from ``df['time']`` in place.

    ``time`` is converted to datetime, then ``month``, ``day``, ``hour``,
    ``minute`` and ``weekday`` (Monday = 0) are extracted.  Each of month,
    day, hour and minute is additionally encoded as a sine/cosine pair so
    that the end of a cycle sits next to its start.

    The day-of-month cycle uses the real length of each month (28-31 days,
    leap years included) rather than a hand-written list of 30-day months.

    Returns the same ``df`` object with the new columns added.
    """
    df['time'] = pd.to_datetime(df['time'])
    when = df['time'].dt

    df['month'] = when.month
    df['day'] = when.day
    df['hour'] = when.hour
    df['minute'] = when.minute

    # transform using trigonometric function
    day_angle = (2*np.pi*df['day']) / when.days_in_month
    df['day_sin'] = np.sin(day_angle)
    df['day_cos'] = np.cos(day_angle)

    df['month_sin'] = np.sin(2*np.pi*df['month'] / 12)
    df['month_cos'] = np.cos(2*np.pi*df['month'] / 12)
    df['hour_sin'] = np.sin(2*np.pi*df['hour'] / 24)
    df['hour_cos'] = np.cos(2*np.pi*df['hour'] / 24)
    df['minute_sin'] = np.sin(2*np.pi*df['minute'] / 60)
    df['minute_cos'] = np.cos(2*np.pi*df['minute'] / 60)

    df["weekday"] = when.weekday

    return df

In [None]:
def aggregation(train_df, test_df=None):
    """Attach per-group ``congestion`` statistics computed on ``train_df``.

    A ``region_weekday`` key (``region`` followed by the weekday as text) is
    added first.  The mean, median and standard deviation of ``congestion``
    are then computed on the training rows for each ``region`` and each
    ``region_weekday`` and mapped back onto the rows.  When ``test_df`` is
    given it receives the same columns, looked up from the training
    statistics; keys unseen in training map to NaN.

    Both frames are modified in place.  Returns ``train_df`` alone, or the
    pair ``(train_df, test_df)`` when a test frame was supplied.
    """
    # (output column, grouping key, statistic)
    feature_table = (
        ("region_target_mean", "region", "mean"),
        ("region_target_median", "region", "median"),
        ("region_target_std", "region", "std"),
        ("region_weekday_target_mean", "region_weekday", "mean"),
        ("region_weekday_target_median", "region_weekday", "median"),
        ("region_weekday_target_std", "region_weekday", "std"),
    )

    def _attach(frame, lookup):
        for out_column, key, stat in feature_table:
            frame[out_column] = frame[key].map(lookup[key][stat])

    train_df["region_weekday"] = train_df["region"] + train_df["weekday"].astype(str)

    # One table of statistics per grouping key, indexed by the key's values.
    lookup = {
        key: train_df.groupby(key)["congestion"].agg(["mean", "median", "std"])
        for key in ("region", "region_weekday")
    }
    _attach(train_df, lookup)

    if test_df is None:
        return train_df

    test_df["region_weekday"] = test_df["region"] + test_df["weekday"].astype(str)
    _attach(test_df, lookup)
    return train_df, test_df

In [None]:
def expand_feature(train_df, test_df=None):
    """Add ``expand_congestion``: the running median of earlier ``congestion``.

    Within each region, row ``t`` receives the median of that region's
    ``congestion`` over rows ``0 .. t-1``.  The first row of a region has no
    history and is set to 0.

    When ``test_df`` is given it is stacked below ``train_df``.  Only the
    first test row of each region gets a value (the median of all training
    rows of that region); later test rows are left at 0.  Regions without
    any test row are handled without creating extra rows.

    Parameters
    ----------
    train_df : DataFrame
        Needs ``row_id``, ``region`` and ``congestion`` columns.
    test_df : DataFrame, optional
        Needs ``row_id`` and ``region``.

    Returns
    -------
    DataFrame or (DataFrame, DataFrame)
        The frame(s) sorted by ``row_id``; with ``test_df`` the combined
        frame is split positionally after ``len(train_df)`` rows, which
        assumes test ``row_id`` values follow the training ones.  The input
        frames are not modified.
    """
    train_size = train_df.shape[0]
    if test_df is None:
        combined = train_df.reset_index(drop=True)
    else:
        combined = pd.concat((train_df, test_df)).reset_index(drop=True)

    by_region = combined.groupby("region", sort=False)

    # Expanding median including the current row, shifted so row t sees 0..t-1.
    prior_median = by_region["congestion"].transform(
        lambda s: s.expanding().median().shift(1))
    prior_median = prior_median.mask(by_region.cumcount() == 0, 0)

    if test_df is not None:
        # Training rows come first in `combined`, so position marks test rows.
        is_test = pd.Series(combined.index >= train_size, index=combined.index)
        test_rank = is_test.astype(int).groupby(combined["region"], sort=False).cumsum()
        # Only the first test row per region is filled; later ones stay at 0.
        prior_median = prior_median.mask(is_test & (test_rank > 1), 0)

    combined["expand_congestion"] = prior_median
    combined = combined.sort_values(by="row_id").reset_index(drop=True)

    if test_df is None:
        return combined
    return combined.iloc[:train_size], combined.iloc[train_size:]

In [None]:
def preprocessing(train_df, test_df=None):
    """Run the full feature-engineering pipeline.

    Steps, in order: region identifier columns, lag / rolling features,
    expanding median, time features, and per-group target statistics.

    Returns the transformed ``train_df``, or ``(train_df, test_df)`` when a
    test frame is supplied.
    """
    with_test = test_df is not None

    # make new columns
    train_df = make_new_columns(train_df)
    if with_test:
        test_df = make_new_columns(test_df)

    # make lag features and expanding feature
    if with_test:
        train_df, test_df = make_lag_features(train_df, test_df)
        train_df, test_df = expand_feature(train_df, test_df)
    else:
        train_df = make_lag_features(train_df)
        train_df = expand_feature(train_df)

    # transform time feature by using trigonometric function
    train_df = transform_times(train_df)
    if with_test:
        test_df = transform_times(test_df)

    # make aggregation features
    if with_test:
        return aggregation(train_df, test_df)
    return aggregation(train_df)

In [None]:
%%time
# Build all engineered features on the training data only; the trailing
# expression displays the resulting frame.
trans_train = preprocessing(train_df)
trans_train

## evaluate cv score  

In [None]:
# Categorical columns; these are one-hot encoded before modelling.
cat_columns = ['direction', 'region_xy', 'weekday']

# Numeric feature columns, grouped by the step that creates them.
num_columns = (
    # plain lags
    ['lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_72']
    # trailing means
    + ['lag_avg2', 'lag_avg3', 'lag_avg6', 'lag_avg72']
    # trailing standard deviations
    + ['lag_std2', 'lag_std3', 'lag_std6', 'lag_std72']
    # cyclical time encodings
    + ['month_sin', 'month_cos', 'day_sin', 'day_cos',
       'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos']
    # per-group target statistics
    + ['region_target_mean', 'region_target_median', 'region_target_std',
       'region_weekday_target_mean', 'region_weekday_target_median',
       'region_weekday_target_std']
    # expanding median
    + ['expand_congestion']
)

# Raw columns used as-is, and the prediction target.
use_columns = ['x', 'y']
target = ['congestion']

In [None]:
# One-hot encode every categorical column and record the generated names.
dummy_columns = []
for column in cat_columns:
    tmp_train = pd.get_dummies(trans_train[column], prefix=column, drop_first=False)
    dummy_columns += list(tmp_train.columns)
    trans_train = pd.concat([trans_train, tmp_train], axis=1)

# Divide into train / validation / test by position: the last 2340 rows are
# the test part and the 2340 rows before them the validation part.
n_rows = trans_train.shape[0]
val_start = n_rows - 4680
test_start = n_rows - 2340

train_df = trans_train.iloc[:val_start, :]
val_df = trans_train.iloc[val_start:test_start, :]
test_df = trans_train.iloc[test_start:, :]

In [None]:
# Report the size of the full frame and of each split, one per line.
shape_report = [
    f"all train data shape: {trans_train.shape}",
    f"train data shape: {train_df.shape}",
    f"validation data shape: {val_df.shape}",
    f"test data shape: {test_df.shape}",
]
print("\n".join(shape_report))

In [None]:
def objective(trial):
    """Optuna objective: validation MAE of a LightGBM regressor.

    Samples a set of hyper-parameters from ``trial``, fits on the module-level
    ``train_df`` with early stopping against ``val_df`` (patience of 10
    rounds), and returns the mean absolute error on ``val_df``.

    Uses ``trial.suggest_float(..., log=True)`` and LightGBM callbacks
    instead of the deprecated ``suggest_loguniform`` and the
    ``early_stopping_rounds`` / ``verbose`` fit arguments that newer
    LightGBM releases no longer accept.
    """
    # setting optimization parameters
    params = {
        # reportedly works well at around 2 ** max_depth
        "num_leaves": trial.suggest_int("num_leaves", 10, 500),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1e-1, log=True),
        "metric": "mae"
    }

    # train models
    feature_columns = dummy_columns + num_columns + use_columns
    train_X, val_X = train_df[feature_columns], val_df[feature_columns]
    train_y, val_y = train_df[target], val_df[target]

    model = lgb.LGBMRegressor(**params)
    model.fit(train_X, train_y,
              eval_set=[(val_X, val_y)],
              callbacks=[
                  # stop after 10 rounds without improvement, silently
                  lgb.early_stopping(stopping_rounds=10, verbose=False),
                  # period=0 turns per-iteration logging off
                  lgb.log_evaluation(period=0),
              ])
    pred = model.predict(val_X)

    # evaluation
    score = mean_absolute_error(val_y, pred)
    return score


# Minimise the objective (validation MAE) over 100 trials, then report the
# best parameter set and its score.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)
print(study.best_params)
print(study.best_value)

In [None]:
# Train a model with the best parameters found by the study, early-stopping
# on the validation split, then score it on the held-out test split.
feature_columns = dummy_columns + num_columns + use_columns
train_X, val_X = train_df[feature_columns], val_df[feature_columns]
train_y, val_y = train_df[target], val_df[target]

test_X, test_y = test_df[feature_columns], test_df[target]

model = lgb.LGBMRegressor(**study.best_params)
model.fit(train_X, train_y,
          eval_set=[(val_X, val_y)],
          eval_metric="mae",
          callbacks=[
              # replaces the early_stopping_rounds=10 fit argument
              lgb.early_stopping(stopping_rounds=10),
              # replaces verbose=10: log the metric every 10 iterations
              lgb.log_evaluation(period=10),
          ])
pred = model.predict(test_X)

# The score is measured on the test split, not on the validation split
# used for early stopping, so it is labelled accordingly.
score = mean_absolute_error(test_y, pred)
print(f"Test Score: {score}")

## Make submission  

In [None]:
# Rebuild the features on the complete training data together with the
# real test set.
train_df = pd.read_csv(os.path.join(DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DIR, "test.csv"))

train_df, test_df = preprocessing(train_df, test_df)

for column in cat_columns:
    tmp_train = pd.get_dummies(train_df[column], prefix=column, drop_first=False)
    tmp_test = pd.get_dummies(test_df[column], prefix=column, drop_first=False)

    train_df = pd.concat((train_df, tmp_train), axis=1)
    test_df = pd.concat((test_df, tmp_test), axis=1)

    # dummy_columns is also filled where the CV data is encoded; add only
    # names it does not hold yet so it never contains duplicates.
    dummy_columns.extend(
        name for name in tmp_train.columns.values.tolist()
        if name not in dummy_columns
    )

# Categories missing from the test data produce no dummy column there;
# add those columns filled with 0 so both frames share every column.
for column in train_df.columns:
    if column not in test_df.columns:
        test_df[column] = 0

print(f"train data shape: {train_df.shape}")
print(f"test data shape: {test_df.shape}")

In [None]:
# Fit eight models on the full training data; they share the tuned
# hyper-parameters and differ only in their random seed.
model_dict = {}

train_X, train_y = train_df[feature_columns], train_df[target]

# train models
for i in range(8):
    random_state = random_state_list[i]
    model = lgb.LGBMRegressor(random_state=random_state, **study.best_params)
    model_dict[f"model{i}"] = model.fit(train_X, train_y)

In [None]:
# Recursive forecasting: for each model and region, predict the test rows one
# at a time in order, writing each prediction back as `congestion` so the lag
# features of the following row can be computed from it.
submission_dict = dict()

region_list = test_df["region"].unique()
for i in range(8):
    model = model_dict[f"model{i}"]
    # Collect rows in a list and build the frame once; DataFrame.append is
    # no longer available in current pandas and copied the frame per row.
    submission_rows = []
    for region in region_list:
        train_region_df = train_df.query("region == @region")
        test_region_df = test_df.query("region == @region")
        train_size = train_region_df.shape[0]

        region_df = pd.concat((train_region_df, test_region_df)).reset_index(drop=True)

        for j in range(test_region_df.shape[0]):
            target_id = train_size + j

            # calculate lag features and rolling features
            region_df.loc[target_id, "lag_1"] = region_df.loc[target_id-1, "congestion"]
            region_df.loc[target_id, "expand_congestion"] = region_df.loc[:target_id-1, "congestion"].median()
            for k in [2, 3, 6, 72]:
                region_df.loc[target_id, f"lag_{k}"] = region_df.loc[target_id-k, "congestion"]
                # .loc slicing is inclusive: rows target_id-k .. target_id-1 are the k previous rows
                region_df.loc[target_id, f"lag_avg{k}"] = region_df.loc[target_id-k:target_id-1, "congestion"].mean()
                region_df.loc[target_id, f"lag_std{k}"] = region_df.loc[target_id-k:target_id-1, "congestion"].std()

            # predict; a single row of mixed-dtype columns comes back as
            # object dtype, so convert it to a float matrix explicitly
            features = region_df.loc[target_id, feature_columns].to_numpy(dtype=float).reshape(1, -1)
            pred = model.predict(features)[0]

            submission_rows.append({"row_id": region_df.loc[target_id, "row_id"],
                                    "congestion": pred})
            region_df.loc[target_id, target] = pred

    submission = pd.DataFrame(submission_rows, columns=["row_id", "congestion"])
    submission = submission.sort_values("row_id").reset_index(drop=True)
    submission["row_id"] = submission["row_id"].astype(int)
    submission_dict[f"submission{i}"] = submission

In [None]:
# Horizontal bar chart of the feature importances of `model` (whichever
# estimator that name was last bound to), in ascending order.
indices = np.argsort(model.feature_importances_)
feature_importances = model.feature_importances_[indices]
columns = train_X.columns[indices]

plt.figure(figsize=(20, 20))
plt.barh(columns, feature_importances)
plt.show()

In [None]:
# Write each model's predictions to its own CSV and preview the first rows.
for i in range(8):
    member_submission = submission_dict[f"submission{i}"]
    member_submission.to_csv(f"LGBEnsembleSubmission{i}.csv", index=False)
    display(member_submission.head())

In [None]:
# make ensemble submission (mean values)
# Stack every model's predictions as one row each and average per test row;
# the number of models and rows is taken from the data, not hard-coded.
member_preds = np.vstack(
    [sub_df["congestion"].values for sub_df in submission_dict.values()]
).astype(float)
mean_pred = member_preds.mean(axis=0)

# Work on a copy: assigning into submission_dict["submission0"] itself
# would overwrite that model's own predictions.
submission = submission_dict["submission0"].copy()
submission["congestion"] = mean_pred
submission.to_csv("LGBMEnsembleMean.csv", index=False)

In [None]:
# make ensemble submission (median values)
# Stack every model's predictions as one row each and take the median per
# test row; no placeholder row or hard-coded row count is needed.
member_preds = np.vstack(
    [sub_df["congestion"].values for sub_df in submission_dict.values()]
).astype(float)
median_pred = np.median(member_preds, axis=0)

# Work on a copy: assigning into submission_dict["submission0"] itself
# would overwrite that model's own predictions.
submission = submission_dict["submission0"].copy()
submission["congestion"] = median_pred
submission.to_csv("LGBMEnsembleMedian.csv", index=False)

In [None]:
# transform to integer values
# Round to the nearest integer first: a bare astype(int) truncates toward
# zero (e.g. 49.9 -> 49).  assign() returns a new frame, so no other
# reference to the previous `submission` object is modified.
submission = submission.assign(
    congestion=submission["congestion"].round().astype(int)
)
submission.to_csv("LGBMEnsembleMedianInteger.csv", index=False)