# Kaggle Notebooks to Refer to for Feature Engineering Tips  
In general, feature engineering and EDA contribute more to improving the (generalization) score than hyperparameter tuning or changing models.   

So I collected Kaggle notebooks (code) to refer to for feature engineering techniques, combined these techniques, and plotted the feature importances using LightGBM.   

## Import Library  

In [None]:
import warnings
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import optuna  # optimize model by Bayesian optimization

import matplotlib.pyplot as plt

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
optuna.logging.disable_default_handler()  # don't display optuna log

%matplotlib inline

## Load Data  

In [None]:
# Directory that holds the competition CSV files.
DIR = "../input/tabular-playground-series-mar-2022"

# Read the training and test splits from that directory.
train, test = (
    pd.read_csv(os.path.join(DIR, file_name))
    for file_name in ("train.csv", "test.csv")
)

## Feature Engineering  
Reference Notebook 1: TPS_2022_03_LGBM  
@kotrying  
URL: https://www.kaggle.com/code/kotrying/tps-2022-03-lgbm  

Reference Notebook 2: TPS Mar2022 Single LGBM - LB 4.91 - Run time 91s  
@ifashion  
URL: https://www.kaggle.com/code/ifashion/tps-mar2022-single-lgbm-lb-4-91-run-time-91s  

I combined the feature engineering methods from these notebooks and plotted the feature importances.  

In [None]:
# Flag the training rows that fall on an official holiday.
# (Reference notebook 1 deletes the holiday rows instead of flagging them.)
# The test data gets a constant 0 for this flag.
test["official_holiday"] = 0

train["time"] = pd.to_datetime(train["time"])
train["official_holiday"] = (
    train["time"].dt.date
    .astype(str)
    .str.contains('1991-05-27|1991-07-04|1991-09-02')
    .astype('int')
)

In [None]:
def make_new_columns(df):
    """Add combined categorical keys built from ``x``, ``y`` and ``direction``.

    ``region_xy`` is the string concatenation of ``x`` and ``y``;
    ``xydir`` additionally appends ``direction``.  The frame is modified
    in place and also returned.
    """
    x_text = df['x'].astype(str)
    y_text = df['y'].astype(str)

    # location key: "x" + "y"
    region = x_text + y_text
    df['region_xy'] = region

    # road key: "x" + "y" + "direction"
    df['xydir'] = region + df['direction']

    return df

In [None]:
def make_lag_features(df):
    """Add lagged ``congestion`` features; modifies ``df`` in place and returns it.

    Every lag is a positional shift inside a group, so the rows of each
    group are expected to already be in chronological order.

    * ``yesterday`` / ``lastweek``: previous and 7th-previous value within
      the same (x, y, direction, hour, minute) group.
    * ``lag_1``, ``lag_2``, ``lag_3``, ``lag_6``: previous values within
      the same ``xydir`` group (20 min, 40 min, 1 h and 2 h ago when rows
      are 20 minutes apart).
    * ``lag_avg{n}`` / ``lag_std{n}``: rolling mean / std over ``n`` rows
      of the same ``xydir`` group, shifted by one row so the current value
      is not included.
    """
    slot_keys = ["x", "y", "direction", "hour", "minute"]

    def _shifted_rolling(stat_name, window):
        # Build the per-group transformer "rolling(window).<stat>().shift(1)".
        def _apply(series):
            return getattr(series.rolling(window), stat_name)().shift(1)
        return _apply

    # same time slot, one and seven occurrences earlier
    df["yesterday"] = df.groupby(slot_keys)["congestion"].shift(1)
    df["lastweek"] = df.groupby(slot_keys)["congestion"].shift(7)

    # immediately preceding observation of the same road
    df["lag_1"] = df.groupby("xydir")["congestion"].shift(1)

    for window in (2, 3, 6):
        df[f"lag_{window}"] = df.groupby("xydir")["congestion"].shift(window)
        df[f"lag_avg{window}"] = (
            df.groupby("xydir")["congestion"].transform(_shifted_rolling("mean", window))
        )
        df[f"lag_std{window}"] = (
            df.groupby("xydir")["congestion"].transform(_shifted_rolling("std", window))
        )

    return df

In [None]:
def make_agg_features(df):
    """Add per-time-slot aggregates of ``congestion``; returns the same ``df``.

    Both features are computed within each
    (x, y, direction, hour, minute) group:

    * ``median_cong``: median of the group, broadcast to every row.
    * ``rolling_7_std``: standard deviation over 7 consecutive rows of the
      group, shifted by one row so that the current row's own value is
      not part of its window.
    """
    slot_keys = ["x", "y", "direction", "hour", "minute"]

    def _previous_week_std(series):
        # std over a 7-row window, then shift(1) to exclude the current row
        return series.rolling(7).std().shift(1)

    # median per time slot
    df["median_cong"] = df.groupby(slot_keys)["congestion"].transform("median")

    # rolling std over one week of the same time slot
    df["rolling_7_std"] = df.groupby(slot_keys)["congestion"].transform(_previous_week_std)

    return df

In [None]:
def transform_time(df):
    """Add calendar features and replace ``time`` by a time-of-day scale.

    The frame is modified in place and also returned.

    Columns added: ``month``, ``day`` (day of the YEAR, not of the month),
    ``hour``, ``minute``, ``weekday`` (Monday=0), ``week`` (ISO week number),
    and the 0/1 flags ``am`` (7 <= hour <= 11), ``Saturday`` and ``Sunday``.

    Finally ``time`` itself is overwritten with
    ``(hour - 12) * 3 + minute / 20``, i.e. the number of 20-minute steps
    relative to 12:00 (negative before noon, 0 at 12:00).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column that ``pd.to_datetime`` can parse.
        The ``week`` column is cast to int64, so missing timestamps (NaT)
        are not supported.
    """
    # transform to datetime type
    df["time"] = pd.to_datetime(df["time"])
    stamp = df["time"]

    # extract time features (month, day of year, hour, minute, weekday)
    df["month"] = stamp.dt.month
    df["day"] = stamp.dt.dayofyear
    df["hour"] = stamp.dt.hour
    df["minute"] = stamp.dt.minute
    df["weekday"] = stamp.dt.weekday

    # the week of the year
    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` gives the same ISO week number (as UInt32),
    # which is cast back to a plain integer column.
    df["week"] = stamp.dt.isocalendar().week.astype("int64")

    # make flag of am, Saturday, and Sunday
    df["am"] = ((df["hour"] < 12) & (df["hour"] > 6)).astype("int8")
    df["Saturday"] = (df["weekday"] == 5).astype("int8")
    df["Sunday"] = (df["weekday"] == 6).astype("int8")

    # replace the timestamp by a numeric time-of-day value centred on noon
    df["time"] = (df["hour"] - 12) * 3 + df["minute"] / 20

    return df

In [None]:
def tri_transform_time(df):
    """Add calendar features plus sine/cosine encodings of the time parts.

    The frame is modified in place and also returned.  Unlike
    ``transform_time`` the ``time`` column stays a datetime column and
    ``day`` is the day of the MONTH.

    Columns added: ``month``, ``day``, ``hour``, ``minute``, ``weekday``
    (Monday=0), ``week`` (ISO week number), the 0/1 flags ``am``
    (7 <= hour <= 11), ``Saturday`` and ``Sunday``, and the cyclic
    encodings ``day_sin``/``day_cos`` (period = number of days in the
    row's month), ``month_sin``/``month_cos`` (period 12),
    ``hour_sin``/``hour_cos`` (period 24) and
    ``minute_sin``/``minute_cos`` (period 60).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column that ``pd.to_datetime`` can parse.
        The ``week`` column is cast to int64, so missing timestamps (NaT)
        are not supported.
    """
    # extract time features (month, day of month, hour, minute, weekday)
    df['time'] = pd.to_datetime(df['time'])
    stamp = df['time']
    df['month'] = stamp.dt.month
    df['day'] = stamp.dt.day
    df['hour'] = stamp.dt.hour
    df['minute'] = stamp.dt.minute
    df["weekday"] = stamp.dt.weekday

    # the week of the year
    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` gives the same ISO week number (as UInt32),
    # which is cast back to a plain integer column.
    df["week"] = stamp.dt.isocalendar().week.astype("int64")

    # make flag of am, Saturday, and Sunday
    df["am"] = ((df["hour"] < 12) & (df["hour"] > 6)).astype("int8")
    df["Saturday"] = (df["weekday"] == 5).astype("int8")
    df["Sunday"] = (df["weekday"] == 6).astype("int8")

    # transform using trigonometric function
    # The period of the day-of-month cycle is the real length of the month.
    # The previous hard-coded rule (30 for months 4/6/9, otherwise 31) gave
    # the same result for April-September but was wrong for November and
    # February.
    day_angle = (2*np.pi*df['day']) / stamp.dt.days_in_month
    df['day_sin'] = np.sin(day_angle)
    df['day_cos'] = np.cos(day_angle)

    df['month_sin'] = np.sin(2*np.pi*df['month'] / 12)
    df['month_cos'] = np.cos(2*np.pi*df['month'] / 12)
    df['hour_sin'] = np.sin(2*np.pi*df['hour'] / 24)
    df['hour_cos'] = np.cos(2*np.pi*df['hour'] / 24)
    df['minute_sin'] = np.sin(2*np.pi*df['minute'] / 60)
    df['minute_cos'] = np.cos(2*np.pi*df['minute'] / 60)

    return df

In [None]:
def preprocessing(train, test):
    """Run the whole feature-engineering pipeline on train and test together.

    The two frames are stacked (train first), pushed through
    ``make_new_columns``, ``transform_time``, ``make_lag_features`` and
    ``make_agg_features``, the categorical columns are label-encoded, and
    the result is cut back into its train and test parts by position.

    Returns
    -------
    tuple
        ``(train, test)`` slices of the transformed frame.
    """
    # remember how many rows belong to train so the stack can be split again
    n_train = len(train)

    # concatenate train data and test data
    df = pd.concat((train, test))

    # combined categorical keys -> time features -> lag features -> aggregates
    pipeline = (make_new_columns, transform_time, make_lag_features, make_agg_features)
    for step in pipeline:
        df = step(df)

    # label-encode the categorical features
    for column in ("direction", "xydir", "region_xy"):
        df[column] = LabelEncoder().fit_transform(df[column])

    # split back into train data and test data
    return df.iloc[:n_train], df.iloc[n_train:]

In [None]:
%%time
# Work on copies so the raw train/test frames stay untouched.
trans_train, trans_test = preprocessing(train.copy(), test.copy())

# Show the engineered train frame first, then the test frame.
for frame in (trans_train, trans_test):
    display(frame)

### Plot transformed time  
The last transformation in `transform_time`, `(hour - 12) * 3 + minute / 20`, maps AM to negative values (-36 to -1) and PM to non-negative values (0 to 35).  

I think this transformation helps to preserve the temporal characteristics.  

In [None]:
# Keep only the rows of the road (xydir) that the first training row belongs to.
xydir = trans_train.loc[0, "xydir"]
ex_data = trans_train[trans_train["xydir"] == xydir]

# Narrow that down to the month and day of the same first row.
month = ex_data.loc[0, "month"]
day = ex_data.loc[0, "day"]
ex_data = ex_data[(ex_data["month"] == month) & (ex_data["day"] == day)]

print(ex_data.shape)
ex_data

In [None]:
# plot time feature
plt.figure(figsize=(12, 16))
# Take the x axis from the number of extracted rows instead of assuming
# exactly 72 rows; a hard-coded range makes plt.plot fail whenever the
# selected day has a different row count.
plt.plot(range(len(ex_data)), ex_data["time"])
plt.show()

In [None]:
# Prepare the data for a plot over 3 days.
# Keep only the rows of the road (xydir) that the first training row belongs to.
xydir = trans_train.loc[0, "xydir"]
ex_data = trans_train[trans_train["xydir"] == xydir]

# Restrict to the month of that first row and to "day" values below 94.
# ("day" is the day of the year; the cut-off is meant to keep three days,
# presumably 91-93 - verify against the data.)
month = ex_data.loc[0, "month"]
ex_data = ex_data[(ex_data["month"] == month) & (ex_data["day"] < 94)]

print(ex_data.shape)
ex_data

In [None]:
# plot the transformed time feature over the extracted days
plt.figure(figsize=(16, 12))
# Take the x axis from the number of extracted rows instead of assuming
# exactly 216 rows; a hard-coded range makes plt.plot fail whenever the
# selection has a different row count.
plt.plot(range(len(ex_data)), ex_data["time"])
plt.show()

## Train Model  
I use lgb.LGBMRegressor and plot feature importances.  

In [None]:
# Divide the transformed training data by row position into a training set,
# a validation set and a hold-out test set:
# the last 4680 rows are held back, the first 2340 of them for validation
# and the final 2340 for the hold-out test.
val_start = trans_train.shape[0] - 4680
test_start = trans_train.shape[0] - 2340

train_data = trans_train.iloc[:val_start, :]
val_data = trans_train.iloc[val_start:test_start, :]
test_data = trans_train.iloc[test_start:, :]

display(train_data)
display(val_data)

In [None]:
# Features fed to the model (the order is kept as-is; it is reused when the
# feature importances are plotted).
columns = [
    # original and combined categorical keys
    "x", "y", "direction", "official_holiday", "region_xy", "xydir",
    # calendar features
    "month", "day", "hour", "minute", "weekday", "week",
    "am", "Saturday", "Sunday",
    # lag features
    "yesterday", "lastweek", "lag_1",
    "lag_2", "lag_avg2", "lag_std2",
    "lag_3", "lag_avg3", "lag_std3",
    "lag_6", "lag_avg6", "lag_std6",
    # aggregation features
    "median_cong", "rolling_7_std",
]

target = "congestion"

# feature matrix / target vector for each split
train_X, train_y = train_data[columns], train_data[target]
val_X, val_y = val_data[columns], val_data[target]
test_X, test_y = test_data[columns], test_data[target]

In [None]:
# optimize model by using optuna
def objective(trial):
    """Optuna objective: validation MAE of one LGBMRegressor configuration.

    Samples a set of hyperparameters from ``trial``, fits the model on the
    module-level ``train_X``/``train_y`` with early stopping on
    ``val_X``/``val_y`` and returns the mean absolute error on the
    validation split (lower is better).
    """
    # setting parameters
    # ``suggest_float(..., log=True)`` replaces ``suggest_loguniform``,
    # which is deprecated since Optuna 3.0.
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 10, 500),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1e-1, log=True),
        "n_jobs": 12
    }

    # train model
    # Early stopping and logging are passed as callbacks: the
    # ``early_stopping_rounds`` / ``verbose`` arguments of ``fit`` were
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model = lgb.LGBMRegressor(**params)
    model.fit(train_X, train_y,
              eval_set=[(val_X, val_y)],
              eval_metric="mae",
              callbacks=[lgb.early_stopping(50, verbose=False),
                         lgb.log_evaluation(0)])  # period 0 = no evaluation log
    pred = model.predict(val_X)

    # evaluation
    score = mean_absolute_error(val_y, pred)
    return score


# Search the hyperparameters: 100 trials, minimizing the objective value.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# display best parameters and score
print(study.best_params)
print(study.best_value)

In [None]:
# train model with the best hyperparameters found by optuna
# Early stopping and logging are passed as callbacks: the
# ``early_stopping_rounds`` / ``verbose`` arguments of ``fit`` were
# deprecated in LightGBM 3.3 and removed in 4.0.
model = lgb.LGBMRegressor(**study.best_params)
model.fit(train_X, train_y, eval_set=[(val_X, val_y)], eval_metric="mae",
          callbacks=[lgb.early_stopping(50),      # stop after 50 rounds without improvement
                     lgb.log_evaluation(100)])    # log every 100 iterations

# predict on the hold-out split (test_X), which was not used for tuning
pred = model.predict(test_X)

# round predict values
round_pred = np.round(pred)

print(f"validation MAE: {mean_absolute_error(test_y, pred)}")
print(f"validation MAE(round): {mean_absolute_error(test_y, round_pred)}")

### Plot feature importance  

In [None]:
# Order that sorts the importances ascending (least important first).
indices = np.argsort(model.feature_importances_)

# Importances and their feature names, both rearranged into that order.
feature_importances = model.feature_importances_[indices]
sort_columns = np.array(columns)[indices]

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(12, 12))
plt.barh(sort_columns, feature_importances)
plt.show()

## Make Prediction  

In [None]:
# Refit on the whole transformed training data with the tuned hyperparameters.
train_X = trans_train[columns]
train_y = trans_train[target]

model = lgb.LGBMRegressor(**study.best_params)
# Early stopping and logging are passed as callbacks: the
# ``early_stopping_rounds`` / ``verbose`` arguments of ``fit`` were
# deprecated in LightGBM 3.3 and removed in 4.0.
# NOTE(review): the evaluation set is the training data itself, so early
# stopping here monitors the training error, not a held-out error.
model.fit(train_X, train_y, eval_set=[(train_X, train_y)], eval_metric="mae",
          callbacks=[lgb.early_stopping(10),      # stop after 10 rounds without improvement
                     lgb.log_evaluation(100)])    # log every 100 iterations

In [None]:
# Recursive forecast: the test rows of every road (xydir) are predicted one
# at a time in row order, and each prediction is written back so that it
# feeds the lag features of the following rows.
#
# Predictions are collected in a list and turned into a DataFrame once at
# the end; ``DataFrame.append`` was removed in pandas 2.0 and copied the
# whole frame on every call.
predicted_rows = []

region_list = trans_test["xydir"].unique()
for region in region_list:
    # extract the train/test rows of this road
    train_region_df = trans_train.query("xydir == @region")
    test_region_df = trans_test.query("xydir == @region")
    train_size = train_region_df.shape[0]

    # train rows first, then test rows, on one positional 0..n-1 index
    region_df = pd.concat((train_region_df, test_region_df)).reset_index(drop=True)

    # predict in row order and recompute the lag features from the
    # (partly predicted) congestion values
    for step in range(test_region_df.shape[0]):
        target_id = train_size + step

        region_df.loc[target_id, "lag_1"] = region_df.loc[target_id-1, "congestion"]
        # Only the windows that are model features are refreshed (the old
        # loop also filled lag_72* columns that are not part of ``columns``).
        for window in [2, 3, 6]:
            # .loc slices are inclusive: exactly the ``window`` previous rows
            history = region_df.loc[target_id-window:target_id-1, "congestion"]
            region_df.loc[target_id, f"lag_{window}"] = region_df.loc[target_id-window, "congestion"]
            region_df.loc[target_id, f"lag_avg{window}"] = history.mean()
            region_df.loc[target_id, f"lag_std{window}"] = history.std()

        # predict this single row (explicit float row vector for the model)
        features = region_df.loc[target_id, columns].to_numpy(dtype="float64").reshape(1, -1)
        pred = model.predict(features)[0]

        # keep the prediction for the submission file
        predicted_rows.append({"row_id": region_df.loc[target_id, "row_id"],
                               "congestion": pred})
        # and store it in the frame so later lag features can use it
        region_df.loc[target_id, target] = pred

submission = pd.DataFrame(predicted_rows, columns=["row_id", "congestion"])
submission = submission.sort_values("row_id").reset_index(drop=True)
submission["row_id"] = submission["row_id"].astype(int)
display(submission)

## Postprocessing  
This postprocessing comes from Reference Notebook 1. Using it reduced the MAE by about 0.1 compared with not using it. 

This process (clipping) seems to be important in this competition.  

In [None]:
# Rows of the transformed training data whose month is 9 or later.
sep = trans_train.loc[trans_train['month'] >= 9]

# Congestion per (time, x, y, direction) group; the 15% and 70% quantiles of
# each group become the lower/upper clipping bounds.  The first 2340 groups
# are skipped.
slot_congestion = sep.groupby(['time', 'x', 'y', 'direction']).congestion
lower = slot_congestion.quantile(0.15).values[2340:]
upper = slot_congestion.quantile(0.7).values[2340:]

In [None]:
# Copy of the submission whose predictions are clipped into [lower, upper].
clip_submission = submission.assign(
    congestion=submission["congestion"].clip(lower, upper)
)
display(clip_submission)

In [None]:
# Rows whose prediction was changed by the clipping: before and after.
changed = submission["congestion"] != clip_submission["congestion"]
display(submission[changed])
display(clip_submission[changed])

In [None]:
# Copy of the (unclipped) submission with the predictions rounded.
round_submission = submission.assign(
    congestion=np.round(submission["congestion"])
)
display(round_submission)