In [1]:
# reset every objects
%reset -f

In [2]:
import os

import numpy as np
import pandas as pd
import polars as pl
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
from catboost import CatBoostRegressor

import warnings
# Filter out specific ValueWarnings from statsmodels
warnings.filterwarnings("ignore")

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import Parameter
from torch import Tensor
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [3]:
# pandas setting
pd.set_option('display.max_column',None)
pd.set_option('display.max_row',None)

In [4]:
class DataStorage:
    root = "/kaggle/input/predict-energy-behavior-of-prosumers"
#     root = '../data' # my root
    
    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        self.df_data = pl.read_csv(
            os.path.join(self.root, "train.csv"),
            columns=self.data_cols,
            try_parse_dates=True,
        )
        self.df_client = pl.read_csv(
            os.path.join(self.root, "client.csv"),
            columns=self.client_cols,
            try_parse_dates=True,
        )
        self.df_gas_prices = pl.read_csv(
            os.path.join(self.root, "gas_prices.csv"),
            columns=self.gas_prices_cols,
            try_parse_dates=True,
        )
        self.df_electricity_prices = pl.read_csv(
            os.path.join(self.root, "electricity_prices.csv"),
            columns=self.electricity_prices_cols,
            try_parse_dates=True,
        )
        self.df_forecast_weather = pl.read_csv(
            os.path.join(self.root, "forecast_weather.csv"),
            columns=self.forecast_weather_cols,
            try_parse_dates=True,
        )
        self.df_historical_weather = pl.read_csv(
            os.path.join(self.root, "historical_weather.csv"),
            columns=self.historical_weather_cols,
            try_parse_dates=True,
        )
        self.df_weather_station_to_county_mapping = pl.read_csv(
            os.path.join(self.root, "weather_station_to_county_mapping.csv"),
            columns=self.location_cols,
            try_parse_dates=True,
        )
        self.df_data = self.df_data.filter(
            pl.col("datetime") >= pd.to_datetime("2022-01-01")
        )
        self.df_target = self.df_data.select(self.target_cols)

        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        df_new_client = pl.from_pandas(
            df_new_client[self.client_cols], schema_overrides=self.schema_client
        )
        df_new_gas_prices = pl.from_pandas(
            df_new_gas_prices[self.gas_prices_cols],
            schema_overrides=self.schema_gas_prices,
        )
        df_new_electricity_prices = pl.from_pandas(
            df_new_electricity_prices[self.electricity_prices_cols],
            schema_overrides=self.schema_electricity_prices,
        )
        df_new_forecast_weather = pl.from_pandas(
            df_new_forecast_weather[self.forecast_weather_cols],
            schema_overrides=self.schema_forecast_weather,
        )
        df_new_historical_weather = pl.from_pandas(
            df_new_historical_weather[self.historical_weather_cols],
            schema_overrides=self.schema_historical_weather,
        )
        df_new_target = pl.from_pandas(
            df_new_target[self.target_cols], schema_overrides=self.schema_target
        )

        self.df_client = pl.concat([self.df_client, df_new_client]).unique(
            ["date", "county", "is_business", "product_type"]
        )
        self.df_gas_prices = pl.concat([self.df_gas_prices, df_new_gas_prices]).unique(
            ["forecast_date"]
        )
        self.df_electricity_prices = pl.concat(
            [self.df_electricity_prices, df_new_electricity_prices]
        ).unique(["forecast_date"])
        self.df_forecast_weather = pl.concat(
            [self.df_forecast_weather, df_new_forecast_weather]
        ).unique(["forecast_datetime", "latitude", "longitude", "hours_ahead"])
        self.df_historical_weather = pl.concat(
            [self.df_historical_weather, df_new_historical_weather]
        ).unique(["datetime", "latitude", "longitude"])
        self.df_target = pl.concat([self.df_target, df_new_target]).unique(
            ["datetime", "county", "is_business", "product_type", "is_consumption"]
        )

    def preprocess_test(self, df_test):
        df_test = df_test.rename(columns={"prediction_datetime": "datetime"})
        df_test = pl.from_pandas(
            df_test[self.data_cols[1:]], schema_overrides=self.schema_data
        )
        return df_test

In [5]:
class FeaturesGenerator:
    def __init__(self, data_storage):
        self.data_storage = data_storage

    def _add_general_features(self, df_features):
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            .filter((pl.col("hours_ahead") >= 22) & pl.col("hours_ahead") <= 45)
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        df_target = self.data_storage.df_target

        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        for hours_lag in [
            2 * 24,
            3 * 24,
            4 * 24,
            5 * 24,
            6 * 24,
            7 * 24,
            8 * 24,
            9 * 24,
            10 * 24,
            11 * 24,
            12 * 24,
            13 * 24,
            14 * 24,
        ]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias(f"target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias(f"target_std"),
        )

        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features = df_features.set_index("row_id")
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features

In [6]:
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features[df_train_features['target'].notnull()]

print(type(df_train_features))
print(df_train_features.shape)

df_train_features.head(3)

<class 'pandas.core.frame.DataFrame'>
(1651902, 165)


Unnamed: 0_level_0,county,is_business,product_type,is_consumption,day,weekday,month,year,segment,sin(dayofyear),cos(dayofyear),sin(hour),cos(hour),eic_count,installed_capacity,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation,temperature_forecast_local_0h,dewpoint_forecast_local_0h,cloudcover_high_forecast_local_0h,cloudcover_low_forecast_local_0h,cloudcover_mid_forecast_local_0h,cloudcover_total_forecast_local_0h,10_metre_u_wind_component_forecast_local_0h,10_metre_v_wind_component_forecast_local_0h,direct_solar_radiation_forecast_local_0h,surface_solar_radiation_downwards_forecast_local_0h,snowfall_forecast_local_0h,total_precipitation_forecast_local_0h,temperature_forecast_168h,dewpoint_forecast_168h,cloudcover_high_forecast_168h,cloudcover_low_forecast_168h,cloudcover_mid_forecast_168h,cloudcover_total_forecast_168h,10_metre_u_wind_component_forecast_168h,10_metre_v_wind_component_forecast_168h,direct_solar_radiation_forecast_168h,surface_solar_radiation_downwards_forecast_168h,snowfall_forecast_168h,total_precipitation_forecast_168h,temperature_forecast_local_168h,dewpoint_forecast_local_168h,cloudcover_high_forecast_local_168h,cloudcover_low_forecast_local_168h,cloudcover_mid_forecast_local_168h,cloudcover_total_forecast_local_168h,10_metre_u_wind_component_forecast_local_168h,10_metre_v_wind_component_forecast_local_168h,direct_solar_radiation_forecast_local_168h,surface_solar_radiation_downwards_forecast_local_168h,snowfall_forecast_local_168h,total_precipitation_forecast_local_168h,temperature_historical_48h,dewpoint_historical_48h,rain,snowfall_historical_48h,surface_pressure,cloudcover_total_historical_48h,cloudcover_low_historical_48h,cloudcover_mid_historical_48h,cloudcover_high_historical_48h,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation_historical_48h,diffuse_radiation,temperature_historical_local_48h,dewpoint_historical_local_48h,rain_historical_local_48h,snowfall_historical_local_48h,surface_pressure_historical_local_48h,cloudcover_total_historical_local_48h,cloudcover_low_historical_local_48h,cloudcover_mid_historical_local_48h,cloudcover_high_historical_local_48h,windspeed_10m_historical_local_48h,winddirection_10m_historical_local_48h,shortwave_radiation_historical_local_48h,direct_solar_radiation_historical_local_48h,diffuse_radiation_historical_local_48h,temperature_historical_168h,dewpoint_historical_168h,rain_historical_168h,snowfall_historical_168h,surface_pressure_historical_168h,cloudcover_total_historical_168h,cloudcover_low_historical_168h,cloudcover_mid_historical_168h,cloudcover_high_historical_168h,windspeed_10m_historical_168h,winddirection_10m_historical_168h,shortwave_radiation_historical_168h,direct_solar_radiation_historical_168h,diffuse_radiation_historical_168h,temperature_historical_local_168h,dewpoint_historical_local_168h,rain_historical_local_168h,snowfall_historical_local_168h,surface_pressure_historical_local_168h,cloudcover_total_historical_local_168h,cloudcover_low_historical_local_168h,cloudcover_mid_historical_local_168h,cloudcover_high_historical_local_168h,windspeed_10m_historical_local_168h,winddirection_10m_historical_local_168h,shortwave_radiation_historical_local_168h,direct_solar_radiation_historical_local_168h,diffuse_radiation_historical_local_168h,temperature_historical_24h,dewpoint_historical_24h,rain_historical_24h,snowfall_historical_24h,surface_pressure_historical_24h,cloudcover_total_historical_24h,cloudcover_low_historical_24h,cloudcover_mid_historical_24h,cloudcover_high_historical_24h,windspeed_10m_historical_24h,winddirection_10m_historical_24h,shortwave_radiation_historical_24h,direct_solar_radiation_historical_24h,diffuse_radiation_historical_24h,target_48h,target_72h,target_96h,target_120h,target_144h,target_168h,target_192h,target_216h,target_240h,target_264h,target_288h,target_312h,target_336h,target_all_type_sum_48h,target_all_county_type_sum_48h,target_all_type_sum_72h,target_all_county_type_sum_72h,target_all_type_sum_168h,target_all_county_type_sum_168h,target_all_type_sum_336h,target_all_county_type_sum_336h,target_mean,target_std,target_ratio_168_336,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1
366048,0,0,1,0,1,6,1,2022,0_0_1_0,0.017166,0.999853,0.0,1.0,148.0,1345.689941,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
366049,0,0,1,1,1,6,1,2022,0_0_1_1,0.017166,0.999853,0.0,1.0,148.0,1345.689941,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,442.226
366050,0,0,2,0,1,6,1,2022,0_0_2_0,0.017166,0.999853,0.0,1.0,16.0,153.699997,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0


In [7]:
DEVIDER = 34
df_train_features_ = df_train_features.drop('segment',axis=1)
max_row_idx = df_train_features_.shape[0]
for i in range(DEVIDER):
    start = i * 50000
    if i == DEVIDER-1:
        end = max_row_idx
    else : end = start + 50000
    globals()[f'df_train_features_{i}'] = df_train_features_.iloc[start:end]
    globals()[f'df_train_features_{i}'][['county','is_business','product_type','is_consumption']] = globals()[f'df_train_features_{i}'][['county','is_business','product_type','is_consumption']].astype(int)

In [8]:
df_train_features_24.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 1566318 to 1616317
Columns: 164 entries, county to target
dtypes: float32(154), float64(2), int32(1), int64(4), int8(3)
memory usage: 32.4 MB


# TRAIN

In [9]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [10]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_loss = val_loss
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
        else:
            self.counter += 1
            self.status = f"No improvement in the last {self.counter} epochs"
            if self.counter >= self.patience:
                self.status = f"Early stopping triggered after {self.counter} epochs."
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False

### - Prepare Sequence data

In [11]:
### Sequence Data Preparation
SEQUENCE_SIZE = 50

def to_sequence(seq_size, obs):
    x = []
    y = []
    for i in range(len(obs)-seq_size):
        window = obs[i:(i + seq_size)]
        after_window = obs[i + seq_size]
        x.append(window)
        y.append(after_window)
    return torch.tensor(x, dtype=torch.float32).view(-1, seq_size, 1), torch.tensor(y, dtype=torch.float32).view(-1, 1)

x_train = torch.Tensor()
y_train = torch.Tensor()

for i in tqdm(range(DEVIDER-3)):
#     globals()[f'x_train_{i}',f'y_train_{i}'] = to_sequence(SEQUENCE_SIZE,globals()[f'df_train_features_{i}'].values)
    x_train_,y_train_ = to_sequence(SEQUENCE_SIZE,globals()[f'df_train_features_{i}'].values)
    x_train = torch.cat([x_train,x_train_],dim=0)
    y_train = torch.cat([y_train,y_train_],dim=0)
    del x_train_
    del y_train_
    del globals()[f'df_train_features_{i}']
for_test = pd.concat([df_train_features_31,df_train_features_32,df_train_features_33],axis=1)
x_test, y_test = to_sequence(SEQUENCE_SIZE, for_test)
del df_train_features_31
del df_train_features_32
del df_train_features_33

# x_train,y_train = to_sequence(SEQUENCE_SIZE,df_train_features.values)
# x_test,y_test = to_sequence(SEQUENCE_SIZE, df_train_features.values)

### merge seperated train/test data
# x_train = pd.concat([x_train_0,x_train_1,x_train_2,
#                    x_train_3,x_train_4,x_train_5,
#                    x_train_6,x_train_7,x_train_8,
#                    x_train_9,x_train_10,x_train_11,
#                     x_train_12,x_train_13,x_train_14,
#                     x_train_15,x_train_16,x_train_17,
#                     x_train_18,x_train_19,x_train_20,
#                     x_train_21,x_train_22,x_train_23,
#                     x_train_24,x_train_25,x_train_26,
#                     x_train_27,x_train_28,x_train_29,
#                     x_train_30
# #                      x_train_31,x_train_32,x_train_33
#                     ],axis=0)
# y_train = pd.concat([y_train_0,y_train_1,y_train_2,
#                    y_train_3,y_train_4,y_train_5,
#                    y_train_6,y_train_7,y_train_8,
#                    y_train_9,y_train_10,y_train_11,
#                     y_train_12,y_train_13,y_train_14,
#                     y_train_15,y_train_16,y_train_17,
#                     y_train_18,y_train_19,y_train_20,
#                     y_train_21,y_train_22,y_train_23,
#                     y_train_24,y_train_25,y_train_26,
#                     y_train_27,y_train_28,y_train_29,
#                     y_train_30
# #                      ,y_train_31,y_train_32,y_train_33
#                     ],axis=0)                   

# Setup data loaders for batch
train_dataset = TensorDataset(x_train,y_train)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

test_dataset = TensorDataset(x_test,y_test)
test_loader = DataLoader(test_dataset,batch_size=128, shuffle=False)

 23%|██▎       | 7/31 [07:12<24:43, 61.81s/it]


KeyboardInterrupt: 

In [None]:
멈춰

In [None]:
# # delete variables
# for i in range(DEVIDER):
#     del globals()[f'df_train_features_{i}']
#     del globals()[f'x_train_{i}']
#     del globals()[f'x_test_{i}']
#     del globals()[f'y_train_{i}']
#     del globals()[f'y_test_{i}']
    

### - Positional Encoding

In [None]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

### - Transformer Model

In [None]:
# Model definition using Transformer
class TransformerModel(nn.Module):
    def __init__(self, input_dim=1, d_model=64, nhead=4, num_layers=2, dropout=0.2):
        super(TransformerModel, self).__init__()

        self.encoder = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x[:, -1, :])
        return x

model = TransformerModel().to(device)

### - Training Procedure

In [None]:
# Train the model
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3, verbose=True)

epochs = 1000
early_stop_count = 0
min_val_loss = float('inf')

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        x_batch, y_batch = batch
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in test_loader:
            x_batch, y_batch = batch
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            val_losses.append(loss.item())

    val_loss = np.mean(val_losses)
    scheduler.step(val_loss)

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        early_stop_count = 0
    else:
        early_stop_count += 1

    if early_stop_count >= 5:
        print("Early stopping!")
        break
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}")
