In [5]:
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from torch.utils.data import TensorDataset

In [6]:
# Merged demand/weather frames, one pickle per resolution (files 6, 7 and 8).
# The list order matters: later cells index df_merged positionally.
# NOTE(review): read_pickle executes arbitrary code from the file, so these
# pickles must come from a trusted source.
merged_paths = (
    '../../Datasets/merged_df_6.pkl',
    '../../Datasets/merged_df_7.pkl',
    '../../Datasets/merged_df_8.pkl',
)
df_merged = [pd.read_pickle(path) for path in merged_paths]

In [7]:
def createdataset(df, timebucket):
    """Aggregate an hourly demand frame into buckets of ``timebucket`` hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``hour_of_day``, ``date``, ``hex_id``, ``demand`` and the
        weather columns ``temperature``, ``dew_point``, ``humidity``,
        ``wind_speed``, ``wind_gust``, ``pressure`` and ``precipitation_rate``.
        The frame is copied first and never modified.
        Presumably ``hour_of_day`` holds integer hours 0..23 -- TODO confirm.
    timebucket : int
        Bucket width in hours; must be >= 1. The bucket index of a row is
        ``hour_of_day // timebucket``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``hour_of_day`` and ``date`` and with a
        ``time_bucket`` column appended.

        * ``timebucket == 1``: rows are passed through unaggregated and
          ``time_bucket`` is a copy of ``hour_of_day``.
        * otherwise: one row per ``(hex_id, date, time_bucket)``. ``demand`` is
          the sum over that group; every weather column is the mean over all
          rows sharing ``(time_bucket, date)`` (i.e. across all hex ids); any
          other column keeps the value of the first row of the group.

    Raises
    ------
    ValueError
        If ``timebucket`` is smaller than 1.
    """
    if timebucket < 1:
        raise ValueError(f"timebucket must be >= 1, got {timebucket!r}")

    df_copy = df.copy()

    if timebucket == 1:
        # Hourly data already has the target granularity: just rename the hour.
        df_copy['time_bucket'] = df_copy['hour_of_day']
        return df_copy.drop(columns=['hour_of_day', 'date'])

    # Floor division assigns hours [k * timebucket, (k + 1) * timebucket) to
    # bucket k. Unlike fixed pd.cut bins this also works when timebucket does
    # not divide the day evenly (the last bucket is then simply shorter).
    df_copy['time_bucket'] = (df_copy['hour_of_day'] // timebucket).astype(int)
    df_copy = df_copy.drop(columns=['hour_of_day'])

    weather_cols = [
        'temperature',
        'dew_point',
        'humidity',
        'wind_speed',
        'wind_gust',
        'pressure',
        'precipitation_rate',
    ]
    # One grouped pass for all weather columns instead of one pass per column.
    weather_means = df_copy.groupby(['time_bucket', 'date'])[weather_cols].transform('mean')
    for col in weather_cols:
        df_copy[col] = weather_means[col]

    df_copy['demand'] = df_copy.groupby(['date', 'time_bucket', 'hex_id'])['demand'].transform('sum')

    # Every row of a group now carries the aggregated values, so keeping the
    # first row per group yields the aggregated frame. Chained, non-inplace
    # drops avoid mutating the slice returned by drop_duplicates.
    return (
        df_copy
        .drop_duplicates(subset=['hex_id', 'date', 'time_bucket'])
        .drop(columns=['date'])
    )

In [8]:
# Build, scale and persist train/valid/test tensors for every combination of
# spatial resolution (one entry of df_merged) and time-bucket width.

# Encoder for the hex_id column. It is re-fitted on every dataset in the loop.
# NOTE(review): unlike the scalers it is not written to disk here, so the
# hex_id -> integer mapping cannot be recovered from the saved artefacts.
label_encoder = LabelEncoder()
timebuckets = [1, 2, 4, 6, 24]
num_resolutions = len(df_merged)
# df_merged[0..2] are read from merged_df_6/7/8.pkl, so list index i
# corresponds to resolution i + base_resolution in the output file names.
base_resolution = 6

# Per-(resolution, bucket) placeholders for data loaders; this cell only
# allocates them, it never fills them.
train_dl = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
val_dl = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
test_dl = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]

# joblib.dump and torch.save do not create missing parent directories, so make
# sure the output folders exist before the (long) loop starts.
os.makedirs('scalers', exist_ok=True)
os.makedirs('datasets', exist_ok=True)

for i in range(num_resolutions):
    for j in range(len(timebuckets)):
        run_tag = f'res_{i + base_resolution}_bucket_{timebuckets[j]}'
        feature_scaler = MinMaxScaler()
        target_scaler = MinMaxScaler()

        bucketed = createdataset(df_merged[i], timebuckets[j])
        # assign() returns a new frame and keeps the column order, so the
        # feature matrix layout is unchanged and no slice is mutated in place.
        bucketed = bucketed.assign(hex_id=label_encoder.fit_transform(bucketed['hex_id']))
        X = bucketed.drop('demand', axis=1).values
        y = bucketed['demand'].values

        # 70 % train, then the remaining 30 % is split 2:1 -> 20 % valid, 10 % test.
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
        X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=1/3, random_state=42)

        # Scalers are fitted on the training split only and then applied to
        # the validation and test splits.
        X_train = feature_scaler.fit_transform(X_train)
        X_valid = feature_scaler.transform(X_valid)
        X_test = feature_scaler.transform(X_test)
        joblib.dump(feature_scaler, f'scalers/feature_scaler_{run_tag}.pkl')

        y_train = target_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
        y_valid = target_scaler.transform(y_valid.reshape(-1, 1)).flatten()
        y_test = target_scaler.transform(y_test.reshape(-1, 1)).flatten()
        joblib.dump(target_scaler, f'scalers/target_scaler_{run_tag}.pkl')

        # Each split is written to disk straight away instead of being kept in
        # memory for all 15 combinations until the end of the cell.
        for split_name, features, target in (
            ('train', X_train, y_train),
            ('valid', X_valid, y_valid),
            ('test', X_test, y_test),
        ):
            split_dataset = TensorDataset(
                torch.tensor(features, dtype=torch.float32),
                torch.tensor(target, dtype=torch.float32).view(-1, 1),
            )
            torch.save(split_dataset, f'datasets/{split_name}_dataset_{run_tag}.pt')

# The raw merged frames are no longer needed once everything is on disk.
del df_merged