# Machine Learning - Data Preparation

This script prepares the previously created datasets for machine learning training, validation, and testing.

In [1]:
# Run this command to install all required libraries for this script
!pip install -q pandas matplotlib torch scikit-learn joblib

In [2]:
# Import all required libraries
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from torch.utils.data import TensorDataset

In [3]:
# Load the merged datasets, one per geospatial resolution (three files in total).
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df_merged = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../../Datasets/merged_df_6.pkl',
        '../../Datasets/merged_df_7.pkl',
        '../../Datasets/merged_df_8.pkl',
    )
]

In [4]:
def createdataset(df, timebucket):
    """Return a copy of `df` aggregated into time buckets of `timebucket` hours.

    The input must contain the columns 'hour_of_day', 'date', 'hex_id',
    'demand' and, for bucket sizes other than 1, the weather columns
    'temperature', 'dew_point', 'humidity', 'wind_speed', 'wind_gust',
    'pressure' and 'precipitation_rate'. The input DataFrame is not modified.

    Args:
        df: Source DataFrame with one row per observation.
        timebucket: Bucket width in hours. With 1, 'hour_of_day' is simply
            renamed to 'time_bucket' and no aggregation takes place.

    Returns:
        A new DataFrame without 'hour_of_day' and 'date' and with an added
        integer 'time_bucket' column. For bucket sizes other than 1 there is
        one row per ('hex_id', 'date', 'time_bucket') combination, with
        'demand' summed per combination and the weather columns averaged per
        ('time_bucket', 'date').

    Raises:
        ValueError: If `timebucket` is smaller than 1.
    """
    if timebucket < 1:
        raise ValueError(f'timebucket must be at least 1, got {timebucket}')
    # Copy the original DataFrame to avoid modifying it directly
    df_copy = df.copy()
    if timebucket == 1:
        # Hourly resolution: the hour itself is the bucket, nothing to aggregate
        df_copy['time_bucket'] = df_copy['hour_of_day']
        df_copy.drop(columns=['hour_of_day', 'date'], inplace=True)
        return df_copy
    # Bin edges start at -1 because pd.cut uses right-closed intervals, so the
    # first bucket is (-1, timebucket - 1]. The stop value 24 + timebucket
    # guarantees that the last edge is >= 23 even when timebucket does not
    # divide 24 evenly; otherwise the trailing hours would fall outside every
    # bin, become NaN and break the integer conversion below.
    # NOTE(review): assumes hour_of_day holds values in 0..23 - confirm.
    bin_edges = range(-1, 24 + timebucket, timebucket)
    df_copy['time_bucket'] = pd.cut(df_copy['hour_of_day'], bins=bin_edges, labels=False)
    # Drop the original 'hour_of_day' column and convert 'time_bucket' to integer
    df_copy.drop(columns=['hour_of_day'], inplace=True)
    df_copy['time_bucket'] = df_copy['time_bucket'].astype(int)
    # Replace every weather value by its mean within the same bucket and date
    weather_columns = ['temperature', 'dew_point', 'humidity',
                       'wind_speed', 'wind_gust', 'pressure',
                       'precipitation_rate']
    df_copy[weather_columns] = (
        df_copy.groupby(['time_bucket', 'date'])[weather_columns].transform('mean')
    )
    # Sum up the 'demand' column within each group of 'date', 'time_bucket', and 'hex_id'
    df_copy['demand'] = df_copy.groupby(['date', 'time_bucket', 'hex_id'])['demand'].transform('sum')
    # All rows of a group now carry the same aggregates; keep the first of each
    df_copy.drop_duplicates(subset=['hex_id', 'date', 'time_bucket'], inplace=True)
    # Drop the 'date' column as it is no longer needed
    df_copy.drop(columns=['date'], inplace=True)
    return df_copy

In [5]:
# Initialize LabelEncoder (it is re-fitted for every dataset inside the loop)
label_encoder = LabelEncoder()
# Define time bucket intervals (bucket sizes passed to createdataset)
timebuckets = [1, 2, 4, 6, 24]
# Make sure the output directories exist before writing to them; saving into
# a missing directory would otherwise fail with FileNotFoundError
os.makedirs('scalers', exist_ok=True)
os.makedirs('datasets', exist_ok=True)
# Initialize empty grids for data storage, indexed as [resolution][time bucket]
num_resolutions = len(df_merged)
df_data = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
df_train_X = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
df_valid_X = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
df_test_X = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
df_train_y = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
df_valid_y = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
df_test_y = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
# Placeholders that are not filled in this cell
train_dl = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
val_dl = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
test_dl = [[None for _ in range(len(timebuckets))] for _ in range(num_resolutions)]
# Iterate over each resolution and time bucket
for i in range(num_resolutions):
    # Output file names use the resolution number; list index 0 maps to 6
    resolution = i + 6
    for j, timebucket in enumerate(timebuckets):
        # Fresh scalers per dataset so each one is fitted on its own training split
        feature_scaler = MinMaxScaler()
        target_scaler = MinMaxScaler()
        # Create dataset for the current resolution and time bucket
        df_data[i][j] = createdataset(df_merged[i], timebucket)
        # Encode 'hex_id' as integer labels
        df_data[i][j]['hex_id'] = label_encoder.fit_transform(df_data[i][j]['hex_id'])
        # Separate features (X) and target (y)
        X = df_data[i][j].drop('demand', axis=1).values
        y = df_data[i][j]['demand'].values
        # 70 % of the rows go to training; the remaining 30 % are split 2:1,
        # which yields 20 % validation and 10 % test data
        df_train_X[i][j], X_temp, df_train_y[i][j], y_temp = train_test_split(
            X, y, test_size=0.3, random_state=42)
        df_valid_X[i][j], df_test_X[i][j], df_valid_y[i][j], df_test_y[i][j] = train_test_split(
            X_temp, y_temp, test_size=1/3, random_state=42)
        # Fit the feature scaler on the training split only, then apply it to all splits
        df_train_X[i][j] = feature_scaler.fit_transform(df_train_X[i][j])
        df_valid_X[i][j] = feature_scaler.transform(df_valid_X[i][j])
        df_test_X[i][j] = feature_scaler.transform(df_test_X[i][j])
        # Save the feature scaler
        joblib.dump(feature_scaler, f'scalers/feature_scaler_res_{resolution}_bucket_{timebucket}.pkl')
        # Scale the target the same way; the scaler expects a 2-D array, hence the reshape
        df_train_y[i][j] = target_scaler.fit_transform(df_train_y[i][j].reshape(-1, 1)).flatten()
        df_valid_y[i][j] = target_scaler.transform(df_valid_y[i][j].reshape(-1, 1)).flatten()
        df_test_y[i][j] = target_scaler.transform(df_test_y[i][j].reshape(-1, 1)).flatten()
        # Save the target scaler
        joblib.dump(target_scaler, f'scalers/target_scaler_res_{resolution}_bucket_{timebucket}.pkl')
        # Convert data to PyTorch tensors; targets become column vectors of shape (n, 1)
        df_train_X[i][j] = torch.tensor(df_train_X[i][j], dtype=torch.float32)
        df_train_y[i][j] = torch.tensor(df_train_y[i][j], dtype=torch.float32).view(-1, 1)
        df_valid_X[i][j] = torch.tensor(df_valid_X[i][j], dtype=torch.float32)
        df_valid_y[i][j] = torch.tensor(df_valid_y[i][j], dtype=torch.float32).view(-1, 1)
        df_test_X[i][j] = torch.tensor(df_test_X[i][j], dtype=torch.float32)
        df_test_y[i][j] = torch.tensor(df_test_y[i][j], dtype=torch.float32).view(-1, 1)
        # Save datasets as PyTorch TensorDataset
        torch.save(TensorDataset(df_train_X[i][j], df_train_y[i][j]), f'datasets/train_dataset_res_{resolution}_bucket_{timebucket}.pt')
        torch.save(TensorDataset(df_valid_X[i][j], df_valid_y[i][j]), f'datasets/valid_dataset_res_{resolution}_bucket_{timebucket}.pt')
        torch.save(TensorDataset(df_test_X[i][j], df_test_y[i][j]), f'datasets/test_dataset_res_{resolution}_bucket_{timebucket}.pt')

# Delete variables to free up memory
del df_merged, df_data, df_train_X, df_valid_X, df_test_X, df_train_y, df_valid_y, df_test_y