In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('../')

from bikesharing.interface.main import preprocess
from bikesharing.params import *

In [2]:
# Load the features (X) and targets (y) through the project's preprocessing
# entry point, then report the (rows, columns) shape of each frame.
X, y = preprocess()
for frame in (X, y):
    print(frame.shape)

[34m
Load preprocessed data from local CSV...[0m
(35040, 13)
(35040, 34)


In [3]:
# Display the first five rows of the feature frame.
X.iloc[:5]

Unnamed: 0,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos,is_holiday,is_weekend
0,0.355408,1.0,0.342007,0.227848,0.017391,0.62941,0.982963,0.75,0.933013,0.600779,0.989739,1,0
1,0.357616,0.987013,0.340149,0.24557,0.008696,0.75,0.933013,0.75,0.933013,0.600779,0.989739,1,0
2,0.359823,1.0,0.336431,0.303797,0.017391,0.853553,0.853553,0.75,0.933013,0.600779,0.989739,1,0
3,0.359823,0.987013,0.332714,0.341772,0.008696,0.933013,0.75,0.75,0.933013,0.600779,0.989739,1,0
4,0.359823,1.0,0.332714,0.356962,0.0,0.982963,0.62941,0.75,0.933013,0.600779,0.989739,1,0


In [4]:
# Display the first five rows of the target frame (one column per district).
y.iloc[:5]

Unnamed: 0,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,Laim,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Place features and targets side by side in a single frame; rows are
# aligned on the index shared by X and y.
data = pd.concat([X, y], axis='columns')
print(data.shape)
print(data.head(3))

(35040, 47)
   temperature_2m  relativehumidity_2m  apparent_temperature  windspeed_10m  \
0        0.355408             1.000000              0.342007       0.227848   
1        0.357616             0.987013              0.340149       0.245570   
2        0.359823             1.000000              0.336431       0.303797   

   precipitation  hour_sin  hour_cos  month_sin  month_cos   day_sin  ...  \
0       0.017391  0.629410  0.982963       0.75   0.933013  0.600779  ...   
1       0.008696  0.750000  0.933013       0.75   0.933013  0.600779  ...   
2       0.017391  0.853553  0.853553       0.75   0.933013  0.600779  ...   

   Schwanthalerhöhe  Sendling  Sendling-Westpark  Südgiesing  Thalkirchen  \
0               0.0       0.0                2.0         0.0          0.0   
1               0.0       2.0                0.0         0.0          0.0   
2               0.0       5.0                0.0         0.0          0.0   

   Trudering  Trudering-Riem  Untergiesing  Untergies

In [None]:
# data.to_csv('../raw_data/final_processed_xy_data.csv')
# `index` is a keyword of `DataFrame.to_csv`, not of `pd.read_csv`; passing it to
# `read_csv` raises a TypeError. The export above writes the row index as the
# first (unnamed) column, so read that column back as the index instead of
# letting it appear as a spurious 'Unnamed: 0' feature.
data = pd.read_csv('../raw_data/final_processed_xy_data.csv', index_col=0)
data.shape

## Train-Test split

In [7]:
## Functions for train-test split

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Sequence

def get_folds(
    df: pd.DataFrame,
    fold_length: int,
    fold_stride: int) -> List[pd.DataFrame]:
    """
    Cut a time-series dataframe of shape (n_timesteps, n_features) into
    fixed-size windows ("folds") taken at regular offsets.

    Args:
        df (pd.DataFrame): Full dataframe, one row per timestep
        fold_length (int): Number of rows in every fold
        fold_stride (int): Number of rows between the starts of two consecutive folds

    Returns:
        List[pd.DataFrame]: The folds, in chronological order. Windows that
        would run past the end of `df` are not included.
    """
    n_rows = len(df)
    return [
        df.iloc[start:start + fold_length, :]
        for start in range(0, n_rows, fold_stride)
        # Keep only the windows that fit entirely inside the dataframe
        if start + fold_length <= n_rows
    ]

def train_test_indices(fold:pd.DataFrame,
                    train_test_ratio: float,
                    input_length: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """From a fold dataframe, take a train dataframe and test dataframe based on
    the split ratio.
    - df_train contains all the timesteps until round(train_test_ratio * len(fold))
    - df_test contains all the timesteps needed to create all (X_test, y_test) tuples,
      i.e. it starts `input_length` rows before the end of the train part

    Previously this function consisted of a docstring only and therefore
    returned None; it now implements the contract described above.

    Args:
        fold (pd.DataFrame): A fold of timesteps
        train_test_ratio (float): The ratio between train and test 0-1
        input_length (int): How long each X_i will be

    Returns:
        Tuple[pd.DataFrame]: A tuple of two dataframes (fold_train, fold_test)

    Raises:
        ValueError: If the train part is shorter than `input_length`, so the
            test part cannot start `input_length` rows before the split point.
    """
    # TRAIN SET: everything before the split point
    last_train_idx = round(train_test_ratio * len(fold))
    fold_train = fold.iloc[0:last_train_idx, :]

    # TEST SET: starts `input_length` rows early so the first test target
    # still has a full input window in front of it
    first_test_idx = last_train_idx - input_length
    if first_test_idx < 0:
        # A negative start would make iloc count from the END of the fold
        raise ValueError(
            f"input_length ({input_length}) is larger than the train part "
            f"({last_train_idx} rows)"
        )
    fold_test = fold.iloc[first_test_idx:, :]

    return (fold_train, fold_test)
def train_test_split(fold:pd.DataFrame,
                     train_test_ratio: float,
                     input_length: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """From a fold dataframe, take a train dataframe and test dataframe based on
    the split ratio.
    - df_train contains all the timesteps until round(train_test_ratio * len(fold))
    - df_test contains all the timesteps needed to create all (X_test, y_test) tuples:
      it starts `input_length` rows before the split point, so it overlaps the
      last `input_length` rows of the train part

    Args:
        fold (pd.DataFrame): A fold of timesteps
        train_test_ratio (float): The ratio between train and test 0-1
        input_length (int): How long each X_i will be

    Returns:
        Tuple[pd.DataFrame]: A tuple of two dataframes (fold_train, fold_test)

    Raises:
        ValueError: If the train part is shorter than `input_length`, so the
            test part cannot start `input_length` rows before the split point.
    """

    # TRAIN SET
    last_train_idx = round(train_test_ratio * len(fold))
    fold_train = fold.iloc[0:last_train_idx, :]

    # TEST SET
    first_test_idx = last_train_idx - input_length
    if first_test_idx < 0:
        # Without this guard a negative start makes iloc count from the END
        # of the fold and silently returns the wrong slice.
        raise ValueError(
            f"input_length ({input_length}) is larger than the train part "
            f"({last_train_idx} rows)"
        )
    fold_test = fold.iloc[first_test_idx:, :]

    return (fold_train, fold_test)

In [8]:
# The target column labels of `y` (one per district); displayed below.
districts = y.columns
districts

Index(['Altstadt-Lehel', 'Au - Haidhausen', 'Aubing-Lochhausen-Langwied',
       'Berg am Laim', 'Bogenhausen', 'Feldmoching', 'Hadern', 'Harlaching',
       'Hasenbergl-Lerchenau Ost', 'Laim', 'Lochhausen',
       'Ludwigsvorstadt-Isarvorstadt', 'Maxvorstadt', 'Milbertshofen-Am Hart',
       'Moosach', 'Neuhausen-Nymphenburg', 'Obergiesing', 'Obermenzing',
       'Obersendling', 'Pasing', 'Pasing-Obermenzing', 'Ramersdorf-Perlach',
       'Schwabing-Freimann', 'Schwabing-West', 'Schwanthalerhöhe', 'Sendling',
       'Sendling-Westpark', 'Südgiesing', 'Thalkirchen', 'Trudering',
       'Trudering-Riem', 'Untergiesing', 'Untergiesing-Harlaching',
       'Untermenzing-Allach'],
      dtype='object')

In [9]:
# Windowing configuration, expressed in rows (the original comments count in hours).
FOLD_LENGTH = 2 * 365 * 24   # 2 years  -> 17520
FOLD_STRIDE = 91 * 24        # 3 months -> 2184
TRAIN_TEST_RATIO = 0.8
INPUT_LENGTH = 14 * 24       # 24 h * 14 d -> 336
OUTPUT_LENGTH = 24

folds = get_folds(df=data, fold_length=FOLD_LENGTH, fold_stride=FOLD_STRIDE)
print('Number of folds:', len(folds))

Number of folds: 9


In [10]:
# First fold
(fold_train, fold_test) = train_test_split(folds[0], TRAIN_TEST_RATIO, INPUT_LENGTH)
print(fold_train.shape)
print(fold_test.shape)

(14016, 47)
(3840, 47)


## Prediction for All Districts

In [11]:
# Predict every district at once: the target is the full set of district columns.
TARGET = districts
N_TARGETS = len(districts)
# Number of columns in the feature frame X
N_FEATURES = X.shape[1]
print('N_FEATURES:', N_FEATURES)

N_FEATURES: 13


In [12]:
## Functions for sampling (X_i, y_i) sequences from a fold
def get_Xi_yi(
    fold:pd.DataFrame,
    input_length:int,
    output_length:int,
    target=None) -> Tuple[pd.DataFrame]:
    """Given a fold, return one sequence (X_i, y_i) of the desired input_length
    and output_length, with the starting point of the sequence chosen uniformly
    at random among all positions where the whole sequence fits in the fold.

    Args:
        fold (pd.DataFrame): A single fold
        input_length (int): How long each X_i should be
        output_length (int): How long each y_i should be
        target: Column label (or list-like of labels) selected for y_i.
            Defaults to None, in which case the notebook-level `TARGET` is
            used, which is the original behaviour.

    Returns:
        Tuple[pd.DataFrame]: A tuple (X_i, y_i). X_i holds `input_length` rows
        with all columns of the fold; y_i holds the following `output_length`
        rows restricted to `target` (a Series when `target` is a single label).

    Raises:
        ValueError: If the fold is shorter than input_length + output_length.
    """
    if target is None:
        # Backward-compatible fallback to the global set elsewhere in the notebook
        target = TARGET

    # Exclusive upper bound for the random start: the last valid start is
    # len(fold) - (input_length + output_length)
    last_possible_start = len(fold) - (input_length + output_length) + 1
    if last_possible_start < 1:
        raise ValueError(
            f"fold has {len(fold)} rows but input_length + output_length = "
            f"{input_length + output_length}"
        )

    random_start = np.random.randint(0, last_possible_start)
    input_end = random_start + input_length
    X_i = fold.iloc[random_start:input_end]
    y_i = fold.iloc[input_end:input_end + output_length][target]

    return (X_i, y_i)

def get_X_y(
    fold:pd.DataFrame,
    number_of_sequences:int,
    input_length:int,
    output_length:int) -> Tuple[np.array]:
    """Draw `number_of_sequences` random (X_i, y_i) pairs from a fold and
    stack them into two numpy arrays.

    Args:
        fold (pd.DataFrame): Fold dataframe to sample from
        number_of_sequences (int): How many (X_i, y_i) pairs to draw
        input_length (int): Length of each X_i
        output_length (int): Length of each y_i

    Returns:
        Tuple[np.array]: A tuple of numpy arrays (X, y), with one entry along
        the first axis per drawn sequence
    """
    # Each pair is drawn independently, so windows may overlap or repeat
    samples = [
        get_Xi_yi(fold, input_length, output_length)
        for _ in range(number_of_sequences)
    ]
    inputs = [pair[0] for pair in samples]
    targets = [pair[1] for pair in samples]

    return np.array(inputs), np.array(targets)

In [13]:
# Draw one random (input window, target window) pair from each split to
# sanity-check the shapes before sampling in bulk.
X_train_i, y_train_i = get_Xi_yi(fold_train, INPUT_LENGTH, OUTPUT_LENGTH)
X_test_i, y_test_i = get_Xi_yi(fold_test, INPUT_LENGTH, OUTPUT_LENGTH)

In [14]:
# Shapes of the single sampled pair; the values labelled "y_train"/"y_test"
# are those of y_train_i / y_test_i.
print(f'X_train_i: {X_train_i.shape}, y_train: {y_train_i.shape}')
print(f'X_test_i: {X_test_i.shape}, y_test: {y_test_i.shape}')

X_train_i: (336, 47), y_train: (24, 34)
X_test_i: (336, 47), y_test: (24, 34)


In [15]:
# N_TRAIN = 8000 # number_of_sequences_train
# N_TEST =  2000 # number_of_sequences_test

# X_train, y_train = get_X_y(fold_train, N_TRAIN, INPUT_LENGTH, OUTPUT_LENGTH)
# X_test, y_test = get_X_y(fold_test, N_TEST, INPUT_LENGTH, OUTPUT_LENGTH)

# print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
# print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

In [16]:
# X_train[0,0,:]

In [17]:
# y_train[0,0,:]

## Prediction for Maxvorstadt

In [18]:
# Select a single target district
TARGET = 'Maxvorstadt'
N_TARGETS = len(districts)

In [19]:
# Prepare data for the selected target
X_train_i, y_train_i = get_Xi_yi(fold_train, INPUT_LENGTH, OUTPUT_LENGTH)
X_test_i, y_test_i = get_Xi_yi(fold_test, INPUT_LENGTH, OUTPUT_LENGTH)

# Shape of the data
print(f'X_train_i: {X_train_i.shape}, y_train_i: {y_train_i.shape}')
print(f'X_test_i: {X_test_i.shape}, y_test_i: {y_test_i.shape}')

X_train_i: (336, 47), y_train_i: (24,)
X_test_i: (336, 47), y_test_i: (24,)


In [20]:
# How many random sequences to draw from each split
N_TRAIN = 8_000  # number_of_sequences_train
N_TEST = 2_000   # number_of_sequences_test

X_train, y_train = get_X_y(
    fold=fold_train,
    number_of_sequences=N_TRAIN,
    input_length=INPUT_LENGTH,
    output_length=OUTPUT_LENGTH,
)
X_test, y_test = get_X_y(
    fold=fold_test,
    number_of_sequences=N_TEST,
    input_length=INPUT_LENGTH,
    output_length=OUTPUT_LENGTH,
)

print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (8000, 336, 47), y_train: (8000, 24)
X_test: (2000, 336, 47), y_test: (2000, 24)


## RNN Model

In [21]:
# Add a trailing singleton axis: (n_sequences, n_timesteps) -> (n_sequences, n_timesteps, 1).
# The sizes are read from the arrays themselves instead of being hard-coded
# (previously 8000 / 2000 / 24), so the cell keeps working when N_TRAIN, N_TEST
# or OUTPUT_LENGTH change. Re-running the cell leaves the shape unchanged.
y_train = np.reshape(y_train, (y_train.shape[0], y_train.shape[1], 1))
y_test = np.reshape(y_test, (y_test.shape[0], y_test.shape[1], 1))

print(f'y_train_rnn: {y_train.shape}')
print(f'y_test_rnn: {y_test.shape}')

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define the model architecture
# Layer sizes are derived from the data rather than written as literals
# (previously input_shape=(336, 47) and units=24), so the model stays in sync
# with INPUT_LENGTH, OUTPUT_LENGTH and the number of columns in the folds.
n_timesteps, n_channels = X_train.shape[1:]
n_outputs = y_train.shape[1]

model = Sequential()
model.add(LSTM(units=64, input_shape=(n_timesteps, n_channels)))
model.add(Dense(units=n_outputs))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
# NOTE(review): the test set is also used as validation data here, so the
# "Test Loss" below is not an independent estimate — consider a separate
# validation split.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print('Test Loss:', loss)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch losses recorded by model.fit
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# One point per epoch, numbered from 1
epochs = range(1, len(train_loss) + 1)

# Draw both curves on a single explicit Axes
fig, ax = plt.subplots()
ax.plot(epochs, train_loss, 'bo-', label='Training Loss')
ax.plot(epochs, val_loss, 'ro-', label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()