In [9]:
# import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Tuple

ts_data = pd.read_parquet('/Users/rafaelduarte/Projects/airbnb/data/transformed/dallas-calendar_clean-2023-09-12.parquet')

ts_data.head()

Unnamed: 0_level_0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights,Year,Quarter,Month,Week,Weekday,Day,Dayofyear,Weekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-09-12,61878,2023-09-12,1.0,85.0,85.0,30,1125,2023,3,9,37,1,12,255,0
2023-09-13,61878,2023-09-13,1.0,85.0,85.0,30,1125,2023,3,9,37,2,13,256,0
2023-09-14,61878,2023-09-14,1.0,85.0,85.0,30,1125,2023,3,9,37,3,14,257,0
2023-09-15,61878,2023-09-15,1.0,85.0,85.0,30,1125,2023,3,9,37,4,15,258,0
2023-09-16,61878,2023-09-16,1.0,85.0,85.0,30,1125,2023,3,9,37,5,16,259,1


For our fist prediction, we're going to work only with one `listing_id`, which in this case will be `795703`.

In [4]:
# selecting only the desired listing_id
df = ts_data[ts_data['listing_id'] == 795703]

# checking the dataframe
print(df.available.value_counts())

df.head(10)

0.0    249
1.0    116
Name: available, dtype: int64


Unnamed: 0_level_0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights,Year,Quarter,Month,Week,Weekday,Day,Dayofyear,Weekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-09-12,795703,2023-09-12,1.0,229.0,229.0,30,365,2023,3,9,37,1,12,255,0
2023-09-13,795703,2023-09-13,1.0,229.0,229.0,30,365,2023,3,9,37,2,13,256,0
2023-09-14,795703,2023-09-14,1.0,229.0,229.0,30,365,2023,3,9,37,3,14,257,0
2023-09-15,795703,2023-09-15,1.0,279.0,279.0,30,365,2023,3,9,37,4,15,258,0
2023-09-16,795703,2023-09-16,1.0,279.0,279.0,30,365,2023,3,9,37,5,16,259,1
2023-09-17,795703,2023-09-17,1.0,229.0,229.0,30,365,2023,3,9,37,6,17,260,1
2023-09-18,795703,2023-09-18,1.0,229.0,229.0,30,365,2023,3,9,38,0,18,261,0
2023-09-19,795703,2023-09-19,1.0,229.0,229.0,30,365,2023,3,9,38,1,19,262,0
2023-09-20,795703,2023-09-20,1.0,229.0,229.0,30,365,2023,3,9,38,2,20,263,0
2023-09-21,795703,2023-09-21,1.0,229.0,229.0,30,365,2023,3,9,38,3,21,264,0


In [5]:
def get_cutoff_indices(
        data: pd.DataFrame,
        n_features: int,
        step_size: int
) -> list:
    """
    Given a time series data and the number of features and step size, returns a list of tuples containing the start,
    middle and end indices of each sub-sequence.

    :param data: A pandas DataFrame containing the time series data.
    :param n_features: An integer representing the number of features to use in each sub-sequence.
    :param step_size: An integer representing the step size between each sub-sequence.
    :return: A list of tuples containing the start, middle and end indices of each sub-sequence.
    """
    stop_position = len(data) - 1

    # start the first sub-sequence at index position 0
    subseq_first_idx = 0
    subseq_mid_idx = n_features
    subseq_last_idx = n_features + 1
    indices = []

    while subseq_last_idx <= stop_position:
        indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))

        subseq_first_idx += step_size
        subseq_mid_idx += step_size
        subseq_last_idx += step_size

    return indices

In [23]:
# Setting parameters for feature extraction
n_features = 7
step_size = 1

# Compute indices for data split
indices = get_cutoff_indices(df, n_features, step_size)
n_examples = len(indices)

indices[:5]

[(0, 7, 8), (1, 8, 9), (2, 9, 10), (3, 10, 11), (4, 11, 12)]

In [24]:
# Initialize arrays to store the features and targets
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
y = np.ndarray(shape=(n_examples), dtype=np.float32)
bookings = []

# Convert time series data to features and targets for the selected location
for i, idx in enumerate(indices):
    x[i, :] = df.iloc[idx[0]:idx[1]]['available'].values
    y[i] = df.iloc[idx[1]]['available']
    bookings.append(df.iloc[idx[1]]['Weekday'])

In [20]:
# Display shape and values of the features and targets for validation
print(f"{x.shape=}")
print(f"{x=}")
print(f"{y=}")
print(f"{bookings[:5]=}")

x.shape=(340, 24)
x=array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)
y=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [25]:
# Create a dataframe for the features
features_df = pd.DataFrame(x,
    columns=[f'bookings_previous_{i+1}_day' for i in reversed(range(n_features))])

features_df

Unnamed: 0,bookings_previous_7_day,bookings_previous_6_day,bookings_previous_5_day,bookings_previous_4_day,bookings_previous_3_day,bookings_previous_2_day,bookings_previous_1_day
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
352,1.0,1.0,1.0,1.0,1.0,1.0,1.0
353,1.0,1.0,1.0,1.0,1.0,1.0,1.0
354,1.0,1.0,1.0,1.0,1.0,1.0,1.0
355,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Create a dataframe for the targets
targets_df = pd.DataFrame(y,
    columns=['target_bookings_next_hour'])

targets_df

Unnamed: 0,target_bookings_next_hour
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
352,1.0
353,1.0
354,1.0
355,1.0


For our Airbnb Demand prediction task, we transform our time series dataset into features and targets suitable for training machine learning models. This involves creating sequences or "windows" of historical data as features and the subsequent hour's data as the target.

Parameters used:

* `ts_data`: The entire time series dataset.
* `n_features`: This specifies the length of the historical window we are considering. A value of 7 means we are using the past week (7 days) of data to predict the value of the subsequent day.
* `step_size`: The number of days the window moves forward for each subsequent data point. A value of 1 means we move forward by a day for each step, creating a new feature set for every day in our dataset.

The output consists of two parts:

* `features`: This contains the sequences of historical data. Each row represents a sequence of the past week's availability data, and each column in that row corresponds to a day's data.
* `targets`: This contains the actual availability data for the day immediately following each historical sequence in features.

In [42]:
def transform_ts_data_into_features_and_target(
    ts_data: pd.DataFrame,
    n_features: int,
    step_size: int
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Transforms a time series dataset into features and targets suitable for machine learning.

    Given a time series dataset with columns 'listing_id', 'date', and 'available', this function
    extracts the data for each location, and for each location it creates a set of features and targets. The
    features are created by selecting a window of n_features hours of 'available' data, and the target is the number
    of 'available' in the next hour. The window is moved step_size hours at a time, creating multiple examples. The
    function returns two dataframes: one with the features, and one with the targets.

    :param ts_data: A pandas DataFrame containing the time series data with the expected columns.
    :param n_features: An integer representing the number of features to use in each sub-sequence.
    :param step_size: An integer representing the step size between each sub-sequence.
    :return: A tuple containing two pandas DataFrames: the first one with the features, and the second one with
            the targets.
    """
    # No need to check columns since we assume they are already in the expected format

    location_ids = ts_data["listing_id"].unique()
    features = pd.DataFrame()
    targets = pd.DataFrame()

    for location_id in tqdm(location_ids):
        ts_data_one_location = ts_data.loc[ts_data['listing_id'] == location_id, ['date', 'available']]

        # pre-compute indices for one location
        indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)
        n_examples = len(indices)
        
        # transform time series data into features and targets
        x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
        y = np.ndarray(shape=(n_examples))
        pickup_hours = []

        for i, idx in enumerate(indices):
            x[i] = ts_data_one_location.iloc[idx[0]:idx[1]]['available'].values
            y[i] = ts_data_one_location.iloc[idx[2]]['available']
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['date'])

        features_one_location = pd.DataFrame(x, columns=[f'booking_previous_{i+1}_day' for i in reversed(range(n_features))])
        
        features_one_location['date'] = pickup_hours
        features_one_location['listing_id'] = location_id

        targets_one_location = pd.DataFrame(y, columns=["target_booking_next_day"])

        # concatenate features and targets for one location
        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, targets_one_location])

    features.reset_index(inplace=True, drop=True)
    targets.reset_index(inplace=True, drop=True)

    return features, targets['target_booking_next_day']


In [43]:
# Apply the transformation to the entire dataset
features, targets = transform_ts_data_into_features_and_target(
    ts_data=ts_data,
    n_features=7,
    step_size=1
)

# Display shapes for validation
print(f'{features.shape=}')
print(f'{targets.shape=}')

  0%|          | 0/5627 [00:00<?, ?it/s]

100%|██████████| 5627/5627 [09:59<00:00,  9.38it/s]

features.shape=(2008839, 9)
targets.shape=(2008839,)





In [44]:
# Display the first few rows of the transformed features
features.head(24)

Unnamed: 0,booking_previous_7_day,booking_previous_6_day,booking_previous_5_day,booking_previous_4_day,booking_previous_3_day,booking_previous_2_day,booking_previous_1_day,date,listing_id
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-19,61878
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-20,61878
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-21,61878
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-22,61878
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-23,61878
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-24,61878
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-25,61878
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-26,61878
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-27,61878
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2023-09-28,61878
