In [3]:
import pandas as pd
import random
import numpy as np

In [19]:
# --- Configuration ---------------------------------------------------------
START_TIME = "2023-01-01 00:00:00"  # first timestamp of the synthetic series
END_TIME = "2023-01-01 0:30:00"     # upper bound for the generated timestamps
FREQ = "7s"                         # spacing of the regular grid (pandas offset alias)
RANDOM_SEED = 42                    # fixed seed so every run yields the same frame

# Regular grid of timestamps, FREQ apart, between START_TIME and END_TIME.
datetime_range = pd.date_range(START_TIME, END_TIME, freq=FREQ)
df = pd.DataFrame({"time": datetime_range})

# Knock out 30% of the rows (sampled without replacement) so that the gaps
# between consecutive timestamps become irregular.
np.random.seed(RANDOM_SEED)
n_rows_to_drop = int(len(df) * 0.3)
drop_indices = np.random.choice(df.index, size=n_rows_to_drop, replace=False)
df = df.drop(drop_indices).reset_index(drop=True)

# Attach two synthetic value columns. The generator is re-seeded first, so
# these draws start again from the beginning of the seeded stream.
np.random.seed(RANDOM_SEED)
n_rows = len(df)
df["percent_change"] = 100 * np.random.rand(n_rows)           # uniform floats in [0, 100)
df["random_column2"] = np.random.randint(1, 50, size=n_rows)  # integers 1..49 (upper bound is exclusive)

# Print the generated frame.
print(df)

                   time  percent_change  random_column2
0   2023-01-01 00:00:00       37.454012              25
1   2023-01-01 00:00:07       95.071431              23
2   2023-01-01 00:00:21       73.199394              31
3   2023-01-01 00:00:28       59.865848              30
4   2023-01-01 00:00:35       15.601864              42
..                  ...             ...             ...
176 2023-01-01 00:29:03       69.093774              32
177 2023-01-01 00:29:24       38.673535              30
178 2023-01-01 00:29:31       93.672999              47
179 2023-01-01 00:29:45       13.752094              35
180 2023-01-01 00:29:59       34.106635              40

[181 rows x 3 columns]


In [20]:
def shift_column_by_time(df, time_col, value_col, shift_minutes):
    """
    Build a look-ahead label column for a downstream model.

    For every row, the label is the value of ``value_col`` taken from the
    first row (in time order) whose timestamp is at least ``shift_minutes``
    minutes later than the row's own timestamp.

    Args:
        df: DataFrame holding ``time_col`` and ``value_col``. It is not modified.
        time_col: Name of the timestamp column; parsed with ``pd.to_datetime``.
        value_col: Name of the column the labels are taken from.
        shift_minutes: Minimum look-ahead, in minutes.

    Returns:
        A copy of ``df`` sorted by ``time_col`` with a fresh 0..n-1 index and
        an extra column named ``value_col + "_label"``. Rows without an
        observation far enough in the future get a missing value (NaN/NaT),
        so a numeric ``value_col`` yields a numeric label column.
    """
    label_col = f'{value_col}_label'

    # Work on a copy so the datetime conversion never leaks into the caller's frame.
    out = df.copy()
    out[time_col] = pd.to_datetime(out[time_col])
    out = out.sort_values(by=time_col).reset_index(drop=True)

    # sort_values puts missing timestamps last, so the first n_valid rows are
    # exactly the rows that can be searched and that can receive a label.
    n_valid = int(out[time_col].notna().sum())
    times = out[time_col].iloc[:n_valid]
    targets = times + pd.Timedelta(minutes=shift_minutes)

    # Binary search instead of re-scanning the whole frame for every row:
    # side="left" yields the position of the first timestamp >= target.
    pos = np.searchsorted(times.to_numpy(), targets.to_numpy(), side="left")

    # -1 is never a label of the 0..n-1 index, so reindex() turns it into a
    # missing value for rows that have no late-enough observation.
    lookup = np.full(len(out), -1, dtype=np.int64)
    lookup[:n_valid] = np.where(pos < n_valid, pos, -1)
    out[label_col] = out[value_col].reindex(lookup).reset_index(drop=True)

    return out

In [44]:
def find_closest_timestamp(df, time_col, value_col, minutes):
    """
    Attach a look-ahead label column using an as-of merge.

    For every row, the label is the value of ``value_col`` from the first row
    whose timestamp is at least ``minutes`` minutes later than the row's own
    timestamp.

    Args:
        df: DataFrame holding ``time_col`` and ``value_col``. It is not modified.
        time_col: Name of the timestamp column; parsed with ``pd.to_datetime``.
        value_col: Name of the column the labels are taken from.
        minutes: Minimum look-ahead, in minutes.

    Returns:
        A new DataFrame sorted by ``time_col`` containing the columns of
        ``df`` followed by ``value_col + '_label'``. Rows without a
        late-enough observation get a missing value in the label column.
    """
    label_col = value_col + '_label'

    # Convert on a copy: assigning through ``df.loc[:, time_col]`` modifies the
    # caller's frame and, on recent pandas versions, may write in place and keep
    # the column's original dtype, which merge_asof does not accept as a key.
    left = df.copy()
    left[time_col] = pd.to_datetime(left[time_col])
    left = left.sort_values(by=time_col)

    # Right-hand side: the same observations moved `minutes` earlier, reduced
    # to the merge key and the single column that becomes the label.
    right = left[[time_col, value_col]].copy()
    right[time_col] = right[time_col] - pd.Timedelta(minutes=minutes)
    right = right.rename(columns={value_col: label_col})

    # direction='forward' picks, for each left row, the first shifted timestamp
    # that is >= the row's own timestamp.
    return pd.merge_asof(left, right, on=time_col, direction='forward')

In [46]:
# Label every row with the percent_change seen at least 1 minute later
# (merge_asof-based implementation) and show the last 20 rows.
new_df = find_closest_timestamp(df, 'time', 'percent_change', 1)
new_df.tail(20)

Unnamed: 0,time,percent_change,random_column2,percent_change_label
161,2023-01-01 00:26:43,63.230583,49,59.089294
162,2023-01-01 00:26:50,63.352971,42,67.756436
163,2023-01-01 00:26:57,53.577468,6,1.658783
164,2023-01-01 00:27:04,9.028977,15,51.209306
165,2023-01-01 00:27:11,83.53025,43,22.649578
166,2023-01-01 00:27:25,32.078006,37,64.517279
167,2023-01-01 00:27:32,18.651851,33,64.517279
168,2023-01-01 00:27:39,4.077514,8,64.517279
169,2023-01-01 00:27:46,59.089294,44,17.436643
170,2023-01-01 00:27:53,67.756436,44,17.436643


In [47]:
# Same 1-minute look-ahead labelling via shift_column_by_time, for a
# side-by-side comparison; show the last 20 rows.
new_df2 = shift_column_by_time(df, 'time', 'percent_change', 1)
new_df2.tail(20)

Unnamed: 0,time,percent_change,random_column2,percent_change_label
161,2023-01-01 00:26:43,63.230583,49,59.089294
162,2023-01-01 00:26:50,63.352971,42,67.756436
163,2023-01-01 00:26:57,53.577468,6,1.658783
164,2023-01-01 00:27:04,9.028977,15,51.209306
165,2023-01-01 00:27:11,83.53025,43,22.649578
166,2023-01-01 00:27:25,32.078006,37,64.517279
167,2023-01-01 00:27:32,18.651851,33,64.517279
168,2023-01-01 00:27:39,4.077514,8,64.517279
169,2023-01-01 00:27:46,59.089294,44,17.436643
170,2023-01-01 00:27:53,67.756436,44,17.436643
