In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv(r'..\dat\walks\random_walk_180min.csv')
#val_data = pd.read_csv(r'..\dat\walks\random_walk_300min.csv')

In [3]:
# get the mean difference of the "time" column
mean_diff = train_data['time'].diff().mean()
mean_diff

0.1052681461683026

In [4]:
train_data["finestep_norm"] = 2 * (train_data['finestep'] + np.power(2, 15)) / (2 * np.power(2, 15)) - 1

In [5]:
[int(x * 1 / mean_diff) for x in [2, 8, 16, 32]]

[18, 75, 151, 303]

In [7]:
# set the time column as the index
train_data["time"] = pd.to_datetime(train_data["time"], unit='s')
train_data.set_index('time', inplace=True)
train_data

Unnamed: 0_level_0,finestep,c_0,c_1,c_2,c_3,c_4,temp,c_mean,finestep_norm
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1970-01-01 00:00:00.000000000,-9382,14.0,14.0,14.0,14.0,17.0,20.5,14.6,-0.286316
1970-01-01 00:00:00.511491298,-9436,14.0,14.0,17.0,16.0,14.0,20.5,15.0,-0.287964
1970-01-01 00:00:00.607235193,-9489,16.0,17.0,14.0,14.0,14.0,20.5,15.0,-0.289581
1970-01-01 00:00:00.702979087,-9545,16.0,17.0,15.0,12.0,15.0,20.5,15.0,-0.291290
1970-01-01 00:00:00.798724174,-9606,16.0,15.0,14.0,14.0,15.0,20.5,14.8,-0.293152
...,...,...,...,...,...,...,...,...,...
1970-01-01 03:09:28.528164386,25927,-91.0,-94.0,-93.0,-90.0,-91.0,21.0,-91.8,0.791229
1970-01-01 03:09:28.624685287,25896,-93.0,-90.0,-87.0,-94.0,-94.0,21.0,-91.6,0.790283
1970-01-01 03:09:28.736151456,25861,-91.0,-93.0,-92.0,-91.0,-93.0,21.0,-92.0,0.789215
1970-01-01 03:09:28.847842931,25821,-93.0,-90.0,-90.0,-92.0,-93.0,21.0,-91.6,0.787994


In [55]:
import pandas as pd

def generate_features(df, column_name='finestep', thresholds=[0.75, -0.75]):
    """Generates features from a Pandas DataFrame containing a time series.

    Args:
        df (pd.DataFrame): DataFrame containing the time series data.
        column_name (str): Name of the column containing the signal (default: 'voltage').
        thresholds (list): List of thresholds for above/below time calculations (default: [0.75, -0.75]).

    Returns:
        pd.DataFrame: DataFrame with the original data and the generated features.
    """
    df["time"] = pd.to_datetime(df["time"], unit='s')
    df.set_index('time', inplace=True)

    #mean_diff = train_data['time'].diff().mean()
    timeframes = [2, 8, 16, 32]
    df_features = pd.DataFrame(index=df.index)
    df_features[column_name] = df[column_name]
    df_features['c_mean'] = df['c_mean']
    df_features['delta'] = df_features['c_mean'].diff().shift(-1)

    for tf in timeframes:
        window = pd.to_timedelta(f'{tf}min')  # Convert minutes to timedelta
        #window = int(tf * 1 / mean_diff)

        # Rolling Statistics
        df_features[f'rolling_mean_{tf}min'] = df[column_name].rolling(window).mean()
        df_features[f'rolling_max_{tf}min'] = df[column_name].rolling(window).max()
        df_features[f'rolling_min_{tf}min'] = df[column_name].rolling(window).min()

        # Peaks and Valleys (assuming your index is a DatetimeIndex)
        peaks = df[column_name][(df[column_name].shift(1) < df[column_name]) & (df[column_name].shift(-1) < df[column_name])]
        valleys = df[column_name][(df[column_name].shift(1) > df[column_name]) & (df[column_name].shift(-1) > df[column_name])]

        df_features[f'time_since_peak_{tf}min'] = df.index.to_series().map(lambda x: (x - peaks[peaks.index <= x].index.max()).total_seconds() / 60 if not peaks[peaks.index <= x].empty else float('nan'))
        # scale df[f'time_since_peak_{tf}min'] so that it is between 0 and 1, 1 being the maximum time since peak
        #df_features[f'time_since_peak_{tf}min'] = df_features[f'time_since_peak_{tf}min'] / tf
        df_features[f'time_since_valley_{tf}min'] = df.index.to_series().map(lambda x: (x - valleys[valleys.index <= x].index.max()).total_seconds() / 60 if not valleys[valleys.index <= x].empty else float('nan'))
        # scale df[f'time_since_valley_{tf}min'] so that it is between 0 and 1, 1 being the maximum time since valley
        #df_features[f'time_since_valley_{tf}min'] = df_features[f'time_since_valley_{tf}min'] / tf
        # Time Above/Below Threshold
        for threshold in thresholds:
            above_threshold = df[column_name] > threshold
            df_features[f'time_above_{threshold}_{tf}min'] = above_threshold.rolling(window).sum() * df.index.to_series().diff().dt.total_seconds() / 60
            # scale df[f'time_above_{threshold}_{tf}min'] so that it is between 0 and 1, 1 being the maximum time above threshold
            #df_features[f'time_above_{threshold}_{tf}min'] = df_features[f'time_above_{threshold}_{tf}min'] / tf
            below_threshold = df[column_name] < threshold
            df_features[f'time_below_{threshold}_{tf}min'] = below_threshold.rolling(window).sum() * df.index.to_series().diff().dt.total_seconds() / 60
            # scale df[f'time_below_{threshold}_{tf}min'] so that it is between 0 and 1, 1 being the maximum time below threshold
            #df_features[f'time_below_{threshold}_{tf}min'] = df_features[f'time_below_{threshold}_{tf}min'] / tf

    return df_features

In [56]:
train = train_data.copy()
train = generate_features(train)
train


In [None]:
X_train = train.drop(columns=['delta', 'finestep', 'c_mean'])
y_train = train['delta']

X_train, y_train

In [37]:
val_data = pd.read_csv(r'..\dat\walks\random_walk_300min.csv')
# set the time column as the index
val_data["time"] = pd.to_datetime(val_data["time"], unit='s')
val_data.set_index('time', inplace=True)
X_val = val_data[0:int(0.3 * len(val_data))].copy()
X_val = generate_features(X_val)
X_val

Unnamed: 0_level_0,rolling_mean_2min,rolling_max_2min,rolling_min_2min,time_since_peak_2min,time_since_valley_2min,time_above_0.75_2min,time_below_0.75_2min,time_above_-0.75_2min,time_below_-0.75_2min,rolling_mean_8min,...,time_below_-0.75_16min,rolling_mean_32min,rolling_max_32min,rolling_min_32min,time_since_peak_32min,time_since_valley_32min,time_above_0.75_32min,time_below_0.75_32min,time_above_-0.75_32min,time_below_-0.75_32min
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-01 00:00:00.000000000,-22036.000000,-22036.0,-22036.0,,,,,,,-22036.000000,...,,-22036.000000,-22036.0,-22036.0,,,,,,
1970-01-01 00:00:00.363215684,-21966.500000,-21897.0,-22036.0,,,0.0,0.012107,0.0,0.012107,-21966.500000,...,0.012107,-21966.500000,-21897.0,-22036.0,,,0.000000,0.012107,0.000000,0.012107
1970-01-01 00:00:00.472598314,-21897.666667,-21760.0,-22036.0,,,0.0,0.005469,0.0,0.005469,-21897.666667,...,0.005469,-21897.666667,-21760.0,-22036.0,,,0.000000,0.005469,0.000000,0.005469
1970-01-01 00:00:00.589865207,-21829.500000,-21625.0,-22036.0,,,0.0,0.007818,0.0,0.007818,-21829.500000,...,0.007818,-21829.500000,-21625.0,-22036.0,,,0.000000,0.007818,0.000000,0.007818
1970-01-01 00:00:00.698869705,-21762.000000,-21492.0,-22036.0,,,0.0,0.009084,0.0,0.009084,-21762.000000,...,0.009084,-21762.000000,-21492.0,-22036.0,,,0.000000,0.009084,0.000000,0.009084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 02:10:43.951942920,-26334.714286,-20652.0,-29368.0,0.080399,0.501763,0.0,2.008364,0.0,2.008364,-20138.366428,...,14.233675,-3731.871750,30481.0,-31431.0,0.080399,0.501763,13.574157,19.096226,13.574157,19.096226
1970-01-01 02:10:44.063125133,-26337.996289,-20652.0,-29368.0,0.082252,0.503616,0.0,1.997574,0.0,1.997574,-20139.383514,...,14.157202,-3732.993499,30481.0,-31431.0,0.082252,0.503616,13.501227,18.993628,13.501227,18.993628
1970-01-01 02:10:44.175300598,-26341.364564,-20652.0,-29368.0,0.084122,0.505485,0.0,2.015419,0.0,2.015419,-20140.403602,...,14.283676,-3734.120951,30481.0,-31431.0,0.084122,0.505485,13.621841,19.163309,13.621841,19.163309
1970-01-01 02:10:44.287312984,-26344.811688,-20652.0,-29368.0,0.085989,0.507352,0.0,2.012489,0.0,2.012489,-20141.424613,...,14.262910,-3735.253764,30481.0,-31431.0,0.085989,0.507352,13.602037,19.135449,13.602037,19.135449


In [39]:
y_val = val_data[0:int(0.3 * len(val_data))]['c_mean'].diff().shift(-1).dropna()
y_val

time
1970-01-01 00:00:00.000000000    3.6
1970-01-01 00:00:00.363215684   -0.8
1970-01-01 00:00:00.472598314    0.2
1970-01-01 00:00:00.589865207   -0.2
1970-01-01 00:00:00.698869705    0.6
                                ... 
1970-01-01 01:37:53.793154954    0.4
1970-01-01 01:37:53.905199050   -0.6
1970-01-01 01:37:54.016992807    1.6
1970-01-01 01:37:54.144671916   -1.0
1970-01-01 01:37:54.256468534    0.4
Name: c_mean, Length: 53961, dtype: float64

In [54]:
from sklearn.linear_model import LinearRegression

# Create a Linear Regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train.dropna(), y_train.dropna())

ValueError: Found input variables with inconsistent numbers of samples: [107713, 108000]

In [None]:
# Make predictions on the validation data
result = model.predict(X_val)

# Calculate the mean absolute error of your Linear Regression model on the validation data
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_val, result)
mae