# Feature extraction with interpreTS

In this tutorial, we show how you can use interpreTS instead of tsflex to extract features from time series and use them for regression.

In [9]:
import urllib.request as urllib2
from io import BytesIO
from zipfile import ZipFile

import numpy as np
import pandas as pd
import interpreTS as it

Loading in the data

In [45]:
zip_url: str = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
zipped_file_name: str = "household_power_consumption.txt"


df_power_consumption: pd.DataFrame = pd.read_csv(
    ZipFile(BytesIO(urllib2.urlopen(zip_url).read())).open(zipped_file_name),
    sep=";",
    parse_dates={"timestamp": ["Date", "Time"]},
    infer_datetime_format=True,
    low_memory=False,
    na_values=["nan", "?"],
    index_col="timestamp",
    dtype="float32",
)

df_power_consumption.info()

  df_power_consumption: pd.DataFrame = pd.read_csv(
  df_power_consumption: pd.DataFrame = pd.read_csv(


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2075259 entries, 2006-12-16 17:24:00 to 2010-11-26 21:02:00
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Global_active_power    float32
 1   Global_reactive_power  float32
 2   Voltage                float32
 3   Global_intensity       float32
 4   Sub_metering_1         float32
 5   Sub_metering_2         float32
 6   Sub_metering_3         float32
dtypes: float32(7)
memory usage: 71.2 MB


  df_power_consumption: pd.DataFrame = pd.read_csv(


In [46]:
df_power_consumption = df_power_consumption.dropna()
df_power_consumption.index.to_series().diff().value_counts().sample(3)

timestamp
0 days 00:02:00         38
0 days 00:01:00    2049208
0 days 00:34:00          1
Name: count, dtype: int64

In [47]:
df_power_consumption.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.630005,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.740005,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.679993,15.8,0.0,1.0,17.0


In [48]:
corr = df_power_consumption.corr() * np.tril(
    np.ones(tuple([len(df_power_consumption.columns)] * 2)), k=-1
)

corr.style.background_gradient(cmap="coolwarm", axis=None)

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
Global_active_power,0.0,0.0,-0.0,0.0,0.0,0.0,0.0
Global_reactive_power,0.247017,0.0,-0.0,0.0,0.0,0.0,0.0
Voltage,-0.399762,-0.112246,0.0,-0.0,-0.0,-0.0,-0.0
Global_intensity,0.998889,0.26612,-0.411363,0.0,0.0,0.0,0.0
Sub_metering_1,0.484401,0.123111,-0.195976,0.489298,0.0,0.0,0.0
Sub_metering_2,0.434569,0.139231,-0.167405,0.440347,0.054721,0.0,0.0
Sub_metering_3,0.638555,0.089617,-0.268172,0.626543,0.102571,0.080872,0.0


In [49]:
df_power_consumption["avg_15min_GAP"] = df_power_consumption.rolling("15min")[
    "Global_active_power"
].aggregate(np.nanmean)

  ].aggregate(np.nanmean)


In [55]:
train_columns = [f"Sub_metering_{i}" for i in range(1, 4)] + ["timestamp"]
target_col = "avg_15min_GAP"

# The percentage of data used for testing
test_pct = 0.2
day_margin = 3

# add the timestamp col
df_power_consumption["timestamp"] = df_power_consumption.index

# Ensure timestamp is in datetime format
df_power_consumption['timestamp'] = pd.to_datetime(df_power_consumption['timestamp'])

# Add 'year' and 'month' columns
df_power_consumption['year'] = df_power_consumption['timestamp'].dt.year
df_power_consumption['month'] = df_power_consumption['timestamp'].dt.month

# Add 'adjusted_month' column
df_power_consumption['adjusted_month'] = (df_power_consumption['year'] - 2007) * 12 + df_power_consumption['month']

# Temporal split logic
df_train = df_power_consumption[: -int(len(df_power_consumption) * test_pct)].copy()
df_test = df_power_consumption[df_train.index[-1] + pd.Timedelta(days=day_margin):].copy()

# Add MultiIndex for training data (adjusted_month, timestamp)
df_train = df_train.set_index(['adjusted_month', 'timestamp'])
df_train.sort_index(inplace=True)

# Add MultiIndex for testing data (adjusted_month, timestamp)
df_test = df_test.set_index(['adjusted_month', 'timestamp'])
df_test.sort_index(inplace=True)

# Output the training data head
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,avg_15min_GAP,month
adjusted_month,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12,2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0,4.216,12
12,2006-12-16 17:25:00,5.36,0.436,233.630005,23.0,0.0,1.0,16.0,4.788,12
12,2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0,4.983333,12
12,2006-12-16 17:27:00,5.388,0.502,233.740005,23.0,0.0,1.0,17.0,5.0845,12
12,2006-12-16 17:28:00,3.666,0.528,235.679993,15.8,0.0,1.0,17.0,4.8008,12


In [70]:
# Reset the index to remove the MultiIndex
df_test_reshaped = df_test.reset_index()
df_train_reshaped = df_train.reset_index()
train_y = df_train_reshaped['avg_15min_GAP']
train_y_monthly = df_train.groupby(level='adjusted_month').mean()
test_y = df_test_reshaped['avg_15min_GAP']
test_y_monthly = df_test.groupby(level='adjusted_month').mean()
df_test_reshaped.drop(columns=['timestamp', 'month', 'avg_15min_GAP'], inplace=True)
df_train_reshaped.drop(columns=['timestamp', 'month', 'avg_15min_GAP'], inplace=True)
train_y.shape, df_train_reshaped.shape, test_y.shape, df_test_reshaped.shape

((1639424,), (1639424, 8), (405537,), (405537, 8))

Feature extraction with interpreTS

In [76]:
extractor = it.FeatureExtractor(id_column="adjusted_month")
features_train = extractor.extract_features(df_train_reshaped)
features_test = extractor.extract_features(df_test_reshaped)
features_train.head()

Unnamed: 0,adjusted_month,length_Global_active_power,length_Global_reactive_power,length_Voltage,length_Global_intensity,length_Sub_metering_1,length_Sub_metering_2,length_Sub_metering_3,mean_Global_active_power,mean_Global_reactive_power,...,spikeness_Sub_metering_1,spikeness_Sub_metering_2,spikeness_Sub_metering_3,seasonality_strength_Global_active_power,seasonality_strength_Global_reactive_power,seasonality_strength_Voltage,seasonality_strength_Global_intensity,seasonality_strength_Sub_metering_1,seasonality_strength_Sub_metering_2,seasonality_strength_Sub_metering_3
0,12,21992,21992,21992,21992,21992,21992,21992,1.901295,0.131386,...,5.637046,5.073887,0.321571,0.88348,0.852107,0.95883,0.891263,0.820514,0.930466,0.980202
1,13,44638,44638,44638,44638,44638,44638,44638,1.546034,0.132676,...,5.441124,5.685272,0.325868,0.901837,0.861384,0.954732,0.902753,0.823161,0.931576,0.980758
2,14,40318,40318,40318,40318,40318,40318,40318,1.401084,0.113637,...,5.517168,5.840772,0.488657,0.94075,0.866933,0.940974,0.941904,0.786104,0.926132,0.982762
3,15,44639,44639,44639,44639,44639,44639,44639,1.318627,0.114747,...,5.099987,4.516423,0.536607,0.945153,0.871813,0.943282,0.944359,0.801939,0.930233,0.979583
4,16,39477,39477,39477,39477,39477,39477,39477,0.891189,0.118778,...,6.5287,8.388329,1.007143,0.929497,0.874644,0.963638,0.927853,0.802403,0.896615,0.978974


Using interpreTS for regression

In [64]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [77]:
gb_regressor = xgb.XGBRegressor(random_state=42)

gb_regressor.fit(features_train, train_y_monthly)

y_pred = gb_regressor.predict(features_test)

rmse = np.sqrt(mean_squared_error(test_y_monthly, y_pred))
print(f"RMSE: {rmse:.4f}")


RMSE: 2.0519
