# Machine Learning examples with interpreTS

This notebook shows further examples of training machine-learning models using interpreTS (version 0.4.1).

In [2]:
import pandas as pd
import numpy as np
import interpreTS as it
from sktime.datasets import load_airline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

### Regression with interpreTS and Multiple ML Models

In [3]:
# Load the airline series as a frame with a single "passengers" column,
# indexed by a "timestamp" column converted from periods to timestamps.
data = load_airline().reset_index()
data.columns = ["timestamp", "passengers"]
data["timestamp"] = data["timestamp"].dt.to_timestamp()
data = data.set_index("timestamp")

In [4]:
print(data.head())

            passengers
timestamp             
1949-01-01       112.0
1949-02-01       118.0
1949-03-01       132.0
1949-04-01       129.0
1949-05-01       121.0


In [5]:
# Derive calendar features from the timestamp index.
data = data.assign(year=data.index.year, month=data.index.month)
# Running month counter: 1 for January of the earliest year in the data.
first_year = data["year"].min()
data["adjusted_month"] = (data["year"] - first_year) * 12 + data["month"]
print(data.head())

            passengers  year  month  adjusted_month
timestamp                                          
1949-01-01       112.0  1949      1               1
1949-02-01       118.0  1949      2               2
1949-03-01       132.0  1949      3               3
1949-04-01       129.0  1949      4               4
1949-05-01       121.0  1949      5               5


In [6]:
# Add the forecasting target: the passenger count 12 months AHEAD of each row.
# shift(-12) moves future values back onto the current row, so this is a
# lead of the series, not a lag.
data["target"] = data["passengers"].shift(-12)  # Predict passengers 12 months ahead
# Rows without a value 12 months ahead get a NaN target and are dropped here.
# NOTE(review): this cell rebinds `data`; re-running it without re-running
# the cells above shifts the already-truncated frame and drops further rows.
data = data.dropna()
print(data)

            passengers  year  month  adjusted_month  target
timestamp                                                  
1949-01-01       112.0  1949      1               1   115.0
1949-02-01       118.0  1949      2               2   126.0
1949-03-01       132.0  1949      3               3   141.0
1949-04-01       129.0  1949      4               4   135.0
1949-05-01       121.0  1949      5               5   125.0
...                ...   ...    ...             ...     ...
1959-08-01       559.0  1959      8             128   606.0
1959-09-01       463.0  1959      9             129   508.0
1959-10-01       407.0  1959     10             130   461.0
1959-11-01       362.0  1959     11             131   390.0
1959-12-01       405.0  1959     12             132   432.0

[132 rows x 5 columns]


----------------------------------------------

In [7]:
# Feature extraction
# Configure the interpreTS extractor with the nine features to compute.
# NOTE(review): "adjusted_month" is a running month counter with a distinct
# value on every row of `data`, so as an id column it cannot form multi-row
# groups — confirm this is the intended identifier.
extractor = it.FeatureExtractor(
    id_column="adjusted_month",
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
        it.Features.SPIKENESS,
        it.Features.TREND_STRENGTH,
        it.Features.SEASONALITY_STRENGTH,
        it.Features.LINEARITY,
        it.Features.ABSOLUTE_ENERGY,
        it.Features.MEAN_CHANGE,
    ],
)

In [8]:
# Extract features for the full dataset
# Only numeric columns are handed to the extractor.
numeric_columns = data.select_dtypes(include=["number"]).columns
data_numeric = data[numeric_columns]
# NOTE(review): the printed result is a single row, and its mean_change_*
# cells hold whole Series instead of scalars; that is what later breaks
# scaling and model fitting.
features = extractor.extract_features(data_numeric, mode="sequential")

In [9]:
print(features)

   mean_passengers  mean_year  mean_month  mean_target  variance_passengers  \
0       262.492424     1954.0         6.5   294.265152         11369.061011   

   variance_year  variance_month  variance_target  entropy_passengers  \
0      10.076336       12.007634     13335.814654            0.964481   

   entropy_year  ...  linearity_month  linearity_target  \
0      0.998999  ...              0.0           0.00025   

   absolute_energy_passengers  absolute_energy_year  absolute_energy_month  \
0                  10584447.0             503992632                   7150   

   absolute_energy_target                             mean_change_passengers  \
0              13177133.0  timestamp
1949-01-01     NaN
1949-02-01     Na...   

                                    mean_change_year  \
0  timestamp
1949-01-01    NaN
1949-02-01    NaN
...   

                                   mean_change_month  \
0  timestamp
1949-01-01    NaN
1949-02-01    NaN
...   

                               

In [11]:
# Map column names
# NOTE(review): `features` is already a DataFrame whose columns are named
# "<feature>_<source column>". Passing a DataFrame to pd.DataFrame together
# with `columns=` selects columns by label instead of renaming them. None of
# the "<feature>_series" labels exist, so `features_df` comes out all-NaN and
# no ValueError is raised (the except branch never runs).
try:
    feature_columns = [f"{feature}_series" for feature in extractor.features]
    features_df = pd.DataFrame(features, columns=feature_columns)
except ValueError as e:
    print("Błąd mapowania nazw kolumn:", e)
    # Debug: show the data shape and the number of requested column names
    print(f"Rozmiar danych: {features.shape}, liczba kolumn: {len(extractor.features)}")

In [12]:
print(feature_columns)

['mean_series', 'variance_series', 'entropy_series', 'spikeness_series', 'trend_strength_series', 'seasonality_strength_series', 'linearity_series', 'absolute_energy_series', 'mean_change_series']


In [13]:
print(features_df)

   mean_series  variance_series  entropy_series  spikeness_series  \
0          NaN              NaN             NaN               NaN   

   trend_strength_series  seasonality_strength_series  linearity_series  \
0                    NaN                          NaN               NaN   

   absolute_energy_series  mean_change_series  
0                     NaN                 NaN  


In [14]:
# Try to standardise the extracted features.
# NOTE(review): the printed error ("setting an array element with a
# sequence") is consistent with the Series-valued mean_change_* cells. When
# the call fails, `features_scaled` is not assigned by this cell.
scaler = StandardScaler()
try:
    features_scaled = scaler.fit_transform(features)
except ValueError as e:
    print("Błąd skalowania:", e)

Błąd skalowania: setting an array element with a sequence.


In [15]:
# Chronological train/test split by row position (no shuffling).
# NOTE(review): `features` has a single row at this point, so
# int(1 * (1 - 0.2)) == 0 and both training partitions are empty; the models
# below then report "Found array with 0 sample(s)".
test_pct = 0.2
split_index = int(len(features) * (1 - test_pct))
features_train, features_test = features[:split_index], features[split_index:]
train_y, test_y = data["target"].iloc[:split_index].reset_index(drop=True), data["target"].iloc[split_index:].reset_index(drop=True)

In [16]:
# Candidate regressors keyed by display name. The tree-based models get a
# fixed random_state so repeated runs are comparable.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Support Vector Regression": SVR(kernel="rbf"),
    "XGBoost": XGBRegressor(random_state=42),
}

In [17]:
# Fit every model and record its test-set RMSE under the model's name.
results = {}
for model_name, model in models.items():
    try:
        model.fit(features_train, train_y)
        predictions = model.predict(features_test)
        rmse = np.sqrt(mean_squared_error(test_y, predictions))
        results[model_name] = rmse
    except ValueError as e:
        # Report the failure and continue with the next model; a failed
        # model simply has no entry in `results`.
        print(f"Błąd podczas trenowania modelu {model_name}:", e)

# Print the collected scores (prints only the header when every fit failed).
# NOTE(review): this loop rebinds `model` and `rmse` to the last printed pair.
print("Wyniki RMSE:")
for model, rmse in results.items():
    print(f"{model}: {rmse:.4f}")

Błąd podczas trenowania modelu Linear Regression: Found array with 0 sample(s) (shape=(0, 36)) while a minimum of 1 is required by LinearRegression.
Błąd podczas trenowania modelu Random Forest: Found array with 0 sample(s) (shape=(0, 36)) while a minimum of 1 is required by RandomForestRegressor.
Błąd podczas trenowania modelu Support Vector Regression: Found array with 0 sample(s) (shape=(0, 36)) while a minimum of 1 is required by SVR.
Błąd podczas trenowania modelu XGBoost: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:mean_change_passengers: object, mean_change_year: object, mean_change_month: object, mean_change_target: object
Wyniki RMSE:


In [18]:
# NOTE(review): these imports repeat the ones in the notebook's first cell.
import pandas as pd
import numpy as np
import interpreTS as it

# Minimal dataset: two ids with three values each
df_test = pd.DataFrame({
    "adjusted_month": [1, 1, 1, 2, 2, 2],
    "value": [10, 20, 30, 40, 50, 60]
})

# Create the FeatureExtractor
# NOTE(review): this rebinds the notebook-level `extractor` and `features`.
extractor = it.FeatureExtractor(id_column="adjusted_month", features=[
    it.Features.MEAN,
    it.Features.VARIANCE,
    it.Features.ENTROPY
])

# Call extract_features
# NOTE(review): the printed result is one row with mean 35.0, which is the
# mean of all six values, so the values were not split per id in this call.
features = extractor.extract_features(df_test)

print(features)

   mean_value  variance_value  entropy_value
0        35.0           350.0        0.99857


In [19]:
print(data_numeric.info())
print(data_numeric.head())
print(data_numeric.isna().sum())


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 132 entries, 1949-01-01 to 1959-12-01
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   passengers      132 non-null    float64
 1   year            132 non-null    int32  
 2   month           132 non-null    int32  
 3   adjusted_month  132 non-null    int32  
 4   target          132 non-null    float64
dtypes: float64(2), int32(3)
memory usage: 4.6 KB
None
            passengers  year  month  adjusted_month  target
timestamp                                                  
1949-01-01       112.0  1949      1               1   115.0
1949-02-01       118.0  1949      2               2   126.0
1949-03-01       132.0  1949      3               3   141.0
1949-04-01       129.0  1949      4               4   135.0
1949-05-01       121.0  1949      5               5   125.0
passengers        0
year              0
month             0
adjusted_month    0
target 

In [20]:
print(data_numeric.groupby("adjusted_month").size())

adjusted_month
1      1
2      1
3      1
4      1
5      1
      ..
128    1
129    1
130    1
131    1
132    1
Length: 132, dtype: int64


In [21]:
# Repeat the extraction on the real data with three features only and
# without the `mode` argument.
# NOTE(review): the printed result is again a single row of whole-dataset
# statistics, although "adjusted_month" has 132 distinct values.
extractor = it.FeatureExtractor(
    id_column="adjusted_month",
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(data_numeric)
print(features)


   mean_passengers  mean_year  mean_month  mean_target  variance_passengers  \
0       262.492424     1954.0         6.5   294.265152         11369.061011   

   variance_year  variance_month  variance_target  entropy_passengers  \
0      10.076336       12.007634     13335.814654            0.964481   

   entropy_year  entropy_month  entropy_target  
0      0.998999       0.998891        0.962286  


In [25]:
# Conclusions
# adjusted_month was used as id_column, but it has 132 unique values (every row has its own value). As a result, every group contains exactly one row.
# ENTROPY may be misinterpreted when the data within a group does not contain enough distinct values.
# The code raises no errors or warnings even though the input does not fit the function's logic. This can lead to misleading interpretations of the results; here, for example, the chosen id_column makes no sense. A check that validates the choice of id_column could be added.
#

In [26]:
# Group by month
# NOTE(review): this adds "group_month" to `data_numeric` in place, so every
# later cell that reuses or copies `data_numeric` also carries that column
# (it shows up as *_group_month in the outputs further down).
data_numeric["group_month"] = data_numeric.index.month

extractor = it.FeatureExtractor(
    id_column="group_month",
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(data_numeric)
print(features)


   mean_passengers  mean_year  mean_month  mean_adjusted_month  mean_target  \
0       262.492424     1954.0         6.5                 66.5   294.265152   

   variance_passengers  variance_year  variance_month  \
0         11369.061011      10.076336       12.007634   

   variance_adjusted_month  variance_target  entropy_passengers  entropy_year  \
0                   1463.0     13335.814654            0.964481      0.998999   

   entropy_month  entropy_adjusted_month  entropy_target  
0       0.998891                0.997381        0.962286  


In [28]:
# Constant-signal check: overwrite "passengers" with one repeated value on a
# copy of the data. The printed result shows variance_passengers and
# entropy_passengers both equal to 0.0.
data_uniform = data_numeric.copy()
data_uniform["passengers"] = 100  # the same values

extractor = it.FeatureExtractor(
    id_column="year",  # Group by year
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(data_uniform)
print(features)


   mean_passengers  mean_month  mean_adjusted_month  mean_target  \
0            100.0         6.5                 66.5   294.265152   

   mean_group_month  variance_passengers  variance_month  \
0               6.5                  0.0       12.007634   

   variance_adjusted_month  variance_target  variance_group_month  \
0                   1463.0     13335.814654             12.007634   

   entropy_passengers  entropy_month  entropy_adjusted_month  entropy_target  \
0                 0.0       0.998891                0.997381        0.962286   

   entropy_group_month  
0             0.998891  


In [29]:
# Random data
# Replace "passengers" with uniform random integers in [50, 200) on a copy of
# the data. A seeded generator is used so the printed feature values are the
# same after every kernel restart; numpy is already imported in the first
# cell, so it is not imported again here.
rng = np.random.default_rng(42)
data_random = data_numeric.copy()
data_random["passengers"] = rng.integers(50, 200, size=len(data_random))

extractor = it.FeatureExtractor(
    id_column="month",  # Group by month
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(data_random)
print(features)


   mean_passengers  mean_year  mean_adjusted_month  mean_target  \
0       124.401515     1954.0                 66.5   294.265152   

   mean_group_month  variance_passengers  variance_year  \
0               6.5          1782.608547      10.076336   

   variance_adjusted_month  variance_target  variance_group_month  \
0                   1463.0     13335.814654             12.007634   

   entropy_passengers  entropy_year  entropy_adjusted_month  entropy_target  \
0            0.994492      0.998999                0.997381        0.962286   

   entropy_group_month  
0             0.998891  


In [30]:
# NaN values
# Blank out "passengers" for every May row on a copy of the data. The printed
# result shows <NA> for all three passengers features, so one missing month
# invalidates the whole column's statistics.
data_with_nan = data_numeric.copy()
data_with_nan.loc[data_with_nan.index.month == 5, "passengers"] = np.nan  

extractor = it.FeatureExtractor(
    id_column="month",
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(data_with_nan)
print(features)


  mean_passengers  mean_year  mean_adjusted_month  mean_target  \
0            <NA>     1954.0                 66.5   294.265152   

   mean_group_month variance_passengers  variance_year  \
0               6.5                <NA>      10.076336   

   variance_adjusted_month  variance_target  variance_group_month  \
0                   1463.0     13335.814654             12.007634   

  entropy_passengers  entropy_year  entropy_adjusted_month  entropy_target  \
0               <NA>      0.998999                0.997381        0.962286   

   entropy_group_month  
0             0.998891  


In [None]:
# Conclusions: show more detailed error messages or warnings...
# Add handling for missing data, e.g. an option to ignore NaN values (skipna=True) in the calculations.

In [31]:
# New features
# Add two columns derived from "passengers" (a 1.1x rescaling and log1p) on a
# copy of the data, then extract the same three features. In the printed
# result entropy_feature_1 equals entropy_passengers (0.964481).
data_extended = data_numeric.copy()
data_extended["feature_1"] = data_numeric["passengers"] * 1.1
data_extended["feature_2"] = np.log1p(data_numeric["passengers"])

extractor = it.FeatureExtractor(
    id_column="year",  # Group by year
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(data_extended)
print(features)


   mean_passengers  mean_month  mean_adjusted_month  mean_target  \
0       262.492424         6.5                 66.5   294.265152   

   mean_group_month  mean_feature_1  mean_feature_2  variance_passengers  \
0               6.5      288.741667        5.491038         11369.061011   

   variance_month  variance_adjusted_month  ...  variance_group_month  \
0       12.007634                   1463.0  ...             12.007634   

   variance_feature_1  variance_feature_2  entropy_passengers  entropy_month  \
0        13756.563823            0.171701            0.964481       0.998891   

   entropy_adjusted_month  entropy_target  entropy_group_month  \
0                0.997381        0.962286             0.998891   

   entropy_feature_1  entropy_feature_2  
0           0.964481           0.986172  

[1 rows x 21 columns]


In [32]:
# Unusual type of id_column
# Use the float measurement column itself as the id. The call is accepted
# without any error in the printed output, and the passengers columns are
# simply absent from the result.
extractor = it.FeatureExtractor(
    id_column="passengers",  
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(data_numeric)
print(features)


   mean_year  mean_month  mean_adjusted_month  mean_target  mean_group_month  \
0     1954.0         6.5                 66.5   294.265152               6.5   

   variance_year  variance_month  variance_adjusted_month  variance_target  \
0      10.076336       12.007634                   1463.0     13335.814654   

   variance_group_month  entropy_year  entropy_month  entropy_adjusted_month  \
0             12.007634      0.998999       0.998891                0.997381   

   entropy_target  entropy_group_month  
0        0.962286             0.998891  


In [None]:
# Conclusions: there is no logical validation of the grouping column (id_column), which can lead to incorrect interpretation of the results.

In [33]:
# Test on small data
# Run the same three features on the first five rows only.
small_data = data_numeric.head(5)

extractor = it.FeatureExtractor(
    id_column="month",
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
    ],
)
features = extractor.extract_features(small_data)
print(features)


   mean_passengers  mean_year  mean_adjusted_month  mean_target  \
0            122.4     1949.0                  3.0        128.4   

   mean_group_month  variance_passengers  variance_year  \
0               3.0                 66.3            0.0   

   variance_adjusted_month  variance_target  variance_group_month  \
0                      2.5             99.8                   2.5   

   entropy_passengers  entropy_year  entropy_adjusted_month  entropy_target  \
0            0.999145           0.0                0.998799        0.997961   

   entropy_group_month  
0             0.998799  


In [132]:
# Select relevant features
# NOTE(review): the execution count and the printed values (mean 262.49)
# show this cell was run against the nine-feature extraction of the full
# dataset. Run top to bottom, `features` comes from the three-feature
# small-data cell and has no spikeness/trend_strength columns, so this
# selection would raise KeyError.
selected_columns = [
    "mean_passengers", "variance_passengers", "entropy_passengers",
    "spikeness_passengers", "trend_strength_passengers"
]
features = pd.DataFrame(features)[selected_columns]

# Verify extracted features
print(features)

   mean_passengers  variance_passengers  entropy_passengers  \
0       262.492424         11369.061011            0.964481   

   spikeness_passengers  trend_strength_passengers  
0              0.570796                   0.845783  


In [133]:
# Scale features
# NOTE(review): `features` has one row here, so each column is centred on its
# own single value and the printed result is all zeros.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
print(features_scaled)

[[0. 0. 0. 0. 0.]]


In [134]:
# Ensure target variable matches features
# NOTE(review): `features` has one row at this point, so this rebinds `data`
# to its first row only; every later cell that reuses `data` is affected.
data = data.iloc[:len(features)]

In [135]:
# Train/test split (AFTER feature extraction)
# Positional 80/20 split with no shuffling.
# NOTE(review): with a one-row `features`, split_index is 0, so
# features_train and train_y are empty (train_y prints as an empty Series).
test_pct = 0.2
split_index = int(len(features) * (1 - test_pct))
features_train, features_test = features[:split_index], features[split_index:]
train_y, test_y = data["target"].iloc[:split_index].reset_index(drop=True), data["target"].iloc[split_index:].reset_index(drop=True)

In [138]:
print(train_y)

Series([], Name: target, dtype: float64)


In [None]:
# Results of tests: ...

--------------------------------------------------------

In [7]:
# Define sliding window for feature extraction
window_size = 12  # 12 months (one year)
sequences = []

In [8]:
# Create sliding windows
# One window per start position, each `window_size` consecutive rows long.
# The list is built in a single expression so that re-running this cell
# replaces `sequences` instead of appending a second copy of every window.
sequences = [
    data.iloc[start_idx:start_idx + window_size].copy()
    for start_idx in range(len(data) - window_size + 1)
]

In [9]:
# Extract features
# Same nine-feature configuration as in the first experiment; it is applied
# to each 12-row window in the next cell.
extractor = it.FeatureExtractor(
    id_column="adjusted_month",
    features=[
        it.Features.MEAN,
        it.Features.VARIANCE,
        it.Features.ENTROPY,
        it.Features.SPIKENESS,
        it.Features.TREND_STRENGTH,
        it.Features.SEASONALITY_STRENGTH,
        it.Features.LINEARITY,
        it.Features.ABSOLUTE_ENERGY,
        it.Features.MEAN_CHANGE,
    ],
)

In [10]:
# Extract features from each sequence
extracted_features = []
for sequence in sequences:
    # Restrict the window to its numeric columns before extraction.
    numeric_columns = sequence.select_dtypes(include=["number"]).columns
    sequence_numeric = sequence[numeric_columns]
    features = extractor.extract_features(sequence_numeric, mode="sequential")
    extracted_features.append(features)

# Flatten extracted_features to ensure it's 2D
# Stacks the per-window results vertically (printed shape: (121, 36)).
# NOTE(review): the printed rows show Series objects in the trailing columns,
# so the stacked array is not a purely numeric matrix.
extracted_features_flattened = np.vstack(extracted_features)

In [13]:
print("Shape of extracted features:", extracted_features_flattened.shape)
print("Number of features in extractor:", len(extractor.features))

Shape of extracted features: (121, 36)
Number of features in extractor: 9


In [15]:
# Verify the shape of extracted features
print("Shape of extracted features:", extracted_features_flattened.shape)

# Dynamically generate column names to match the data shape
# NOTE(review): these positional placeholders ("feature_0", ...) replace the
# descriptive "<feature>_<column>" names the extractor produced, which were
# lost when the results were stacked into a plain array.
feature_columns = [f"feature_{i}" for i in range(extracted_features_flattened.shape[1])]

# Create DataFrame with dynamically generated column names
features = pd.DataFrame(extracted_features_flattened, columns=feature_columns)

# Verify the DataFrame
print(features.head())

Shape of extracted features: (121, 36)
    feature_0    feature_1 feature_2   feature_3   feature_4 feature_5  \
0  126.666667       1949.0       6.5  139.666667  188.242424       0.0   
1  126.916667  1949.083333       6.5  142.166667  180.992424  0.083333   
2  127.583333  1949.166667       6.5  144.166667  173.356061  0.151515   
3  128.333333      1949.25       6.5      147.25  187.333333  0.204545   
4  128.833333  1949.333333       6.5  149.583333  191.060606  0.242424   

  feature_6   feature_7 feature_8 feature_9  ... feature_26 feature_27  \
0      13.0   363.69697   0.99429       0.0  ...        0.0   0.059659   
1      13.0  304.151515  0.992977  0.841106  ...       0.25   0.018318   
2      13.0  281.606061  0.990952   0.92805  ...       0.16   0.000775   
3      13.0  374.386364  0.993914  0.966266  ...       0.09    0.03121   
4      13.0  377.356061  0.994544  0.984764  ...       0.04    0.01088   

  feature_28 feature_29 feature_30 feature_31  \
0   194604.0   4558321

In [16]:
print(extracted_features_flattened)

[[126.66666666666667 1949.0 6.5 ... timestamp
                                    1949-01-01    NaN
                                    1949-02-01    NaN
                                    1949-03-01    NaN
                                    1949-04-01    NaN
                                    1949-05-01    NaN
                                    1949-06-01    0.0
                                    1949-07-01    0.0
                                    1949-08-01    0.0
                                    1949-09-01    0.0
                                    1949-10-01    0.0
                                    1949-11-01    0.0
                                    1949-12-01    0.0
                                    Name: year, dtype: float64
  timestamp
  1949-01-01    NaN
  1949-02-01    NaN
  1949-03-01    NaN
  1949-04-01    NaN
  1949-05-01    NaN
  1949-06-01    1.0
  1949-07-01    1.0
  1949-08-01    1.0
  1949-09-01    1.0
  1949-10-01    1.0
  1949-11-01    1.0
  1949-12-0

In [18]:
# Types of the values in extracted_features_flattened
# For each column, collect the distinct Python types of its elements to find
# the columns that hold something other than plain numbers.
for col_idx in range(extracted_features_flattened.shape[1]):
    unique_types = set(type(val) for val in extracted_features_flattened[:, col_idx])
    print(f"Column {col_idx} types: {unique_types}")

Column 0 types: {<class 'float'>}
Column 1 types: {<class 'float'>}
Column 2 types: {<class 'float'>}
Column 3 types: {<class 'float'>}
Column 4 types: {<class 'float'>}
Column 5 types: {<class 'float'>}
Column 6 types: {<class 'float'>}
Column 7 types: {<class 'float'>}
Column 8 types: {<class 'float'>}
Column 9 types: {<class 'float'>}
Column 10 types: {<class 'float'>}
Column 11 types: {<class 'float'>}
Column 12 types: {<class 'float'>}
Column 13 types: {<class 'int'>, <class 'float'>}
Column 14 types: {<class 'float'>}
Column 15 types: {<class 'float'>}
Column 16 types: {<class 'float'>}
Column 17 types: {<class 'float'>}
Column 18 types: {<class 'float'>}
Column 19 types: {<class 'float'>}
Column 20 types: {<class 'float'>}
Column 21 types: {<class 'float'>}
Column 22 types: {<class 'float'>}
Column 23 types: {<class 'float'>}
Column 24 types: {<class 'float'>}
Column 25 types: {<class 'float'>}
Column 26 types: {<class 'float'>}
Column 27 types: {<class 'float'>}
Column 28 types

In [19]:
# Convert columns to numbers
# Try to cast each column to float and write it back into the same array;
# columns that cannot be cast are reported (32-35 in the printed output).
# NOTE(review): the values are assigned back into the existing array, so the
# array's own dtype presumably stays unchanged — confirm before relying on it.
for col_idx in range(extracted_features_flattened.shape[1]):
    try:
        extracted_features_flattened[:, col_idx] = extracted_features_flattened[:, col_idx].astype(float)
    except ValueError:
        print(f"Cannot convert column {col_idx} to float")

Cannot convert column 32 to float
Cannot convert column 33 to float
Cannot convert column 34 to float
Cannot convert column 35 to float


In [20]:
# Data in sequences
for idx, sequence in enumerate(sequences):
    print(f"Sequence {idx} info:")
    print(sequence.info())


Sequence 0 info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12 entries, 1949-01-01 to 1949-12-01
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   passengers      12 non-null     float64
 1   year            12 non-null     int32  
 2   month           12 non-null     int32  
 3   adjusted_month  12 non-null     int32  
 4   target          12 non-null     float64
dtypes: float64(2), int32(3)
memory usage: 432.0 bytes
None
Sequence 1 info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12 entries, 1949-02-01 to 1950-01-01
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   passengers      12 non-null     float64
 1   year            12 non-null     int32  
 2   month           12 non-null     int32  
 3   adjusted_month  12 non-null     int32  
 4   target          12 non-null     float64
dtypes: float64(2), int32(3)
mem

In [21]:
print(f"Shape of extracted features: {extracted_features_flattened.shape}")
print(f"Sample of extracted features: {extracted_features_flattened[:5]}")


Shape of extracted features: (121, 36)
Sample of extracted features: [[126.66666666666667 1949.0 6.5 139.66666666666666 188.24242424242428 0.0
  13.0 363.6969696969697 0.9942904703154525 0.0 0.9978560377849336
  0.9946432601505386 0.17905526682819217 0.0 0.0 0.3896679151782587
  0.00040863836742780716 0.0 1.0 0.06376734413728247 0.0 0.0
  0.506993006993007 0.0 0.11716881874701102 0.0 0.0 0.05965855023789546
  194604.0 45583212.0 650.0 238082.0 timestamp
                                     1949-01-01    NaN
                                     1949-02-01    NaN
                                     1949-03-01    NaN
                                     1949-04-01    NaN
                                     1949-05-01    NaN
                                     1949-06-01    4.6
                                     1949-07-01    6.0
                                     1949-08-01    3.2
                                     1949-09-01    1.4
                                     1949-10-01

In [22]:
# Check the data and convert columns to float where possible
cleaned_features = []
for col_idx in range(extracted_features_flattened.shape[1]):
    try:
        column = extracted_features_flattened[:, col_idx].astype(float)
        cleaned_features.append(column)
    except ValueError:
        # Columns that cannot be cast are left out entirely.
        print(f"Skipping column {col_idx} due to non-numeric values")

# Convert to a numpy array and build the DataFrame
cleaned_features = np.array(cleaned_features).T  # Transpose to get one row per window
# NOTE(review): names are numbered over the kept columns only, so after a
# skipped column "feature_<i>" no longer matches the original column index.
feature_columns = [f"feature_{i}" for i in range(cleaned_features.shape[1])]

# Create the DataFrame
features_cleaned = pd.DataFrame(cleaned_features, columns=feature_columns)

# Verify the results
print(features_cleaned.head())


Skipping column 32 due to non-numeric values
Skipping column 33 due to non-numeric values
Skipping column 34 due to non-numeric values
Skipping column 35 due to non-numeric values
    feature_0    feature_1  feature_2   feature_3   feature_4  feature_5  \
0  126.666667  1949.000000        6.5  139.666667  188.242424   0.000000   
1  126.916667  1949.083333        6.5  142.166667  180.992424   0.083333   
2  127.583333  1949.166667        6.5  144.166667  173.356061   0.151515   
3  128.333333  1949.250000        6.5  147.250000  187.333333   0.204545   
4  128.833333  1949.333333        6.5  149.583333  191.060606   0.242424   

   feature_6   feature_7  feature_8  feature_9  ...  feature_22  feature_23  \
0       13.0  363.696970   0.994290   0.000000  ...    0.506993         0.0   
1       13.0  304.151515   0.992977   0.841106  ...    0.199301         0.0   
2       13.0  281.606061   0.990952   0.928050  ...    0.000000         0.0   
3       13.0  374.386364   0.993914   0.966266 

In [26]:
print(f"Number of features in extractor: {len(extractor.features)}")
print(f"Number of columns in extracted_features_flattened: {extracted_features_flattened.shape[1]}")

Number of features in extractor: 9
Number of columns in extracted_features_flattened: 36


In [27]:
feature_columns = [f"feature_{i}" for i in range(extracted_features_flattened.shape[1])]

In [28]:
features = pd.DataFrame(extracted_features_flattened, columns=feature_columns)

In [29]:
print(features.shape)
print(features.head())

(121, 36)
    feature_0    feature_1 feature_2   feature_3   feature_4 feature_5  \
0  126.666667       1949.0       6.5  139.666667  188.242424       0.0   
1  126.916667  1949.083333       6.5  142.166667  180.992424  0.083333   
2  127.583333  1949.166667       6.5  144.166667  173.356061  0.151515   
3  128.333333      1949.25       6.5      147.25  187.333333  0.204545   
4  128.833333  1949.333333       6.5  149.583333  191.060606  0.242424   

  feature_6   feature_7 feature_8 feature_9  ... feature_26 feature_27  \
0      13.0   363.69697   0.99429       0.0  ...        0.0   0.059659   
1      13.0  304.151515  0.992977  0.841106  ...       0.25   0.018318   
2      13.0  281.606061  0.990952   0.92805  ...       0.16   0.000775   
3      13.0  374.386364  0.993914  0.966266  ...       0.09    0.03121   
4      13.0  377.356061  0.994544  0.984764  ...       0.04    0.01088   

  feature_28  feature_29 feature_30 feature_31  \
0   194604.0  45583212.0      650.0   238082.0   


In [32]:
# Verify the feature counts
print(f"Number of features in extractor: {len(extractor.features)}")
print(f"Number of columns in extracted_features_flattened: {extracted_features_flattened.shape[1]}")

# Build dynamic column names: feature + index
# Each configured feature gets (columns // features) consecutive names, which
# assumes the array is laid out feature by feature with an equal number of
# columns per feature.
feature_columns = []
for feature in extractor.features:
    feature_columns.extend([f"{feature}_part_{i}" for i in range(extracted_features_flattened.shape[1] // len(extractor.features))])

# Verify the number of column names
print(f"Number of generated column names: {len(feature_columns)}")

# Make sure the number of columns matches the data
# (the integer division above silently drops names when it is not exact)
if len(feature_columns) != extracted_features_flattened.shape[1]:
    raise ValueError("Mismatch between number of feature columns and extracted data dimensions!")

# Create the DataFrame
features = pd.DataFrame(extracted_features_flattened, columns=feature_columns)

# Verify the results
print(features.head())


Number of features in extractor: 9
Number of columns in extracted_features_flattened: 36
Number of generated column names: 36
  mean_part_0  mean_part_1 mean_part_2 mean_part_3 variance_part_0  \
0  126.666667       1949.0         6.5  139.666667      188.242424   
1  126.916667  1949.083333         6.5  142.166667      180.992424   
2  127.583333  1949.166667         6.5  144.166667      173.356061   
3  128.333333      1949.25         6.5      147.25      187.333333   
4  128.833333  1949.333333         6.5  149.583333      191.060606   

  variance_part_1 variance_part_2 variance_part_3 entropy_part_0  \
0             0.0            13.0       363.69697        0.99429   
1        0.083333            13.0      304.151515       0.992977   
2        0.151515            13.0      281.606061       0.990952   
3        0.204545            13.0      374.386364       0.993914   
4        0.242424            13.0      377.356061       0.994544   

  entropy_part_1  ... linearity_part_2 linea

In [33]:
# Generate column names dynamically based on the extracted features
# NOTE(review): this yields one name per configured feature (9) while the
# array has 36 columns, so the DataFrame constructor raises the ValueError
# shown in the output.
feature_columns = [f"{features}_series" for features in extractor.features]
features = pd.DataFrame(extracted_features_flattened, columns=feature_columns)

# Verify results
print(features.head())

ValueError: Shape of passed values is (121, 36), indices imply (121, 9)

In [147]:
# Rebuild the sliding windows from the current `data`.
# A fresh list is assigned (rather than appending to the existing one) so
# windows from an earlier run are not duplicated in `sequences`.
sequences = [
    data.iloc[start_idx:start_idx + window_size].copy()
    for start_idx in range(len(data) - window_size + 1)
]

In [136]:
# Initialize models
# Fresh, unfitted instances of the same four regressors, keyed by display
# name; this rebinds the notebook-level `models`.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Support Vector Regression": SVR(kernel="rbf"),
    "XGBoost": XGBRegressor(random_state=42),
}

In [137]:
# Fit models and evaluate
# Train every candidate on the training split and store its test-set RMSE
# under the model's name.
# NOTE(review): errors are not caught here, so the first failing fit stops
# the loop; with the empty training split produced above, the first model
# raises ValueError and `results` stays empty.
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(features_train, train_y)
    predictions = model.predict(features_test)

    # Root of the mean squared error between true and predicted targets.
    rmse = np.sqrt(mean_squared_error(test_y, predictions))
    results[model_name] = rmse
    print(f"{model_name} RMSE: {rmse:.4f}\n")

Training Linear Regression...


ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required by LinearRegression.

In [None]:
# Report each model's RMSE, one line per model, to four decimal places.
print("Model Performance (RMSE):")
for model, rmse in results.items():
    print("{}: {:.4f}".format(model, rmse))

In [None]:
# Plot predictions from the best model (lowest RMSE)
if not results:
    # min() on an empty dict raises ValueError; `results` is empty when no
    # model could be trained.
    print("No model was trained successfully; nothing to plot.")
else:
    best_model_name = min(results, key=results.get)
    print(f"Best Model: {best_model_name}")

    best_model = models[best_model_name]
    best_predictions = best_model.predict(features_test)

    # Compare predictions with the held-out targets they were scored against.
    # `test_y` was re-indexed from 0 by the split cell, so the x axis is the
    # position within the test set rather than a timestamp.
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(test_y.index, test_y, label="True Values", alpha=0.7)
    ax.plot(test_y.index, best_predictions, label=f"Predicted ({best_model_name})", alpha=0.7)
    ax.set_xlabel("Test sample")
    ax.set_ylabel("Passengers")
    ax.set_title("True vs Predicted Values")
    ax.legend()
    plt.show()