In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import ast
import glob
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Flatten, Reshape, RepeatVector, TimeDistributed
from tensorflow.keras.optimizers import Adam

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

In [2]:
not_non_features = [
       'LOAD|1', 'LOAD|2', 'LOAD|3', 'LOAD|6',
       'ENC_POS|1', 'ENC_POS|2', 'ENC_POS|3','ENC_POS|6',
       'CTRL_DIFF2|1', 'CTRL_DIFF2|2', 'CTRL_DIFF2|3', 'CTRL_DIFF2|6',
       'TORQUE|1', 'TORQUE|2', 'TORQUE|3', 'TORQUE|6',
       'DES_POS|1', 'DES_POS|2', 'DES_POS|3', 'DES_POS|6',

       #'CURRENT|1', 'CURRENT|2', 'CURRENT|3',
        

       'CTRL_DIFF|1', 'CTRL_DIFF|2', 'CTRL_DIFF|3' ,'CTRL_DIFF|6',
       'CTRL_POS|1', 'CTRL_POS|2', 'CTRL_POS|3', 'CTRL_POS|6',
       'VEL_FFW|1', 'VEL_FFW|2','VEL_FFW|3', 'VEL_FFW|6',
       #'POWER|1','POWER|2', 'POWER|3', 'POWER|6',
       'CONT_DEV|1','CONT_DEV|2', 'CONT_DEV|3', 'CONT_DEV|6',
       #'A_DBD|0',
       'CMD_SPEED|1', 'CMD_SPEED|2', 'CMD_SPEED|3', 'CMD_SPEED|6',
       'TORQUE_FFW|1', 'TORQUE_FFW|2', 'TORQUE_FFW|3', 'TORQUE_FFW|6',
       'ENC1_POS|1', 'ENC1_POS|2', 'ENC1_POS|3','ENC1_POS|6',
       'ENC2_POS|1', 'ENC2_POS|2', 'ENC2_POS|3', 'ENC2_POS|6']
target = 'CURRENT|6'

In [3]:
file_path = ['C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/1_CMX/CMX1_AL_CP1.csv',
             'C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/1_CMX/CMX1_AL_CP2.csv',
             'C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/1_CMX/CMX1_S_CP1.csv',
             'C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/1_CMX/CMX1_S_CP2.csv',
             'C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/2_DMC/DMC2_AL_CP1.csv',
             'C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/2_DMC/DMC2_AL_CP2.csv',
             'C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/2_DMC/DMC2_S_CP1.csv',
             'C:/Users/genty/Hiwi-TimeSeries/dataset/DatAmount_dataset_new/2_DMC/DMC2_S_CP2.csv']

In [4]:
output_folder = "C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/"

# Global Model

In [5]:
result_summary = pd.read_csv("C:/Users/genty/Hiwi-TimeSeries/output/8dataset/result_summary_lstm_new.csv")
filtered_summary = result_summary[(result_summary["Data_Split"] == 0.2) & (result_summary["Features"] == "all")]
filtered_summary = filtered_summary.drop(columns=["Data_Split","Features","RunTime"])
filtered_summary = filtered_summary.reset_index(drop=True)
filtered_summary = filtered_summary[["Dataset","Train_R2","Test_R2","Train_RMSE","Test_RMSE"]]
filtered_summary.to_csv(os.path.join(output_folder, f"Global_LSTM_Summmary_2080.csv"), index=False)
print(filtered_summary)

           Dataset  Train_R2  Test_R2  Train_RMSE  Test_RMSE
0  CMX1_AL_CP1.csv    0.9996   0.8577      0.0172     0.3800
1  CMX1_AL_CP2.csv    0.9988  -1.0846      0.0300     1.1650
2   CMX1_S_CP1.csv    1.0000   0.8420      0.0119     1.0510
3   CMX1_S_CP2.csv    0.9999  -0.3814      0.0142     1.5052
4  DMC2_AL_CP1.csv    0.9990  -4.4139      0.0344     4.1321
5  DMC2_AL_CP2.csv    0.9955  -1.2232      0.1405     4.5284
6   DMC2_S_CP1.csv    0.9997   0.1944      0.0590     2.8296
7   DMC2_S_CP2.csv    0.9901  -0.1073      0.2122     3.0129


In [40]:
for file in file_path:
    df = pd.read_csv(file)
    print(len(df))

433666
74822
244772
45468
510050
64337
263913
42016


# Z-Score

In [5]:
def thresholding_algo(y, lag, influence, threshold):
    signals = np.zeros(len(y))
    filteredY = [0]*len(y)
    avgFilter = [0]*len(y)
    stdFilter = [0]*len(y)
    avgFilter[- 1] = np.mean(y[0:lag])
    stdFilter[- 1] = np.std(y[0:lag])
    for i in range(len(y)):
        #threshold = np.mean(y[i:i+lag])
        if abs(y[i] - avgFilter[i-1]) > threshold * stdFilter [i-1]:
            if y[i] > avgFilter[i-1]:
                signals[i] = 1
            else:
                signals[i] = -1

            filteredY[i] = influence * y[i] + (1 - influence) * filteredY[i-1]
            avgFilter[i] = np.mean(filteredY[(i-lag+1):i+1])
            stdFilter[i] = np.std(filteredY[(i-lag+1):i+1])
        else:
            signals[i] = 0
            filteredY[i] = y[i]
            avgFilter[i] = np.mean(filteredY[(i-lag+1):i+1])
            stdFilter[i] = np.std(filteredY[(i-lag+1):i+1])

    return dict(signals = np.asarray(signals),
                avgFilter = np.asarray(avgFilter),
                stdFilter = np.asarray(stdFilter))

In [6]:
def get_output_peaks(y, lag, influence, threshold):  
    result = thresholding_algo(y, lag, influence, threshold)
    peak_indices = [i for i, signal in enumerate(result['signals']) if signal != 0]
    #peak_info = get_peak_indices_by_column(df, lag, influence)
    peaks_df = pd.DataFrame({'Lag': lag,
                             'Influence': influence,
                             'Threshold': threshold,
                             'Num_Peaks': len(peak_indices),
                             'Peak_Indices': [peak_indices]})
    return peaks_df

In [None]:
for file in file_path:
    print("-------Data ---------")
    df = pd.read_csv(file)
    X = df[not_non_features]
    y = df[[target]]
    y = y.squeeze()
    df_p = pd.DataFrame()
    lag = [5000, 10000]
    influence = [0.5, 0.9]
    threshold = [0.5*y.abs().mean(), y.abs().mean(), 2*y.abs().mean()]
    filename = os.path.join(output_folder, f'Peaks_Info/{os.path.basename(file)}_peaks.csv')
    for lag_val in lag:
        for inf in influence:
            for th in threshold:
                print(lag_val, inf, th)
                print(type(y), y.shape)
                com_pk_df = get_output_peaks(y, lag_val, inf, th)
                print(com_pk_df)
                df_p = pd.concat([df_p, com_pk_df], ignore_index=True)
                df_p.to_csv(filename, index=False)


In [None]:
df_ = pd.read_csv
new_df = df_.drop(columns=["Peak_Indices"])
print(new_df)

# Statistical Tests

In [7]:
output_folder = 'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Stat/'
anova_results = []
tukey_results = []

In [8]:
# Function to run ANOVA and apply Tukey's test on significant factors
def analyze_dataset(df, filename):
    model = ols('Num_Peaks ~ C(Lag) + C(Influence) + C(Threshold) + \
                 C(Lag):C(Influence) + C(Lag):C(Threshold) + C(Influence):C(Threshold)', 
                 data=df).fit()
    
    anova_table = sm.stats.anova_lm(model, typ=2)
    print("\nANOVA Results:\n", anova_table)
    anova_table["Dataset"] = filename  
    anova_table.to_csv(os.path.join(output_folder, f"anova_{filename}.csv"), index=True)
    
    significant_factors = anova_table[anova_table["PR(>F)"] < 0.05].index.tolist()
    print("\nSignificant Factors:", significant_factors)
    
    tukey_results = []
    # Run Tukey’s HSD for each significant factor
    for factor in significant_factors:
        if "C(" in factor:  
            factor_name = factor.split('(')[1].split(')')[0]
            tukey_test = pairwise_tukeyhsd(df['Num_Peaks'], df[factor_name])
            tukeydf = pd.DataFrame(data=tukey_test._results_table.data[1:], columns=tukey_test._results_table.data[0])
            tukeydf["Dataset"] = filename
            tukeydf["Factor"] = factor_name
            tukey_results.append(tukeydf)
            print(f"\nTukey’s HSD for {factor_name}:\n", tukey_test)

    if tukey_results:
        tukey_results_df = pd.concat(tukey_results, ignore_index=True)
        tukey_results_df.to_csv(os.path.join(output_folder, f"tukey_{filename}.csv"), index=False)

In [9]:
folder_path = "C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Peaks_Info/"  
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

for file in csv_files:
    df = pd.read_csv(file)
    print(f"\n========== Dataset ==========")
    analyze_dataset(df, os.path.basename(file))



ANOVA Results:
                                  sum_sq   df             F    PR(>F)
C(Lag)                     6.164193e+08  1.0    926.781106  0.001077
C(Influence)               1.138411e+07  1.0     17.115913  0.053757
C(Threshold)               9.437151e+10  2.0  70943.375254  0.000014
C(Lag):C(Influence)        4.076336e+06  1.0      6.128736  0.131692
C(Lag):C(Threshold)        7.093251e+07  2.0     53.323203  0.018408
C(Influence):C(Threshold)  2.844454e+06  2.0      2.138306  0.318643
Residual                   1.330237e+06  2.0           NaN       NaN

Significant Factors: ['C(Lag)', 'C(Threshold)', 'C(Lag):C(Threshold)']

Tukey’s HSD for Lag:
       Multiple Comparison of Means - Tukey HSD, FWER=0.05       
group1 group2   meandiff  p-adj     lower        upper    reject
----------------------------------------------------------------
  5000  10000 -14334.3333 0.8036 -139363.2249 110694.5583  False
----------------------------------------------------------------

Tukey’s H

In [10]:
for file in csv_files:
    df = pd.read_csv(file)
    print(f"\n========== Dataset ==========")
    X = df[["Lag", "Influence", "Threshold"]]
    X = sm.add_constant(X)
    y = df["Num_Peaks"]

    model = sm.OLS(y, X).fit()
    print(model.summary())



                            OLS Regression Results                            
Dep. Variable:              Num_Peaks   R-squared:                       0.949
Model:                            OLS   Adj. R-squared:                  0.931
Method:                 Least Squares   F-statistic:                     50.12
Date:                Mon, 17 Mar 2025   Prob (F-statistic):           1.57e-05
Time:                        23:09:17   Log-Likelihood:                -135.87
No. Observations:                  12   AIC:                             279.7
Df Residuals:                       8   BIC:                             281.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.448e+05   3.59e+04      9.606      0.

  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)


# Data Split

In [11]:
def split_train_test_peak_nonpeak(X,y,peak_indices):
    train_size = int(len(X) * 0.3)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    train_peak_indices = [i for i in peak_indices if i < len(X_train)]
    test_peak_indices = [i for i in peak_indices if i >= len(X_train)]

    X_train_peak = X_train.loc[train_peak_indices]
    y_train_peak = y_train.loc[train_peak_indices]
    X_train_non_peak = X_train.drop(train_peak_indices)
    y_train_non_peak = y_train.drop(train_peak_indices)

    X_test_peak = X_test.loc[test_peak_indices]
    y_test_peak = y_test.loc[test_peak_indices]
    X_test_non_peak = X_test.drop(test_peak_indices)
    y_test_non_peak = y_test.drop(test_peak_indices)
    
    return X_train_peak, y_train_peak, X_train_non_peak, y_train_non_peak, X_test_peak, y_test_peak, X_test_non_peak, y_test_non_peak

In [26]:
def get_best_features(dataset_name):
    anova_file = f"anova_{dataset_name}.csv"
    tukey_file = f"tukey_{dataset_name}.csv"
    anova_df = pd.read_csv(f'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Stat/{anova_file}')
    tukey_df = pd.read_csv(f'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Stat/{tukey_file}')

    anova_df["Feature"] = anova_df.iloc[:, 0].str.extract(r'C\((.*?)\)')  
    anova_features = anova_df[anova_df["PR(>F)"] < 0.05]["Feature"].dropna().tolist()
    tukey_features = tukey_df[tukey_df["p-adj"] < 0.05]["Factor"].dropna().tolist()
    best_features = list(set(anova_features) & set(tukey_features))
    
    best_feature_values = {"Lag": None, "Influence": None, "Threshold": None}
    for feature in best_features:
        if feature in best_feature_values:
            best_value_row = tukey_df[(tukey_df["Factor"] == feature) & (tukey_df["reject"] == True)].nsmallest(1, "p-adj")
            if not best_value_row.empty:
                best_feature_values[feature] = float(best_value_row["group1"].values[0])
    if best_feature_values["Lag"] is None:
        best_feature_values["Lag"] = 5000  
    if best_feature_values["Influence"] is None:
        best_feature_values["Influence"] = 0.5  
    if best_feature_values["Threshold"] is None:
        best_feature_values["Threshold"] = 0.5 
        
    return best_feature_values

In [27]:
def get_peak_indices(peak_df, best_feature_values):
    selected_row = peak_df[
        (peak_df["Lag"] == best_feature_values.get("Lag")) &
        (peak_df["Influence"] == best_feature_values.get("Influence")) &
        (peak_df["Threshold"] == best_feature_values.get("Threshold"))
    ]

    if selected_row.empty:
        print(f"No matching peak row found")
        return []
    peak_indices = ast.literal_eval(selected_row["Peak_Indices"].values[0])
    peak_indices = list(map(int, peak_indices))

    return peak_indices

In [28]:
peaks_info_list = ['C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/CMX1_AL_CP1.csv_peaks.csv',
                   'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/CMX1_AL_CP2.csv_peaks.csv',
                    'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/CMX1_S_CP1.csv_peaks.csv',
                    'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/CMX1_S_CP2.csv_peaks.csv',
                    'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/DMC2_AL_CP1.csv_peaks.csv',
                    'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/DMC2_AL_CP2.csv_peaks.csv',
                    'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/DMC2_S_CP1.csv_peaks.csv',
                    'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeaksInfo/DMC2_S_CP2.csv_peaks.csv']

In [29]:
for i in range(8):
    print("Dataset:", i+1)
    df = pd.read_csv(file_path[i])
    df = df.abs()
    X = df[not_non_features]
    y = df[[target]]

    out_folder = f'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Data_Split/Data{i+1}'
    peak_info = pd.read_csv(peaks_info_list[i])
    peak_info_name = os.path.basename(peaks_info_list[i])
    print(peak_info_name)
    best_feature_values = get_best_features(peak_info_name)
    print(best_feature_values)
    peak_indices_list = get_peak_indices(peak_info, best_feature_values)
    print(peak_indices_list)

    X_train_peak, y_train_peak, X_train_non_peak, y_train_non_peak, X_test_peak, y_test_peak, X_test_non_peak, y_test_non_peak = split_train_test_peak_nonpeak(X,y,peak_indices_list)
    X_train_peak.to_csv(os.path.join(out_folder, f'X_train_p_{os.path.basename(file_path[i])}.csv'), index=True)
    y_train_peak.to_csv(os.path.join(out_folder, f'y_train_p_{os.path.basename(file_path[i])}.csv'), index=True)
    X_train_non_peak.to_csv(os.path.join(out_folder, f'X_train_np_{os.path.basename(file_path[i])}.csv'), index=True)
    y_train_non_peak.to_csv(os.path.join(out_folder, f'y_train_np_{os.path.basename(file_path[i])}.csv'), index=True)
    X_test_peak.to_csv(os.path.join(out_folder, f'X_test_p_{os.path.basename(file_path[i])}.csv'), index=True)
    y_test_peak.to_csv(os.path.join(out_folder, f'y_test_p_{os.path.basename(file_path[i])}.csv'), index=True)
    X_test_non_peak.to_csv(os.path.join(out_folder, f'X_test_np_{os.path.basename(file_path[i])}.csv'), index=True)
    y_test_non_peak.to_csv(os.path.join(out_folder, f'y_test_np_{os.path.basename(file_path[i])}.csv'), index=True)

Dataset: 1
CMX1_AL_CP1.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 0.5654305258320919}
[0, 5002, 5003, 5004, 5009, 5013, 5018, 5019, 5020, 5021, 5022, 5025, 5028, 5034, 5035, 5036, 5037, 5038, 5040, 5044, 5050, 5051, 5052, 5053, 5054, 5059, 5063, 5067, 5068, 5069, 5070, 5074, 5078, 5083, 5084, 5085, 5086, 5090, 5093, 5094, 5099, 5100, 5101, 5102, 5103, 5105, 5106, 5109, 5110, 5115, 5116, 5118, 5119, 5124, 5125, 5131, 5132, 5133, 5134, 5135, 5140, 5148, 5149, 5150, 5155, 5159, 5164, 5165, 5166, 5170, 5171, 5174, 5175, 5180, 5181, 5183, 5184, 5186, 5187, 5189, 5190, 5191, 5192, 5196, 5197, 5199, 5200, 5205, 5206, 5209, 5213, 5214, 5215, 5220, 5221, 5223, 5224, 5225, 5230, 5231, 5236, 5237, 5239, 5240, 5245, 5246, 5247, 5251, 5252, 5255, 5256, 5257, 5261, 5262, 5264, 5267, 5268, 5269, 5270, 5271, 5272, 5274, 5277, 5278, 5280, 5285, 5286, 5287, 5288, 5289, 5290, 5295, 5296, 5301, 5302, 5304, 5305, 5306, 5311, 5312, 5317, 5318, 5319, 5320, 5321, 5322, 5323, 5327, 5332, 5333, 

In [None]:
Dataset: 1
CMX1_AL_CP1.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 0.5654305258320919}
Dataset: 2
CMX1_AL_CP2.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 0.3970678243030125}
Dataset: 3
CMX1_S_CP1.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 1.3724146174848433}
Dataset: 4
CMX1_S_CP2.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 0.7526044037564882}
Dataset: 5
DMC2_AL_CP1.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 0.7007696713704539}
Dataset: 6
DMC2_AL_CP2.csv_peaks.csv
{'Lag': 5000.0, 'Influence': 0.5, 'Threshold': 0.5}
No matching peak row found
[]
Dataset: 7
DMC2_S_CP1.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 1.3069443327668586}
Dataset: 8
DMC2_S_CP2.csv_peaks.csv
{'Lag': 5000, 'Influence': 0.5, 'Threshold': 0.9355751005093296}

### Classification

In [30]:
base_folder = 'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Data_Split/'
subfolders = ['Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7', 'Data8']
output_folder = 'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Classification/'

In [31]:
def has_peaks(file_path):
    file_name = os.path.basename(file_path)    
    if 'np' in file_name.lower():
        return 0  
    else:
        return 1  

In [32]:
i = 1
for subfolder in subfolders:
    all_data_df = pd.DataFrame()
    folder_path = os.path.join(base_folder, subfolder)
    for file_name in os.listdir(folder_path):
        if file_name.startswith('X'):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path, index_col=0)
            target_value = 1 if has_peaks(file_path) else 0
            df['target'] = target_value
            all_data_df = pd.concat([all_data_df, df], ignore_index=False)
    all_data_df.sort_index(inplace=True)

    all_data_df.to_csv(os.path.join(output_folder, f'Data{i}_Classes.csv'), index=False)
    i = i+1

  all_data_df = pd.concat([all_data_df, df], ignore_index=False)
  all_data_df = pd.concat([all_data_df, df], ignore_index=False)
  all_data_df = pd.concat([all_data_df, df], ignore_index=False)
  all_data_df = pd.concat([all_data_df, df], ignore_index=False)


# Classification

In [33]:
folder = 'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Classification/'
classification_result = pd.DataFrame(columns=['dataset', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1'])

In [34]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

import joblib

In [35]:
for file_name in os.listdir(folder):
    print(file_name)
    file_path = os.path.join(folder,file_name)
    df_class = pd.read_csv(file_path)
    X = df_class[not_non_features]
    y = df_class['target'].values.ravel()
    train_size = int(len(X) * 0.2)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, scale_pos_weight=len(y_train) / y_train.sum())

    rf_model.fit(X_train, y_train)
    rf_train_pred = rf_model.predict(X_train)
    rf_test_pred = rf_model.predict(X_test)

    joblib.dump(rf_model, os.path.join(folder, f'RF_{file_name}.pkl'))


    rf_train_accuracy = accuracy_score(y_train, rf_train_pred)
    rf_test_accuracy = accuracy_score(y_test, rf_test_pred)
    rf_train_f1 = f1_score(y_train, rf_train_pred, average='weighted')
    rf_test_f1 = f1_score(y_test, rf_test_pred, average='weighted')

    result = pd.DataFrame([{
        'dataset': file_name + ' RF',
        'train_accuracy': rf_train_accuracy,
        'test_accuracy': rf_test_accuracy,
        'train_f1': rf_train_f1,
        'test_f1': rf_test_f1
    }])
    classification_result = pd.concat([classification_result, result], ignore_index=True)

    xgb_model.fit(X_train, y_train)
    xgb_train_pred = xgb_model.predict(X_train)
    xgb_test_pred = xgb_model.predict(X_test)

    joblib.dump(xgb_model, os.path.join(folder, f'XGB_{file_name}.pkl'))


    xgb_train_accuracy = accuracy_score(y_train, xgb_train_pred)
    xgb_test_accuracy = accuracy_score(y_test, xgb_test_pred)
    xgb_train_f1 = f1_score(y_train, xgb_train_pred, average='weighted')
    xgb_test_f1 = f1_score(y_test, xgb_test_pred, average='weighted')

    result = pd.DataFrame([{
        'dataset': file_name + ' XGB',
        'train_accuracy': rf_train_accuracy,
        'test_accuracy': rf_test_accuracy,
        'train_f1': rf_train_f1,
        'test_f1': rf_test_f1
    }])
    classification_result = pd.concat([classification_result, result], ignore_index=True)

classification_result.to_csv(os.path.join(folder, 'Class2080_ResultSummary.csv'), index=False)


Data1_Classes.csv


  classification_result = pd.concat([classification_result, result], ignore_index=True)
Parameters: { "use_label_encoder" } are not used.



Data2_Classes.csv


Parameters: { "use_label_encoder" } are not used.



Data3_Classes.csv


Parameters: { "use_label_encoder" } are not used.



Data4_Classes.csv


Parameters: { "use_label_encoder" } are not used.



Data5_Classes.csv


Parameters: { "use_label_encoder" } are not used.



Data6_Classes.csv


  xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, scale_pos_weight=len(y_train) / y_train.sum())
Parameters: { "use_label_encoder" } are not used.



Data7_Classes.csv


Parameters: { "use_label_encoder" } are not used.



Data8_Classes.csv


Parameters: { "use_label_encoder" } are not used.



In [36]:
print(classification_result) #20-80

                  dataset  train_accuracy  test_accuracy  train_f1   test_f1
0    Data1_Classes.csv RF             1.0       0.536043       1.0  0.548778
1   Data1_Classes.csv XGB             1.0       0.536043       1.0  0.548778
2    Data2_Classes.csv RF             1.0       0.633282       1.0  0.507159
3   Data2_Classes.csv XGB             1.0       0.633282       1.0  0.507159
4    Data3_Classes.csv RF             1.0       0.715195       1.0  0.639093
5   Data3_Classes.csv XGB             1.0       0.715195       1.0  0.639093
6    Data4_Classes.csv RF             1.0       0.623478       1.0  0.531014
7   Data4_Classes.csv XGB             1.0       0.623478       1.0  0.531014
8    Data5_Classes.csv RF             1.0       0.583874       1.0  0.581626
9   Data5_Classes.csv XGB             1.0       0.583874       1.0  0.581626
10   Data6_Classes.csv RF             1.0       1.000000       1.0  1.000000
11  Data6_Classes.csv XGB             1.0       1.000000       1.0  1.000000

# Peak Aware Model

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

In [5]:
peak_result_df = pd.DataFrame(columns=['Dataset', 'Train_R2', 'Test_R2', 'Train_RMSE', 'Test_RMSE'])
#peak_result_df = pd.read_csv("C:/Users/genty/Hiwi-TimeSeries/output/Autoencoder/PeakAwareModel/Peaks_ResultSummary.csv")
output_folder = 'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/PeakAwareModel/'
base_folder = 'C:/Users/genty/Hiwi-TimeSeries/output/Z-Score/Classification/'
filenamefinal = os.path.join(output_folder, 'Peaks_ResultSummary_2080.csv')

In [6]:
def split_rf(X,y, model):
    train_size = int(len(X) * 0.2)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    
    # Step 2: Use the rfmodel to predict peaks on the training set
    peak_predictions_train = model.predict(X_train)
    peak_predictions_test = model.predict(X_test)

    # Step 3: Separate peak and non-peak data points in the training set
    X_train_peak = X_train[peak_predictions_train == 1]
    y_train_peak = y_train[peak_predictions_train == 1]
    X_train_non_peak = X_train[peak_predictions_train == 0]
    y_train_non_peak = y_train[peak_predictions_train == 0]

    # Step 4: Separate peak and non-peak data points in the test set
    X_test_peak = X_test[peak_predictions_test == 1]
    y_test_peak = y_test[peak_predictions_test == 1]
    X_test_non_peak = X_test[peak_predictions_test == 0]
    y_test_non_peak = y_test[peak_predictions_test == 0]

    return X_train_peak, y_train_peak, X_train_non_peak, y_train_non_peak, X_test_peak, y_test_peak, X_test_non_peak, y_test_non_peak


In [7]:
def split_lstm(X,y, model):
    train_size = int(len(X) * 0.2)

    scaler_X = MinMaxScaler(feature_range=(0, 1))
    scaler_y = MinMaxScaler(feature_range=(0, 1))
    scaler_X.fit(X)
    scaler_y.fit(y)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    peak_predictions_train = model.predict(X_train)
    peak_predictions_test = model.predict(X_test)

    X_train_peak = X_train[peak_predictions_train == 1]
    y_train_peak = y_train[peak_predictions_train == 1]
    X_train_non_peak = X_train[peak_predictions_train == 0]
    y_train_non_peak = y_train[peak_predictions_train == 0]
    X_train_peak = scaler_X.transform(X_train_peak)
    X_train_non_peak = scaler_X.transform(X_train_non_peak)
    y_train_peak = scaler_y.transform(y_train_peak)
    y_train_non_peak = scaler_y.transform(y_train_non_peak)

    X_test_peak = X_test[peak_predictions_test == 1]
    y_test_peak = y_test[peak_predictions_test == 1]
    X_test_non_peak = X_test[peak_predictions_test == 0]
    y_test_non_peak = y_test[peak_predictions_test == 0]
    X_test_peak = scaler_X.transform(X_test_peak)
    y_test_peak = scaler_y.transform(y_test_peak)
    X_test_non_peak = scaler_X.transform(X_test_non_peak)
    y_test_non_peak = scaler_y.transform(y_test_non_peak)

    return X_train_peak, y_train_peak, X_train_non_peak, y_train_non_peak, X_test_peak, y_test_peak, X_test_non_peak, y_test_non_peak, scaler_X, scaler_y


In [8]:
def rf_peaks_model(X_train_peakrf, y_train_peakrf, X_test_peakrf, y_test_peakrf):
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train_peakrf, y_train_peakrf)
    train_pred = rf_model.predict(X_train_peakrf)
    test_pred = rf_model.predict(X_test_peakrf)
    #print(y_pred)
    y_train = y_train_peakrf.squeeze()
    y_test = y_test_peakrf.squeeze()

    train_df = pd.DataFrame({
            'Train_y': y_train,
            'Train_pred_y': train_pred })
    filename = os.path.join(output_folder, f'p_trains_rf_xgb_{os.path.basename(file_path[i])}.csv')
    train_df.to_csv(filename, index=False)

    test_df = pd.DataFrame({
            'Test_y': y_test,
            'Test_pred_y': test_pred })
    filename = os.path.join(output_folder, f'p_tests_rf_xgb_{os.path.basename(file_path[i])}.csv')
    test_df.to_csv(filename, index=False)

    train_mse = mean_squared_error(y_train, train_pred)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train, train_pred)
    test_mse = mean_squared_error(y_test, test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, test_pred)

    return train_r2, train_rmse, test_r2, test_rmse

In [9]:
def rf_non_peaks_model(X_train_non_peakrf, y_train_non_peakrf, X_test_non_peakrf, y_test_non_peakrf):
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train_non_peakrf, y_train_non_peakrf)
    train_pred = rf_model.predict(X_train_non_peakrf)
    test_pred = rf_model.predict(X_test_non_peakrf)
    #print(y_pred)
    y_train = y_train_non_peakrf.squeeze()
    y_test = y_test_non_peakrf.squeeze()

    train_df = pd.DataFrame({
            'Train_y': y_train,
            'Train_pred_y': train_pred })
    filename = os.path.join(output_folder, f'np_trains_rf_xgb_{os.path.basename(file_path[i])}.csv')
    train_df.to_csv(filename, index=False)

    test_df = pd.DataFrame({
            'Test_y': y_test,
            'Test_pred_y': test_pred })
    filename = os.path.join(output_folder, f'np_tests_rf_xgb_{os.path.basename(file_path[i])}.csv')
    test_df.to_csv(filename, index=False)

    train_mse = mean_squared_error(y_train, train_pred)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train, train_pred)
    test_mse = mean_squared_error(y_test, test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, test_pred)

    return train_r2, train_rmse, test_r2, test_rmse

In [34]:
i=7
df = pd.read_csv(file_path[i])
df = df.abs()
X = df[not_non_features]
y = df[[target]]
rf_model = joblib.load(os.path.join(base_folder, f'RF_Data{i+1}_Classes.csv.pkl'))
#xgb_model = joblib.load(os.path.join(base_folder, f'XGB_Data{i+1}_Classes.csv.pkl'))


#X_train_peakrf, y_train_peakrf, X_train_non_peakrf, y_train_non_peakrf, X_test_peakrf, y_test_peakrf, X_test_non_peakrf, y_test_non_peakrf = split_rf(X, y, xgb_model)
X_train_peak, y_train_peak, X_train_non_peak, y_train_non_peak, X_test_peak, y_test_peak, X_test_non_peak, y_test_non_peak, scaler_X, scaler_y = split_lstm(X, y, rf_model)



ValueError: Found array with 0 sample(s) (shape=(0, 52)) while a minimum of 1 is required by MinMaxScaler.

In [11]:
def create_dataset(X, y, time_step=60):
    if len(X) <= time_step:  
        time_step = len(X) - 1
    Xs, ys = [], []
    for i in range(len(X) - time_step):
        Xs.append(X[i:(i + time_step)])
        ys.append(y[i + time_step])
    return np.array(Xs), np.array(ys)

In [12]:
def build_model(input_shape):
        model = Sequential()
        model.add(LSTM(64, return_sequences=True, input_shape=input_shape))
        model.add(LSTM(32, return_sequences=False))
        model.add(Dense(25))
        model.add(Dense(1))
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
        return model

In [35]:
time_step = 60

X_train, y_train = create_dataset(X_train_peak, y_train_peak, time_step)
X_test, y_test = create_dataset(X_test_peak, y_test_peak, time_step)

input_shape = (X_train.shape[1], X_train.shape[2])
model = build_model(input_shape)  

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Make Predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
# Inverse transform predictions
train_predict_inv = scaler_y.inverse_transform(train_predict)
y_train_inv = scaler_y.inverse_transform(y_train)
test_predict_inv = scaler_y.inverse_transform(test_predict)
y_test_inv = scaler_y.inverse_transform(y_test)

train_df = pd.DataFrame({
        'Train_y': y_train_inv.flatten(),
        'Train_pred_y': train_predict_inv.flatten()
    })

filename = os.path.join(output_folder, f'p_trains_lstm_rf_{os.path.basename(file_path[i])}.csv')
train_df.to_csv(filename, index=False)
test_df = pd.DataFrame({
        'Test_y': y_test_inv.flatten(),
        'Test_pred_y': test_predict_inv.flatten()
    })

filename = os.path.join(output_folder, f'p_tests_lstm_rf_{os.path.basename(file_path[i])}.csv')
test_df.to_csv(filename, index=False)

# Calculate R2 and RMSE
train_r2 = r2_score(y_train_inv, train_predict_inv)
test_r2 = r2_score(y_test_inv, test_predict_inv)
train_rmse = np.sqrt(mean_squared_error(y_train_inv, train_predict_inv))
test_rmse = np.sqrt(mean_squared_error(y_test_inv, test_predict_inv))

# Append the results to the dataframe
dum_df = pd.DataFrame([{
    'Dataset': f'Peak_LSTM_RF_{os.path.basename(file_path[i])}',
    'Train_R2': train_r2,
    'Train_RMSE': train_rmse,
    'Test_R2': test_r2,
    'Test_RMSE': test_rmse
}])
peak_result_df = pd.concat([peak_result_df,dum_df], ignore_index=True)
peak_result_df.to_csv(filenamefinal, index=False)

  super().__init__(**kwargs)


Epoch 1/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 41ms/step - loss: 9.9094e-04 - val_loss: 2.5167e-04
Epoch 2/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 41ms/step - loss: 2.3459e-05 - val_loss: 1.5375e-04
Epoch 3/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 39ms/step - loss: 4.0496e-05 - val_loss: 2.8619e-04
Epoch 4/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 40ms/step - loss: 3.6043e-05 - val_loss: 2.2820e-04
Epoch 5/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 40ms/step - loss: 2.1642e-05 - val_loss: 2.6411e-04
Epoch 6/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 40ms/step - loss: 1.9112e-05 - val_loss: 1.6459e-04
Epoch 7/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 41ms/step - loss: 1.7435e-05 - val_loss: 1.0665e-04
Epoch 8/50
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 40ms/step - 

In [36]:
time_step = 60

X_train, y_train = create_dataset(X_train_non_peak, y_train_non_peak, time_step)
X_test, y_test = create_dataset(X_test_non_peak, y_test_non_peak, time_step)

input_shape = (X_train.shape[1], X_train.shape[2])
model = build_model(input_shape)

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Make Predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
# Inverse transform predictions
train_predict_inv = scaler_y.inverse_transform(train_predict)
y_train_inv = scaler_y.inverse_transform(y_train)
test_predict_inv = scaler_y.inverse_transform(test_predict)
y_test_inv = scaler_y.inverse_transform(y_test)

train_df = pd.DataFrame({
        'Train_y': y_train_inv.flatten(),
        'Train_pred_y': train_predict_inv.flatten()
    })

filename = os.path.join(output_folder, f'np_trains_lstm_rf_{os.path.basename(file_path[i])}.csv')
train_df.to_csv(filename, index=False)
test_df = pd.DataFrame({
        'Test_y': y_test_inv.flatten(),
        'Test_pred_y': test_predict_inv.flatten()
    })

filename = os.path.join(output_folder, f'np_tests_lstm_rf_{os.path.basename(file_path[i])}.csv')
test_df.to_csv(filename, index=False)

# Calculate R2 and RMSE
train_r2 = r2_score(y_train_inv, train_predict_inv)
test_r2 = r2_score(y_test_inv, test_predict_inv)
train_rmse = np.sqrt(mean_squared_error(y_train_inv, train_predict_inv))
test_rmse = np.sqrt(mean_squared_error(y_test_inv, test_predict_inv))

# Append the results to the dataframe
dum_df = pd.DataFrame([{
    'Dataset': f'NonPeak_LSTM_RF_{os.path.basename(file_path[i])}',
    'Train_R2': train_r2,
    'Train_RMSE': train_rmse,
    'Test_R2': test_r2,
    'Test_RMSE': test_rmse
}])
peak_result_df = pd.concat([peak_result_df,dum_df], ignore_index=True)
peak_result_df.to_csv(filenamefinal, index=False)

  super().__init__(**kwargs)


Epoch 1/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 64ms/step - loss: 6.2776e-04 - val_loss: 5.0207e-04
Epoch 2/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 62ms/step - loss: 1.5819e-05 - val_loss: 4.0261e-04
Epoch 3/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 62ms/step - loss: 1.6004e-05 - val_loss: 4.0604e-04
Epoch 4/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 62ms/step - loss: 1.2911e-05 - val_loss: 3.8185e-04
Epoch 5/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 61ms/step - loss: 1.0667e-05 - val_loss: 3.6860e-04
Epoch 6/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 62ms/step - loss: 9.2329e-06 - val_loss: 4.0935e-04
Epoch 7/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 62ms/step - loss: 6.5969e-06 - val_loss: 3.8449e-04
Epoch 8/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s