In [1]:
import sys
# Make the local project directory importable so the `package.*` imports below resolve.
# A raw string keeps the backslashes literal; "\c" in an ordinary string literal is an
# invalid escape sequence and triggers a SyntaxWarning on recent Python versions.
sys.path.insert(1, r'D:\thesis-main\codes')
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
%matplotlib inline
import time
from package.loadDb import *
from package.featureSelection_TENSORIZED import *
from package.featureSelection_TEMPORAL_PAGERANK import *
from package.featureSelection_SICE import *
from package.trmf import *
from package.trmf_regressor import *
from package.regressors import *

import concurrent.futures

In [2]:
# Load the Kaggle dataset.
# Raw strings keep the Windows path separators literal: "\d" and "\k" are invalid
# escape sequences in ordinary string literals (SyntaxWarning on recent Python).
# The resulting path text is unchanged.
address = r"..\database\kaggleDataFrame"

data_with_target = load_kaggle_dataframe(address + r"\dataframes.csv")

# Optional: restrict the run to a sub-range of rows while experimenting.
# data_with_target=data_with_target.iloc[2000:4000]

# Feature frame: every column except 'DEMAND'.
data_without_target = data_with_target.drop(['DEMAND'], axis=1)

# Target series: the last column of the frame
# (presumably 'DEMAND' — TODO confirm it is always the last column).
target = data_with_target.iloc[:, -1]
print("data with target size = ", data_with_target.shape)


# Alternative input (stock market dataset with a 'TARGET' column), kept for reference:
# address = r"..\database\stock market dataset"
# data_with_target=load_stock_market_dataframe(address+r"\sample_normalized_with_target.csv")
# # data_with_target=data_with_target.iloc[:10000]
# data_without_target = data_with_target.drop(['TARGET'],axis=1)
# # data_without_target=data_with_target.iloc[:,:-1]
# target=data_with_target.iloc[:,-1]

data with target size =  (46968, 11)


In [3]:
# Segment layout: the data is processed in consecutive windows of `interval`
# rows, and `testSize` is handed to the regression helper for each window.
interval = 2_000
testSize = 500
# PCA energy threshold used to decide how many components to keep.
tol = 0.89

In [4]:
# Result stores, keyed by method name:
#   dict_rmse    -> test-set RMSE of each segment
#   dict_runtime -> runtime of each segment
dict_rmse, dict_runtime = {}, {}

## PCA

In [5]:

counter = 0
# One placeholder slot per full segment of `interval` rows; each slot is later
# filled in by segment index. Three independent lists are created.
pca_list, num_comp_list, runtime = (
    [None] * (data_without_target.shape[0] // interval) for _ in range(3)
)

def process_interval(i, interval, data_with_target, tol, testSize):
    """Run PCA on one segment of the data and score the reduced features.

    Parameters
    ----------
    i : int
        Zero-based segment index. Rows ``i * interval`` up to (not including)
        ``(i + 1) * interval`` are processed.
    interval : int
        Number of rows per segment.
    data_with_target : pandas.DataFrame
        Frame whose last column is used as the target and whose remaining
        columns are used as features.
    tol : float
        Threshold forwarded to ``energy`` to choose the number of components.
    testSize : int
        Forwarded unchanged to ``xgboost_reg_error``.

    Returns
    -------
    tuple
        ``(i, pca_rmse, num_comp, elapsed_time)``: the segment index, the value
        returned by ``xgboost_reg_error``, the number of components kept, and
        the seconds spent inside ``compute_pca`` only.
    """
    start_index = i * interval
    end_index = start_index + interval

    # Explicit positional slicing, consistent with the `.iloc` usage elsewhere
    # in the notebook.
    datacut = data_with_target.iloc[start_index:end_index]
    X_datacut = datacut.iloc[:, :-1]
    Y_datacut = datacut.iloc[:, -1]

    # perf_counter is the clock intended for measuring elapsed durations;
    # only the PCA computation itself is timed.
    st = time.perf_counter()
    X_transformed, eigenvalues = compute_pca(X_datacut, num_comp=None)
    et = time.perf_counter()
    elapsed_time = et - st

    # Keep only the leading components selected by the energy criterion.
    num_comp = energy(eigenvalues, tol)
    X_pca = X_transformed[:, :num_comp]

    pca_rmse = xgboost_reg_error(X_pca, Y_datacut, testSize)

    return i, pca_rmse, num_comp, elapsed_time

# Fan the segments out over a thread pool. Every task reports its own segment
# index, so results can be stored in the right slot regardless of the order
# in which tasks finish.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for i in range(data_without_target.shape[0] // interval):
        futures.append(
            executor.submit(process_interval, i, interval, data_with_target, tol, testSize)
        )

    for future in concurrent.futures.as_completed(futures):
        i, pca_rmse, num_comp, elapsed_time = future.result()
        pca_list[i], num_comp_list[i], runtime[i] = pca_rmse, num_comp, elapsed_time

# Publish the per-segment PCA results under their method keys.
dict_rmse.update({'PCA': pca_list})
dict_runtime.update({'PCA_runtime': runtime})


In [6]:
# Display the number of PCA components kept for each segment (one entry per segment).
num_comp_list

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1]

## TRMF

In [7]:
# TRMF hyper-parameters, passed to `trmf(...)` for every segment.
# The rank K is not fixed here: it changes per segment and is taken from
# `num_comp_list`, i.e. it is based on the PCA energy criterion.
lags = [1, 30]
lambda_f = 1.0
lambda_x = 1
lambda_w = 1.0
alpha = 1000.0
eta = 1.0
# Forwarded to `model.fit` as `max_iter`.
max_iteration = 10_000

In [8]:
# Sequential TRMF run: factorise each segment with the rank chosen by the PCA
# step, then score the resulting latent features.
trmf_rmse_list = []
runtime = []

# Loop-invariant: convert and transpose the feature frame once instead of once
# per segment. After the transpose, frame rows lie along axis 1.
all_features = data_without_target.to_numpy().T

for j in range(data_without_target.shape[0] // interval):
    start_index = j * interval
    end_index = start_index + interval

    # Rank for this segment, as selected by the PCA energy criterion.
    K = num_comp_list[j]

    data = all_features[:, start_index:end_index]
    target_cut = target.iloc[start_index:end_index]

    T_train = data.shape[1]

    # Timed region: normalisation plus model fitting (scoring is excluded).
    # perf_counter is the clock intended for measuring elapsed durations.
    st = time.perf_counter()
    data_normalized = normalized_data(data, T_train, 0, normalize=True)

    model = trmf(lags, K, lambda_f, lambda_x, lambda_w, alpha, eta)
    model.fit(data_normalized, max_iter=max_iteration)
    et = time.perf_counter()
    elapsed_time = et - st
    runtime.append(elapsed_time)

    # Reduced data: the transposed `X` attribute of the fitted model.
    new_features = model.X.T

    rmse = xgboost_reg_error(new_features, target_cut, testSize)
    trmf_rmse_list.append(rmse)

dict_rmse['trmf_rmse'] = trmf_rmse_list
dict_runtime["TRMF_runtime"] = runtime


In [9]:
# Per-segment TRMF RMSE values on the first line, runtimes on the second.
print(trmf_rmse_list, runtime, sep="\n")

[0.08398931604651906, 0.07104994845827717, 0.05598144593316495, 0.08655605979159094, 0.06949270360666275, 0.06838746598877037, 0.048414076726112365, 0.0768359489814334, 0.058169258840248206, 0.08082596291628372, 0.05178493502720175, 0.05080749884785462, 0.0916186561910511, 0.08384885004070677, 0.04826371840655326, 0.04239961710529657, 0.051665202222453806, 0.09549500540159456, 0.07552142192939675, 0.055731031113576066, 0.07403159906041028, 0.08239272455792414, 0.08701010686181658]
[4.856009244918823, 4.606677770614624, 4.599695682525635, 4.857006549835205, 4.609670400619507, 4.709402322769165, 4.897897481918335, 4.628618240356445, 4.596704006195068, 4.602687835693359, 6.434788227081299, 4.549828290939331, 4.557809352874756, 4.535866975784302, 4.558806657791138, 4.5308802127838135, 4.5647900104522705, 4.5438456535339355, 4.865984201431274, 4.704416275024414, 4.543845891952515, 5.755604267120361, 4.540853261947632]


In [10]:

# Fresh, independent accumulators for the threaded TRMF run.
trmf_rmse_list, runtime = [], []


def process_chunk(start_index, end_index, K, i):
    """Fit TRMF on one row range of the feature frame and score the factors.

    Reads the notebook-level names ``data_without_target``, ``target``,
    ``lags``, ``lambda_f``, ``lambda_x``, ``lambda_w``, ``alpha``, ``eta``,
    ``max_iteration`` and ``testSize``.

    Parameters
    ----------
    start_index : int
        First row (inclusive) of the segment.
    end_index : int
        Last row (exclusive) of the segment.
    K : int
        Rank passed to ``trmf`` for this segment.
    i : int
        Segment identifier, returned unchanged so the caller can order results.

    Returns
    -------
    tuple
        ``(i, rmse, elapsed_time)``: the identifier, the value returned by
        ``xgboost_reg_error``, and the seconds spent on normalisation plus
        model fitting (scoring is not timed).
    """
    # Slice the rows first and convert only that slice, rather than converting
    # the whole frame to NumPy on every call. After the transpose, the
    # segment's rows lie along axis 1.
    data = data_without_target.iloc[start_index:end_index].to_numpy().T
    target_cut = target.iloc[start_index:end_index]

    T_train = data.shape[1]

    # perf_counter is the clock intended for measuring elapsed durations.
    st = time.perf_counter()
    data_normalized = normalized_data(data, T_train, 0, normalize=True)

    model = trmf(lags, K, lambda_f, lambda_x, lambda_w, alpha, eta)
    model.fit(data_normalized, max_iter=max_iteration)
    et = time.perf_counter()
    elapsed_time = et - st

    # Reduced data: the transposed `X` attribute of the fitted model.
    new_features = model.X.T
    rmse = xgboost_reg_error(new_features, target_cut, testSize)

    return (i, rmse, elapsed_time)


# `dict_rmse` / `dict_runtime` are intentionally NOT re-created here: resetting
# them would discard the 'PCA' / 'PCA_runtime' entries stored by the PCA cell.

# NOTE(review): each task measures its own elapsed time while the other tasks
# run concurrently in the same pool, so the per-segment runtimes recorded here
# include contention and are not comparable with the sequential run (the saved
# outputs show roughly 37-48 s per segment here versus about 4.5-6.4 s
# sequentially).
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for j in range(data_without_target.shape[0] // interval):
        start_index = j * interval
        end_index = start_index + interval
        K = num_comp_list[j]  # rank chosen for segment j by the PCA step
        futures.append(executor.submit(process_chunk, start_index, end_index, K, j))

    # Reading the futures in submission order yields the results already
    # ordered by segment, so no completion-order collection and re-sort is needed.
    results = [future.result() for future in futures]

    for i, rmse, elapsed_time in results:
        trmf_rmse_list.append(rmse)
        runtime.append(elapsed_time)

dict_rmse['trmf_rmse'] = trmf_rmse_list
dict_runtime['TRMF_runtime'] = runtime


In [11]:
# Per-segment TRMF RMSE values on the first line, runtimes on the second.
print(trmf_rmse_list, runtime, sep="\n")

[0.08398931604651906, 0.07104994845827717, 0.05598144593316495, 0.08655605979159094, 0.06900955218685202, 0.06838746598877037, 0.048414076726112365, 0.0767307535529995, 0.05814300471415974, 0.08082596291628372, 0.05178493502720175, 0.05080749884785462, 0.09181315411854704, 0.08391745553114037, 0.047969086914259565, 0.04224721226689247, 0.051665202222453806, 0.09502077900165698, 0.07585843136132302, 0.055731031113576066, 0.07403159906041028, 0.08068090659596167, 0.08700964517251003]
[47.17481350898743, 47.74229454994202, 46.78685021400452, 47.13491916656494, 47.23465275764465, 47.3214213848114, 47.67647099494934, 47.822080850601196, 44.36732316017151, 44.305487394332886, 47.29150056838989, 44.915854930877686, 45.160202503204346, 45.11632061004639, 45.369641065597534, 44.77224063873291, 37.875685930252075, 37.38599753379822, 37.537591218948364, 37.20547962188721, 36.97509551048279, 40.73304343223572, 36.89032316207886]
