In [21]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import tsfel
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Window definition used by the loading cells: each sample is the row slice
# `offset : offset + time*50` of one recording file.
time, offset = 10, 100

# Activity name -> integer label. The tuple order fixes the label values 1..6.
classes = {
    activity: label
    for label, activity in enumerate(
        (
            "WALKING",
            "WALKING_UPSTAIRS",
            "WALKING_DOWNSTAIRS",
            "SITTING",
            "STANDING",
            "LAYING",
        ),
        start=1,
    )
}

# One sub-folder per activity, in alphabetical order.
folders = sorted(classes)

# Root directory containing the "Train" and "Test" sub-folders.
combined_dir = 'D:\\Code\\ML\\fork-it\\Combined'

In [11]:
# Build the raw training set: one fixed-length window per recording file,
# labelled with the integer class of the folder it was found in.
X_train = []
y_train = []
dataset_dir = os.path.join(combined_dir, "Train")
n_rows = time * 50  # number of rows kept from each recording

for folder in folders:
    folder_dir = os.path.join(dataset_dir, folder)
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any later seeded split of it) reproducible.
    files = sorted(os.listdir(folder_dir))

    for file in files:
        file_path = os.path.join(folder_dir, file)
        df = pd.read_csv(file_path, sep=",", header=0)
        # Positional row slice: skip the first `offset` rows, keep `n_rows`.
        df = df.iloc[offset:offset + n_rows]
        if len(df) != n_rows:
            # A short recording would make the stacked array ragged; fail here
            # with the offending file instead of later with an opaque error.
            raise ValueError(
                f"{file_path}: expected {n_rows} rows after offset {offset}, got {len(df)}"
            )
        X_train.append(df.values)
        y_train.append(classes[folder])

# Stack into arrays: X_train is (n_samples, n_rows, n_columns), y_train is (n_samples,).
X_train = np.array(X_train)
y_train = np.array(y_train)

In [12]:
# Build the raw test set: one fixed-length window per recording file,
# labelled with the integer class of the folder it was found in.
X_test = []
y_test = []
dataset_dir = os.path.join(combined_dir, "Test")
n_rows = time * 50  # number of rows kept from each recording

for folder in folders:
    folder_dir = os.path.join(dataset_dir, folder)
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any later seeded split of it) reproducible.
    files = sorted(os.listdir(folder_dir))

    for file in files:
        file_path = os.path.join(folder_dir, file)
        df = pd.read_csv(file_path, sep=",", header=0)
        # Positional row slice: skip the first `offset` rows, keep `n_rows`.
        df = df.iloc[offset:offset + n_rows]
        if len(df) != n_rows:
            # A short recording would make the stacked array ragged; fail here
            # with the offending file instead of later with an opaque error.
            raise ValueError(
                f"{file_path}: expected {n_rows} rows after offset {offset}, got {len(df)}"
            )
        X_test.append(df.values)
        y_test.append(classes[folder])

# Stack into arrays: X_test is (n_samples, n_rows, n_columns), y_test is (n_samples,).
X_test = np.array(X_test)
y_test = np.array(y_test)

In [13]:
# Pool the folder-based Train and Test samples, then draw a new stratified
# split with 30% of the pooled samples held out for testing.
X = np.concatenate([X_train, X_test], axis=0)
y = np.concatenate([y_train, y_test], axis=0)

# Change the seed value to obtain a different random split.
# NOTE: this cell rebinds X_train/X_test/y_train/y_test, which are also its
# inputs, so re-running it alone pools the previous split's outputs rather than
# the originally loaded arrays. Re-run the loading cells first.
seed = 4
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
    stratify=y,
)

In [14]:
# Extract TSFEL features for both splits with one shared configuration, so the
# train and test feature tables are produced by the same extractor settings.
cfg_file = tsfel.get_features_by_domain()
tsfel_train_features, tsfel_test_features = (
    tsfel.time_series_features_extractor(cfg_file, windows, fs=50, header_names=None)
    for windows in (X_train, X_test)  # evaluated in this order: train, then test
)

# Report the (rows, columns) shape of each feature table, one per line.
print(tsfel_train_features.shape, tsfel_test_features.shape, sep="\n")

*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
(126, 1152)
(54, 1152)


In [16]:
# Paths to the training and test CSV files
train_folder = 'D:\\Code\\ML\\fork-it\\Task3\\Dataset_561.csv'
test_folder = 'D:\\Code\\ML\\fork-it\\Task3\\TestDataset_561.csv'

# Load both CSV files, training file first
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_folder, test_folder))

# Separate the feature columns: every column except the last two
dataset_train_features, dataset_test_features = (
    frame.iloc[:, :-2] for frame in (df_train, df_test)
)


In [17]:
# Place the TSFEL features and the dataset's own features side by side,
# once for the training data and once for the test data.
# NOTE(review): pd.concat with axis="columns" aligns rows on the index. The TSFEL
# rows come from a shuffled train/test split, so confirm that row i of each CSV
# describes the same sample as row i of the TSFEL table (and that the row counts
# match); otherwise the result pairs unrelated samples or is padded with NaN.
train_features_df = pd.concat(
    [tsfel_train_features, dataset_train_features],
    axis="columns",
)
test_features_df = pd.concat(
    [tsfel_test_features, dataset_test_features],
    axis="columns",
)

In [18]:
# Pairwise Pearson correlation between every pair of training feature columns
# (Pearson is the DataFrame.corr default, spelled out here for the reader).
correlation_matrix = train_features_df.corr(method="pearson")

In [19]:
# Find every pair of features whose absolute correlation exceeds `threshold`,
# then derive the set of features that can be dropped as redundant.
threshold = 0.9

# Work on the raw array: one vectorised comparison instead of a Python-level
# `.iloc` lookup for each of the ~n^2/2 cells of a matrix with >1000 columns.
corr_values = correlation_matrix.to_numpy()
corr_columns = correlation_matrix.columns

# Strict lower triangle (k=-1): each unordered pair appears once and the
# diagonal is excluded. NaN correlations compare False and are skipped,
# exactly as with the scalar `>` test.
with np.errstate(invalid="ignore"):
    pair_mask = np.tril(np.abs(corr_values) > threshold, k=-1)

# np.nonzero walks the mask in row-major order, i.e. the same (i, j) order as
# the nested `for i ... for j in range(i)` loops.
row_idx, col_idx = np.nonzero(pair_mask)
highly_correlated = [
    (corr_columns[i], corr_columns[j], corr_values[i, j])
    for i, j in zip(row_idx, col_idx)
]

# Print highly correlated features
print("Highly correlated features:")
for feature1, feature2, corr_value in highly_correlated:
    print(f"{feature1} and {feature2} with correlation coefficient {corr_value:.2f}")

# Detect redundant features.
# Only the later column of each pair (feature1) is flagged. Flagging both
# members would mean that dropping the "redundant" set removes every feature of
# a correlated group and keeps none of the information they share.
redundant_features = {feature1 for feature1, _, _ in highly_correlated}

print("\nRedundant features:")
print(redundant_features)

Highly correlated features:
0_Area under the curve and 0_Absolute energy with correlation coefficient 0.98
0_Average power and 0_Absolute energy with correlation coefficient 1.00
0_Average power and 0_Area under the curve with correlation coefficient 0.98
0_ECDF Percentile_0 and 0_Area under the curve with correlation coefficient 0.93
0_ECDF Percentile_1 and 0_Absolute energy with correlation coefficient 0.97
0_ECDF Percentile_1 and 0_Area under the curve with correlation coefficient 0.96
0_ECDF Percentile_1 and 0_Average power with correlation coefficient 0.97
0_FFT mean coefficient_2 and 0_FFT mean coefficient_1 with correlation coefficient 0.90
0_FFT mean coefficient_209 and 0_FFT mean coefficient_208 with correlation coefficient 0.93
0_FFT mean coefficient_211 and 0_FFT mean coefficient_208 with correlation coefficient 0.91
0_FFT mean coefficient_211 and 0_FFT mean coefficient_210 with correlation coefficient 0.96
0_FFT mean coefficient_212 and 0_FFT mean coefficient_208 with corre

In [22]:
# Heatmap of the feature correlation matrix.
fig, ax = plt.subplots(figsize=(12, 10))

# Per-cell annotations are only legible (and only cheap to draw) for small
# matrices. With more than a thousand features, annot=True would render over a
# million text labels, so it is enabled only below this size.
max_annotated_features = 30
show_values = correlation_matrix.shape[0] <= max_annotated_features

sns.heatmap(correlation_matrix, annot=show_values, cmap="viridis", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()