In [1]:
import sys
# Make the PropStar project root and its datasets folder importable.
# sys.path is a list of absolute path strings. Raw strings keep the Windows
# backslashes literal: in a normal string '\P' and '\d' are invalid escape
# sequences, which emit a SyntaxWarning on Python 3.12+.
# NOTE(review): these are machine-specific absolute paths; adjust per checkout.
for _project_dir in (
    r'C:\Projects\Private\PropStar',
    r'C:\Projects\Private\PropStar\datasets',
):
    # Skip entries that are already present so re-running the cell does not
    # keep growing sys.path with duplicates.
    if _project_dir not in sys.path:
        sys.path.append(_project_dir)
from gridsearch.EstimatorSelectionHelper import EstimatorSelectionHelper

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import make_scorer, get_scorer, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [3]:
def _forest_grid():
    """Return a fresh copy of the grid shared by the two forest-style estimators."""
    return {
        'classifier__n_estimators': [16, 32],
        'classifier__max_depth': [None],  # None is the only candidate value
        'classifier__min_samples_split': [2],
        'classifier__min_samples_leaf': [1],
    }


# Hyper-parameter grids keyed by estimator name, passed to
# EstimatorSelectionHelper. The 'classifier__' prefix presumably targets a
# pipeline step named 'classifier' inside the helper — verify against its source.
search_space = dict(
    RandomForestClassifier=_forest_grid(),
    ExtraTreesClassifier=_forest_grid(),
    AdaBoostClassifier={
        'classifier__n_estimators': [16, 32],
        'classifier__learning_rate': [0.01],
    },
    GradientBoostingClassifier={
        'classifier__n_estimators': [16, 32],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [5],
    },
)


# Indoor user movement RSS data

In [4]:
# Load the sequence -> dataset-group mapping. The raw header names are
# '#sequence_ID' and ' dataset_ID' (note the '#' and the leading space), so the
# values are copied under clean names and the raw columns are discarded.
_group_raw = pd.read_csv('../datasets/indoor_user_movement_rss_data/groups/MovementAAL_DatasetGroup.csv')
dataset_group = _group_raw.assign(
    sequence_ID=_group_raw['#sequence_ID'],
    dataset_ID=_group_raw[' dataset_ID'],
).drop(columns=['#sequence_ID', ' dataset_ID'])

In [5]:
dataset_group

Unnamed: 0,sequence_ID,dataset_ID
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
309,310,3
310,311,3
311,312,3
312,313,3


In [6]:
# Load the sequence -> path mapping. The raw header names are '#sequence_ID'
# and ' path_ID' (note the '#' and the leading space), so the values are copied
# under clean names and the raw columns are discarded.
_paths_raw = pd.read_csv('../datasets/indoor_user_movement_rss_data/groups/MovementAAL_Paths.csv')
paths = _paths_raw.assign(
    sequence_ID=_paths_raw['#sequence_ID'],
    path_ID=_paths_raw[' path_ID'],
).drop(columns=['#sequence_ID', ' path_ID'])

In [7]:
paths

Unnamed: 0,sequence_ID,path_ID
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
309,310,6
310,311,6
311,312,6
312,313,6


In [8]:
# Load the per-sequence class labels. The raw header names are '#sequence_ID'
# and ' class_label'; the cleaned columns are added label-first, which is the
# column order the rest of the notebook displays.
_target_raw = pd.read_csv('../datasets/indoor_user_movement_rss_data/MovementAAL_target.csv')
target = _target_raw.assign(
    class_label=_target_raw[' class_label'],
    sequence_ID=_target_raw['#sequence_ID'],
).drop(columns=['#sequence_ID', ' class_label'])

In [9]:
target.columns

Index(['class_label', 'sequence_ID'], dtype='object')

In [10]:
target

Unnamed: 0,class_label,sequence_ID
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
309,-1,310
310,-1,311
311,-1,312
312,-1,313


In [11]:
import os 

# Names of the entries in the per-sequence dataset folder, plus an empty frame
# used as the initial value of movements_df.
_movement_dir = '../datasets/indoor_user_movement_rss_data/dataset'
files = os.listdir(_movement_dir)
movements_df = pd.DataFrame()

In [12]:
# Read every per-sequence CSV and stack them into one long table.
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
_sequence_frames = []
for file in sorted(files):  # sorted: os.listdir returns entries in arbitrary order
    if file.startswith('.'):
        # Skip hidden entries (names starting with a dot).
        continue
    # The sequence id is the third "_"-separated token of the file name with
    # the ".csv" suffix removed; it is still a string at this point.
    seq_id = file.split("_")[2].replace(".csv", "")
    file_df = pd.read_csv(f"../datasets/indoor_user_movement_rss_data/dataset/{file}")
    file_df['seq_id'] = seq_id
    _sequence_frames.append(file_df)

# Rebuilding the frame from scratch (instead of appending to the existing one)
# keeps the cell idempotent: re-running it no longer duplicates every row.
# pd.concat raises on an empty list, so fall back to an empty frame.
movements_df = pd.concat(_sequence_frames) if _sequence_frames else pd.DataFrame()

In [13]:
# Replace the row labels inherited from the individual files with a fresh
# 0..n-1 index; the old labels are discarded rather than kept as a column.
movements_df = movements_df.reset_index(drop=True)

In [14]:
# Move the current index into a regular column called 'id' and renumber rows.
movements_df = movements_df.reset_index(names='id')

In [15]:
# Copy '#RSS_anchor1' under a clean name (appended as the last column) and
# remove the raw '#'-prefixed column.
movements_df = (
    movements_df
    .assign(RSS_anchor1=movements_df['#RSS_anchor1'])
    .drop(columns='#RSS_anchor1')
)

In [16]:
# seq_id was parsed out of the file name as text; convert it to a numeric
# dtype so it can be joined against the numeric sequence_ID keys.
movements_df = movements_df.assign(seq_id=pd.to_numeric(movements_df['seq_id']))

In [17]:
len(movements_df['seq_id'].unique())

314

In [18]:
movements_df

Unnamed: 0,id,RSS_anchor2,RSS_anchor3,RSS_anchor4,seq_id,RSS_anchor1
0,0,-0.48,0.28571,0.30,1,-0.90476
1,1,-0.32,0.14286,0.30,1,-0.57143
2,2,-0.28,-0.14286,0.35,1,-0.38095
3,3,-0.20,-0.47619,0.35,1,-0.28571
4,4,-0.20,0.14286,-0.20,1,-0.14286
...,...,...,...,...,...,...
13192,13192,0.28,-0.76190,-0.55,99,0.19048
13193,13193,0.40,-1.00000,-0.75,99,0.19048
13194,13194,0.24,-1.00000,-0.65,99,0.28571
13195,13195,0.20,-0.85714,-0.55,99,0.33333


In [19]:
# Build one flat table: every RSS reading row annotated with the class label,
# dataset group and path of the sequence it belongs to.
_labelled_readings = target.merge(
    movements_df, how='left', left_on=['sequence_ID'], right_on=['seq_id']
)
_with_group = _labelled_readings.merge(dataset_group, how='left', on=['sequence_ID'])
_with_path = _with_group.merge(paths, how='left', on=['sequence_ID'])

# Strip surrounding whitespace from the column names, then remove the join
# keys and the running row id, which are not used after this point.
denormalized_table = (
    _with_path
    .rename(columns=str.strip)
    .drop(['seq_id', 'sequence_ID', 'id'], axis=1)
)

In [20]:
denormalized_table

Unnamed: 0,class_label,RSS_anchor2,RSS_anchor3,RSS_anchor4,RSS_anchor1,dataset_ID,path_ID
0,1,-0.48000,0.28571,0.30000,-0.90476,1,1
1,1,-0.32000,0.14286,0.30000,-0.57143,1,1
2,1,-0.28000,-0.14286,0.35000,-0.38095,1,1
3,1,-0.20000,-0.47619,0.35000,-0.28571,1,1
4,1,-0.20000,0.14286,-0.20000,-0.14286,1,1
...,...,...,...,...,...,...,...
13192,-1,-0.60000,0.73333,1.00000,-0.50000,3,6
13193,-1,-0.46667,0.73333,1.00000,-0.50000,3,6
13194,-1,-0.42222,0.73333,0.36170,-0.50000,3,6
13195,-1,-0.33333,0.55556,-0.31915,-0.40909,3,6


In [21]:
# Feature columns fed to the models: the four RSS anchor readings.
numerical_cols = ['RSS_anchor1', 'RSS_anchor2', 'RSS_anchor3', 'RSS_anchor4']
# categorical_cols = ['dataset_ID', 'path_ID']

# Numerical pipeline: standardise each RSS column.
numerical_pipeline = Pipeline([
    # ('imputer', SimpleImputer(strategy='mean')),  # Example: handle missing values
    ('scaler', StandardScaler())  # Scale features
])

# Categorical pipeline: one-hot encoding. Defined here but not wired into the
# preprocessor below.
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder())  # Convert categorical data
])

# Only the numerical branch is registered.
# NOTE(review): with ColumnTransformer's default remainder='drop', dataset_ID
# and path_ID never reach the classifier — confirm that is intended.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
])

# Metrics reported by the grid search.
# ROC AUC must be computed from continuous scores. make_scorer(roc_auc_score)
# with default arguments scores the hard labels returned by predict(), which
# collapses the ROC curve to a single operating point. The built-in 'roc_auc'
# scorer uses decision_function / predict_proba instead. The dictionary key is
# unchanged so result column names stay the same.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'roc_auc_score': get_scorer('roc_auc'),
}

In [22]:
# Split the flat table into the label vector and the feature matrix
# (every column except the label).
y = denormalized_table['class_label']
X = denormalized_table.drop(columns='class_label')

In [23]:
# Run the grid search for every estimator listed in search_space; the recorded
# output shows one GridSearchCV run per estimator, each with 10 folds.
# NOTE(review): EstimatorSelectionHelper is project code not shown here — how it
# uses fe_pipeline and scoring should be confirmed against its source.
es = EstimatorSelectionHelper(search_space=search_space)
es.fit(X, y, fe_pipeline=preprocessor, scoring=scoring)

Running GridSearchCV for RandomForestClassifier.
Fitting 10 folds for each of 2 candidates, totalling 20 fits
Running GridSearchCV for ExtraTreesClassifier.
Fitting 10 folds for each of 2 candidates, totalling 20 fits
Running GridSearchCV for AdaBoostClassifier.
Fitting 10 folds for each of 2 candidates, totalling 20 fits
Running GridSearchCV for GradientBoostingClassifier.
Fitting 10 folds for each of 4 candidates, totalling 40 fits


In [24]:
# NOTE(review): in the recorded run this call ended with
# "IndexError: list index out of range". The cause lies inside
# EstimatorSelectionHelper.summary, which is not shown here — TODO investigate.
# Possibly related to the multi-metric `scoring` dict passed to fit(); verify.
es.summary()

IndexError: list index out of range