In [22]:
import pickle
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

In [3]:
df_balanced = pickle.load(open("data_snapshot/features.pkl","rb"))

In [4]:
keep_features =  pickle.load(open("data_snapshot/features_selected.pkl","rb"))

In [5]:
groups = df_balanced['participantId']
X = df_balanced.drop(['activity','participantId'], axis = 1)
y = df_balanced['activity']

In [6]:
cv = GroupKFold(n_splits=5)

In [7]:
X_manual_features = X[keep_features]

pipeline_manual = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=0))
])

In [8]:
scores_manual = cross_val_score(pipeline_manual, X_manual_features, y, cv=cv, groups=groups, scoring='f1_macro')

In [9]:
pipeline_pca = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('clf', RandomForestClassifier(random_state=0))
])

In [10]:
scores_pca = cross_val_score(pipeline_pca, X, y, cv=cv, groups=groups, scoring='f1_macro')

In [11]:
from sklearn.feature_selection import RFECV

In [15]:
estimator = RandomForestClassifier(random_state=0)
cv_groups = GroupKFold(n_splits=5)

rfecv = RFECV(
    estimator=estimator,
    step=1,
    cv=cv_groups,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2
)

In [16]:
# Erstes Fit: Feature Selection
X_reduced = rfecv.fit_transform(X, y, groups=groups)

Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 fe

In [None]:
# Store reduced data
pickle.dump(df_balanced, open("data_snapshot/x_reduced.pkl","wb"))

In [18]:
# Dann zweite Pipeline: nur mit den ausgewählten Features
pipeline_rfecv = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=0))
])

# Cross-Validation
scores_rfecv = cross_val_score(
    pipeline_rfecv,
    X_reduced,
    y,
    cv=cv_groups,
    groups=groups,
    scoring='f1_macro',
    n_jobs=-1)

In [19]:
print("F1 Macro Scores (mean ± std):")
print(f"Manuelle Auswahl:  {np.mean(scores_manual):.3f} ± {np.std(scores_manual):.3f}")
print(f"PCA:              {np.mean(scores_pca):.3f} ± {np.std(scores_pca):.3f}")
print(f"Auto-Selection:   {np.mean(scores_rfecv):.3f} ± {np.std(scores_rfecv):.3f}")


F1 Macro Scores (mean ± std):
Manuelle Auswahl:  0.443 ± 0.096
PCA:              0.482 ± 0.115
Auto-Selection:   0.595 ± 0.185


=> Go with Auto-Selection Feature Set

In [28]:
final_features = X.columns[rfecv.get_support()]

In [36]:
X_feature = pd.DataFrame(X_reduced, columns=final_features)

In [None]:
# s. ChatGPT Manual vs automatic Featrue Selection vs PCA

In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import optuna
import seaborn as sns
import matplotlib.pyplot as plt

In [49]:
y_feature = y['activity']

In [51]:
logo = LeaveOneGroupOut()

models = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "MLP": MLPClassifier(max_iter=1000)
}

results = {}

for name, model in models.items():
    y_true_all = []
    y_pred_all = []

    for train_idx, test_idx in logo.split(X_feature, y_feature, groups=groups):
        X_train, X_test = X_feature.iloc[train_idx], X_feature.iloc[test_idx]
        y_train, y_test = y_feature[train_idx], y_feature[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        y_true_all.extend(y_test)
        y_pred_all.extend(y_pred)

    print(f" {name} – LOSO Classification Report:")
    print(classification_report(y_true_all, y_pred_all))
    results[name] = (y_true_all, y_pred_all)


 Random Forest – LOSO Classification Report:
              precision    recall  f1-score   support

     sitting       0.64      0.93      0.76      7610
    standing       0.77      0.69      0.73      7610
     walking       0.80      0.53      0.64      7610

    accuracy                           0.72     22830
   macro avg       0.74      0.72      0.71     22830
weighted avg       0.74      0.72      0.71     22830

 Logistic Regression – LOSO Classification Report:
              precision    recall  f1-score   support

     sitting       0.64      0.84      0.73      7610
    standing       0.47      0.48      0.47      7610
     walking       0.71      0.47      0.57      7610

    accuracy                           0.60     22830
   macro avg       0.61      0.60      0.59     22830
weighted avg       0.61      0.60      0.59     22830

 KNN – LOSO Classification Report:
              precision    recall  f1-score   support

     sitting       0.61      0.87      0.72      761

In [None]:
# Code oben noch verstehen -> Fragen können ob Vorgehensweise so passt

In [None]:
# Sich mit Bayesion Optimizer und Optuna beschäftigen und schonmal vorbereiten

In [None]:
# Fragen wie gut finales Modell dann wirklich sein muss