In [None]:
# Import data: read the processed workbook into a DataFrame.
# pandas is imported in this cell because the notebook's main import cell
# appears further down; without it a fresh-kernel "Run All" stops here with
# NameError: name 'pd' is not defined.
import pandas as pd

ds0 = pd.read_excel("cow data processed.xlsx")

In [None]:
def replace(data):
    """Collapse detailed behaviour labels into main behaviours and filter rows.

    The "Classification" column is normalised with a fixed mapping (for
    example "lying ruminating" -> "lying"), then only rows whose normalised
    label is one of "lying", "standing", "walking" or "eating" are kept.

    The input frame is NOT modified; a new DataFrame is returned, so the cell
    calling this function can be re-run safely.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Classification" column.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``data`` with normalised "Classification" values.
        The original index labels of the kept rows are preserved.
    """
    # NOTE(review): there is no entry for "walking other" although the lying
    # and standing groups both have an "other" variant -- such rows would be
    # dropped by the filter below. Confirm this is intended.
    behavior_mapping = {
        "lying ruminating": "lying",
        "lying other": "lying",
        "standing ruminating": "standing",
        "standing other": "standing",
        "walking ruminating": "walking",
    }
    main_behaviors = ["lying", "standing", "walking", "eating"]

    # Normalise on a separate Series instead of writing back into `data`,
    # which would silently change the caller's DataFrame.
    normalized = data["Classification"].replace(behavior_mapping)
    keep = normalized.isin(main_behaviors)

    # assign() produces a new frame; copy() detaches the filtered result so
    # later column assignments on it do not raise SettingWithCopyWarning.
    return data.assign(Classification=normalized)[keep].copy()

In [None]:
# select only main behavior
# NOTE(review): `final_ds0` is not assigned in the cells visible here (the
# data-loading cell assigns `ds0`); confirm another cell defines it before
# this one runs, otherwise this raises NameError on a fresh kernel.
final_ds = replace(final_ds0)
# Use the Series method: the top-level pd.value_counts function is deprecated.
final_ds["Classification"].value_counts(sort=True)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime

def _extract_metrics(y_true, y_pred, dataset_name, sensor, iteration):
    """Return a per-class metrics table for one evaluation split.

    One row per class label found in the classification report, with columns
    class, precision, sensitivity (the report's recall), f1-score, accuracy
    (overall accuracy repeated on every row), dataset, sensor, model and
    iteration. Metric values are rounded to 4 decimals.
    """
    accuracy = round(accuracy_score(y_true, y_pred), 4)
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    metrics = pd.DataFrame(report).T
    # Class labels are integer codes, so their report keys are numeric
    # strings; this drops the 'accuracy' / 'macro avg' / 'weighted avg' rows.
    metrics = metrics[metrics.index.str.isnumeric()][['precision', 'recall', 'f1-score']].round(4)
    metrics = metrics.rename(columns={'recall': 'sensitivity'})
    metrics['accuracy'] = accuracy
    metrics['dataset'] = dataset_name
    metrics['sensor'] = sensor
    metrics['model'] = 'Random Forest'
    metrics['iteration'] = iteration
    metrics.index.name = 'class'
    return metrics.reset_index()


def rf_multi_by_sensor(data, n=15, data_name="data", sensors=None,
                       random_seeds=None, return_top_features=False):
    """Train and evaluate random-forest behaviour classifiers per sensor.

    For every sensor and every random seed the function:

    1. splits the sensor's rows 80/20 (stratified) into train+validation and
       test partitions,
    2. ranks features with a 100-tree random forest fitted on the
       train+validation partition and keeps the ``n`` most important ones,
    3. splits train+validation 80/20 (stratified) into train and validation,
    4. tunes a random forest with a 5-fold stratified grid search on the
       training rows (scoring: accuracy),
    5. records per-class metrics of the best estimator on the validation and
       test rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Sensor', 'Time', 'ID' and 'Classification';
        every remaining column is used as a feature. 'Classification' must
        hold the labels "lying", "standing", "eating", "walking" (or their
        integer codes 0-3).
    n : int, default 15
        Number of top-ranked features to keep.
    data_name : str, default "data"
        Currently unused; kept so existing calls keep working.
    sensors : iterable of str, optional
        Sensors to process. Defaults to ``['Tg1']``.
    random_seeds : iterable of int, optional
        One full split/selection/tuning run is done per seed. Defaults to
        ``[11, 22, 33, 44, 55, 66, 77, 88, 99, 101]``.
    return_top_features : bool, default False
        When True, also return the selected features of every run.

    Returns
    -------
    list of pandas.DataFrame
        One frame per (sensor, seed) run: the validation metric rows followed
        by the test metric rows.
    (list, pandas.DataFrame)
        Only when ``return_top_features`` is True: the list above plus a frame
        with columns 'sensor', 'iteration', 'rank', 'feature'.

    Raises
    ------
    ValueError
        If a sensor has no rows, or 'Classification' contains a label that is
        neither a known behaviour name nor one of its integer codes.
    """
    sensor_list = ['Tg1'] if sensors is None else list(sensors)
    seeds = [11, 22, 33, 44, 55, 66, 77, 88, 99, 101] if random_seeds is None else list(random_seeds)
    behavior_map = {"lying": 0, "standing": 1, "eating": 2, "walking": 3}
    valid_codes = list(behavior_map.values())

    all_results = []
    top_features_records = []

    for sensor in sensor_list:
        print(f"\n\n Sensor: {sensor} ")
        sensor_data = data.loc[data['Sensor'] == sensor]
        if sensor_data.empty:
            raise ValueError(f"No rows found for sensor {sensor!r}.")

        # Encode behaviors explicitly as integers. DataFrame.replace relied on
        # pandas silently downcasting the object column to int (deprecated);
        # without that downcast scikit-learn rejects the object-dtype labels.
        # Values that are already integer codes pass through unchanged.
        encoded = sensor_data["Classification"].map(
            lambda label: behavior_map.get(label, label)
        )
        known = encoded.isin(valid_codes)
        if not known.all():
            unknown = sorted({str(value) for value in encoded[~known]})
            raise ValueError(
                f"Unknown behavior label(s) for sensor {sensor!r}: {unknown}. "
                f"Expected one of {list(behavior_map)}."
            )
        sensor_data = (
            sensor_data
            .drop(columns=['Time', 'ID', 'Sensor'])
            .assign(Classification=encoded.astype(int))
        )

        for i, seed in enumerate(seeds, 1):

            # Split into train/val/test
            train_val_data, test_data = train_test_split(
                sensor_data,
                test_size=0.2,
                stratify=sensor_data["Classification"],
                random_state=seed
            )

            X_train_val = train_val_data.drop(columns=["Classification"])
            y_train_val = train_val_data["Classification"]
            X_test = test_data.drop(columns=["Classification"])
            y_test = test_data["Classification"]

            # Select top N features using Random Forest
            # NOTE(review): the ranking is fitted on the whole train+validation
            # partition, i.e. before the validation rows are split off below,
            # so the 'Validation' metrics are computed on rows that influenced
            # feature selection (the test rows are not involved). Confirm this
            # is intended.
            rf_fs = RandomForestClassifier(
                n_estimators=100,
                random_state=seed,
                class_weight="balanced",
                n_jobs=-1
            )
            rf_fs.fit(X_train_val, y_train_val)
            importance_df = pd.DataFrame({
                'Feature': X_train_val.columns,
                'Importance': rf_fs.feature_importances_
            }).sort_values(by='Importance', ascending=False)
            top_features = importance_df.head(n)['Feature'].tolist()

            # Store selected features
            top_features_records.append(pd.DataFrame({
                'sensor': sensor,
                'iteration': i,
                'rank': list(range(1, len(top_features) + 1)),
                'feature': top_features
            }))

            # Reduce features
            X_train_val = X_train_val[top_features]
            X_test = X_test[top_features]

            # Further split train/val
            X_train, X_val, y_train, y_val = train_test_split(
                X_train_val, y_train_val,
                test_size=0.2,
                stratify=y_train_val,
                random_state=seed
            )

            # Define model + hyperparameters
            base_model = RandomForestClassifier(
                random_state=seed,
                class_weight="balanced",
                n_jobs=-1
            )
            rf_params = {
                'n_estimators': [50, 100],
                'max_depth': [None, 10],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
            grid = GridSearchCV(
                estimator=base_model,
                param_grid=rf_params,
                cv=cv,
                n_jobs=-1,
                scoring='accuracy',
                verbose=0
            )
            grid.fit(X_train, y_train)
            best_model = grid.best_estimator_

            # Evaluate and store results
            val_df = _extract_metrics(y_val, best_model.predict(X_val), 'Validation', sensor, i)
            test_df = _extract_metrics(y_test, best_model.predict(X_test), 'Test', sensor, i)
            all_results.append(pd.concat([val_df, test_df], ignore_index=True))

    if return_top_features:
        if top_features_records:
            top_features_df = pd.concat(top_features_records, ignore_index=True)
        else:
            # No sensor/seed combination was run; return an empty, typed-by-name frame.
            top_features_df = pd.DataFrame(columns=['sensor', 'iteration', 'rank', 'feature'])
        return all_results, top_features_df

    return all_results