# Umineko Tree

This code generates a decision tree model used for the Umineko 2023 and Umineko 2024 experiments on Kabushima Island, Hachinohe, Japan.  
  
NOTE: We used anaconda env "umineko-analysis (Python 3.11.9)" (name changed to "umineko-tree-dev" cloned from "umineko-analysis-old").   


In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, precision_score
# from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from IPython.display import display
# from scipy import stats

import sys
sys.path.append("../") # Set parent directory to sys.path
sys.dont_write_bytecode = True
%load_ext autoreload
%autoreload 2
from utils.tree_to_code import tree_to_code_cpp
# from src.tree_to_code import tree_to_code_cpp

# Seed shared by the global RNGs below and reused by later cells.
RANDOM_SEED = 558
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Plot colours: the Okabe-Ito palette.
okabe_ito_hex = [
    '#E69F00', '#56B4E9', '#009E73', '#F0E442',
    '#0072B2', '#D55E00', '#CC79A7', '#000000',
]
palette = sns.color_palette(okabe_ito_hex)
# Alternative palettes tried earlier:
# palette = sns.color_palette(["#ff4554", "#00bbdf", "#bad600", "#f02d7d", "#f8b62e", "#8b26a6","#808080"]) # https://anoiro.com/themes/switch-joycons
# palette = sns.color_palette(["#D81B60", "#1E88E5", "#FFC107", "#004D40"])

# Register the palette, apply the plotting theme, and preview the colours
# in the notebook output.
sns.set_palette(palette)
sns.set_theme(
    context='poster',
    style='ticks',
    palette=palette,
    font_scale=1.0,
)
display(sns.color_palette(palette))

## Functions

In [None]:
# Map label names
#
# Every dictionary below is keyed by the same 18 raw annotation labels.
# The raw labels are grouped once here so that each mapping only has to
# state one value per group.
_STATIONARY_LABELS = ('stationary', 'ground_stationary', 'preening')
_BATHING_LABELS = ('bathing', 'bathing_poss')
_FORAGING_LABELS = (
    'foraging',
    'foraging_poss',
    'foraging_fish_poss',
    'poss_foraging',
)
# Labels that are dropped from the analysis (name 'NA', id NaN).
# NOTE(review): 'forgaing_insect' is spelled exactly as in the original
# mapping; presumably it matches the spelling in the label files -- verify
# before "correcting" it.
_EXCLUDED_LABELS = (
    'foraging_steal',
    'ground_active',
    'forgaing_insect',
    'foraging_insect_poss',
    'foraging_non-fish',
    'body_shaking',
    'unknown',
)


def _build_label_map(stationary, bathing, flying_active, flying_passive,
                     foraging, excluded):
    """Return a raw-label -> value dict with one value per label group.

    Keys are inserted in a fixed order: stationary labels, bathing labels,
    'flying_active', 'flying_passive', foraging labels, excluded labels.
    """
    grouped_values = (
        (_STATIONARY_LABELS, stationary),
        (_BATHING_LABELS, bathing),
        (('flying_active',), flying_active),
        (('flying_passive',), flying_passive),
        (_FORAGING_LABELS, foraging),
        (_EXCLUDED_LABELS, excluded),
    )
    mapping = {}
    for raw_labels, value in grouped_values:
        mapping.update(dict.fromkeys(raw_labels, value))
    return mapping


# Raw label -> coarse behaviour name ('NA' marks labels to be removed).
label_name_dict = _build_label_map(
    stationary='stationary',
    bathing='bathing',
    flying_active='flying',
    flying_passive='flying',
    foraging='foraging',
    excluded='NA',
)

# Binary: flying and foraging labels are class 0; stationary and bathing
# labels are class 1.
label_id_dict_binary = _build_label_map(
    stationary=1,
    bathing=1,
    flying_active=0,
    flying_passive=0,
    foraging=0,
    excluded=np.nan,
)

# Binary: Flying or Others (Not-flying)
# Used for Umineko 2023, 2024 field experiments
label_id_dict_binary_2 = _build_label_map(
    stationary=1,
    bathing=1,
    flying_active=0,
    flying_passive=0,
    foraging=1,
    excluded=np.nan,
)

# Binary: Active (flapping) Flight or Others (including passive flight)
label_id_dict_binary_3 = _build_label_map(
    stationary=1,
    bathing=1,
    flying_active=0,
    flying_passive=1,
    foraging=1,
    excluded=np.nan,
)

# Three classes: 0 = flying, 1 = foraging, 2 = stationary (incl. bathing).
label_id_dict_multi3 = _build_label_map(
    stationary=2,
    bathing=2,
    flying_active=0,
    flying_passive=0,
    foraging=1,
    excluded=np.nan,
)

# Four classes: 0 = flying, 1 = foraging, 2 = bathing, 3 = stationary.
label_id_dict_multi4 = _build_label_map(
    stationary=3,
    bathing=2,
    flying_active=0,
    flying_passive=0,
    foraging=1,
    excluded=np.nan,
)


def change_pandas_label_name(df, label_name_dict, label_id_dict):
    """Add the 'label2' and 'label_id' columns derived from df['label'].

    Both columns are produced with ``Series.replace`` on the 'label'
    column: 'label2' uses ``label_name_dict`` and 'label_id' uses
    ``label_id_dict``. Values of 'label' that are not keys of the
    respective dictionary are carried over unchanged.

    The DataFrame is modified in place and the same object is returned.
    """
    raw_labels = df["label"]
    derived_columns = (
        ("label2", label_name_dict),
        ("label_id", label_id_dict),
    )
    for column_name, mapping in derived_columns:
        df[column_name] = raw_labels.replace(mapping)
    return df


def get_annot_labels(cm, df_cm, labels):
    """Build the per-cell annotation strings for a confusion-matrix heatmap.

    Each cell is annotated with two lines: the raw count taken from ``cm``
    and, in parentheses, that cell's share of its column total in
    ``df_cm`` (two decimals). When rows are ground truth and columns are
    predictions, this share is the per-class precision.

    Args:
        cm: Square 2-D array of counts.
        df_cm: DataFrame holding the same counts as ``cm``.
        labels: Sequence of class ids; only its length is used, to reshape
            the result to ``(len(labels), len(labels))``.

    Returns:
        A 2-D NumPy array of strings with one annotation per cell. A
        column whose total is zero yields "nan" ratios.
    """
    group_counts = [f"{value:0.0f}" for value in cm.flatten()]

    # Normalise every column by its own total. The axis is given
    # explicitly: np.sum(df_cm) goes through DataFrame.sum(axis=None),
    # which is deprecated and slated to become a reduction over both axes,
    # which would silently change these ratios.
    column_totals = df_cm.sum(axis=0)
    group_precision_scores = (df_cm / column_totals).values.flatten()
    group_percentages = [f"{value:.2f}" for value in group_precision_scores]

    annot_labels = [
        f"{count}\n({ratio})"
        for count, ratio in zip(group_counts, group_percentages)
    ]
    n_classes = len(labels)
    return np.asarray(annot_labels).reshape(n_classes, n_classes)

## Run Training and Test of Decision Tree models 

In [None]:
# Train and evaluate one decision tree per (label scheme, feature set) pair.
# Train/validation rows come from the Umineko 2018 and 2019 feature CSVs
# (90/10 stratified split); the Umineko 2022 feature CSV is the test set.

# label_type_list = ["binary", "binary_2", "binary_3", "multi3", "multi4"]
# label_type_list = ["binary_2", "binary_3"]
label_type_list = ["binary_2"] # Umineko 2023 and 2024 version

feature_set_list = ["full", "reduced"]

# Per label scheme: (raw label -> class id mapping, class names, class ids).
label_type_config = {
    "binary": (label_id_dict_binary, ["flying", "others"], [0, 1]),
    "binary_2": (label_id_dict_binary_2, ["flying", "others"], [0, 1]),
    # Bug fix: "binary_3" used to be wired to label_id_dict_binary_2, so it
    # trained exactly the same model as "binary_2".
    "binary_3": (label_id_dict_binary_3, ["flying", "others"], [0, 1]),
    "multi3": (
        label_id_dict_multi3,
        ["flying", "foraging", "stationary"],
        [0, 1, 2],
    ),
    "multi4": (
        label_id_dict_multi4,
        ["flying", "foraging", "bathing", "stationary"],
        [0, 1, 2, 3],
    ),
}

# Feature columns used as model input for each feature set.
feature_set_columns = {
    # full set of features
    "full": [
        'acc_mag_mean',
        'acc_mag_1c',
        'acc_mag_mc',
        'acc_mag_var',
        'acc_mag_energy',
        'acc_mag_kurtosis',
        'acc_mag_crest',
        'acc_mag_RMS',
    ],
    # reduced features (drops 1c, energy, crest and RMS)
    "reduced": [
        'acc_mag_mean',
        'acc_mag_mc',
        'acc_mag_var',
        'acc_mag_kurtosis',
    ],
}

for label_type in label_type_list:
    # Fail loudly on a typo instead of silently reusing the variables left
    # over from a previous iteration.
    if label_type not in label_type_config:
        raise ValueError(f"Unknown label_type: {label_type!r}")
    label_id_dict, class_labels, labels = label_type_config[label_type]

    print("----- Train and Validation Dataset -----")
    path1 = "../data/ACC_features/umineko/Umineko2018_random_noise.csv" # 25Hz
    path2 = "../data/ACC_features/umineko/Umineko2019_random_noise.csv" # 25Hz
    path_list = [path1, path2]
    # Read every file first and concatenate once: seeding pd.concat with an
    # empty DataFrame relies on deprecated dtype handling of empty frames.
    train_frames = []
    n_rows_so_far = 0
    for path in path_list:
        df = pd.read_csv(path)
        train_frames.append(df)
        n_rows_so_far += len(df)
        # Cumulative number of rows after adding this file.
        print(f"{os.path.basename(path)}: {n_rows_so_far}")
    df_train = pd.concat(train_frames)
    df_train = change_pandas_label_name(df_train, label_name_dict, label_id_dict)
    # Drop the rows whose raw label maps to 'NA' in label_name_dict.
    df_train = df_train[df_train["label2"] != "NA"]
    print(f"Removed some rows:            {len(df_train)}")
    print(df_train['label2'].value_counts())

    print("------------- Test Dataset -------------")
    path3 = "../data/ACC_features/umineko/Umineko2022.csv" # down-sampled to 25Hz
    df_test = pd.read_csv(path3)
    print(f"{os.path.basename(path3)}: {len(df_test)}")
    df_test = change_pandas_label_name(df_test, label_name_dict, label_id_dict)
    df_test = df_test[df_test["label2"] != "NA"]

    for feature_set in feature_set_list:
        if feature_set not in feature_set_columns:
            raise ValueError(f"Unknown feature_set: {feature_set!r}")
        X_columns = list(feature_set_columns[feature_set])

        # Stratified 90/10 train/validation split of the 2018 + 2019 data.
        X_train_df = df_train[X_columns]
        y_train_s = df_train['label_id']
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_df,
            y_train_s,
            test_size=0.1,
            random_state=RANDOM_SEED,
            stratify=y_train_s,
        )
        X_train, X_valid = X_train.values, X_valid.values
        y_train, y_valid = y_train.values, y_valid.values
        X_test = df_test[X_columns].values
        y_test = df_test['label_id'].values

        # SMOTE: oversample the training split only; the validation and
        # test data are left untouched.
        print("Running SMOTE ...")
        sm = SMOTE(
            k_neighbors=5, # default=5
            random_state=RANDOM_SEED,
        )
        X_train, y_train = sm.fit_resample(X_train, y_train)

        # Decision Tree
        # NOTE(review): no random_state is passed, so the fitted tree
        # depends on NumPy's global RNG state at this point. Consider
        # random_state=RANDOM_SEED, but first confirm that it reproduces
        # the tree that was deployed.
        print("Running Decision Tree Classifier ...")
        model = DecisionTreeClassifier(
            max_depth=5,
            class_weight="balanced"
        )
        model.fit(X_train, y_train)

        # Tree name
        tree_name = f"umineko_tree_{class_labels[0]}_{label_type}_n_features_{model.n_features_in_}"
        print("------------------------------------------------------------")
        print(f"tree_name: {tree_name}")

        fig, axes = plt.subplots(1, 2, figsize=(20, 7))
        (ax0, ax1) = axes.flatten()

        # Validation
        y_valid_pred = model.predict(X_valid)
        y_gt, y_pred = y_valid, y_valid_pred
        report_valid = pd.DataFrame(
            classification_report(
                y_gt, y_pred, target_names=class_labels, output_dict=True
            )
        ).transpose()
        cm = confusion_matrix(y_gt, y_pred, labels=labels)
        df_cm = pd.DataFrame(data=cm, index=class_labels, columns=class_labels)
        display(df_cm)
        annot_labels = get_annot_labels(cm, df_cm, labels)
        sns.heatmap(
            ax=ax0,
            # Column-normalised counts (share of each predicted class).
            # The axis is explicit because np.sum(df_cm) relies on the
            # deprecated DataFrame.sum(axis=None) behaviour.
            data=df_cm / df_cm.sum(axis=0),
            vmin=0, vmax=1.0,
            square=True, cbar=True, annot=annot_labels, fmt='',
            cmap='Blues'
        )

        # Test
        y_test_pred = model.predict(X_test)
        y_gt, y_pred = y_test, y_test_pred
        report_test = pd.DataFrame(
            classification_report(
                y_gt, y_pred, target_names=class_labels, output_dict=True
            )
        ).transpose()
        gt_pred_dict = {'y_gt': y_gt, 'y_pred': y_pred}
        df_gt_pred = pd.DataFrame(gt_pred_dict)
        # NOTE(review): this path is fixed, so it is overwritten on every
        # iteration and ends up holding the predictions of the last
        # (label_type, feature_set) pair only. Left unchanged in case other
        # code reads this exact file; consider deriving it from tree_name.
        save_path = '../output/df_y_gt_y_pred_umineko_2024_tree_binary_2.csv'
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df_gt_pred.to_csv(save_path, index=False)

        cm = confusion_matrix(y_gt, y_pred, labels=labels)
        df_cm = pd.DataFrame(data=cm, index=class_labels, columns=class_labels)
        annot_labels = get_annot_labels(cm, df_cm, labels)
        # NOTE(review): only this heatmap sets annot_kws; the validation
        # heatmap above uses the default annotation size.
        sns.heatmap(
            ax=ax1,
            data=df_cm / df_cm.sum(axis=0),
            vmin=0, vmax=1.0,
            square=True, cbar=True, annot=annot_labels, fmt='',
            cmap='Blues',
            annot_kws={"size": 18},
        )

        # Decoration
        ax_list = [ax0, ax1]
        fig_title_list = ["Validation", "Test"]
        for ax, fig_title in zip(ax_list, fig_title_list):
            ax.set_xlabel("Prediction", labelpad=10)
            ax.set_ylabel("Ground Truth", labelpad=10)
            # Panel title, placed just above the heatmap in data coordinates.
            ax.text(len(class_labels)/2, -0.15,
                    s=fig_title,
                    fontsize=24, fontweight='bold',
                    va='center', ha='center')
        plt.show()
        plt.close()
        # Uncomment to update the results
        # path = f"../figure/confusion_matrix/umineko/{tree_name}"
        # fig.savefig(path, dpi=350, bbox_inches='tight', pad_inches=0.1)

        print("Validation")
        print(report_valid)
        print("Test")
        print(report_test)
        # Uncomment to update the results
        # report_valid.to_csv(f"valid_report_{tree_name}.csv")
        # report_test.to_csv(f"test_report_{tree_name}.csv")

        # Save Tree as .cpp code
        print(class_labels)
        feature_names = X_columns

        # Save model
        # tree_to_code_cpp(model, feature_names, tree_name, class_labels)

        # Feature Importance
        print(f"n features: {model.n_features_in_}")
        print(model.feature_importances_)

        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
        sns.barplot(
            # The model is fitted on plain arrays, so the feature names are
            # taken from X_columns.
            x=X_columns,
            y=model.feature_importances_,
            color="#4682B4")
        ax.tick_params(axis='x', rotation=90)
        ax.tick_params(axis='y')
        ax.set_xlabel("Features", labelpad=5)
        ax.set_ylabel("Importance", labelpad=5)
        plt.show()
        plt.close()

        # Uncomment to update the results
        # path = f"../figure/feature_importance/umineko/{tree_name}"
        # fig.savefig(path, dpi=350, bbox_inches='tight', pad_inches=0.1)

        # params = model.get_params(deep=True)
        # print(params)