### Construct graphs for the machine learning results

In [None]:
# File management
import glob # Determine what files exist within certain directories.
import json
import os.path
from os import path, sep, listdir, makedirs, getcwd
import re
# Graphs
import pandas as pd # Standard format for the training and testing data.
import matplotlib.pyplot as plt # Graph plotting.
import seaborn as sns
from Bio import Entrez
from ete4 import NCBITaxa
import math
from collections import Counter
import warnings # Ignore warnings, don't read too much into this. 

In [None]:
#                _                               _       _     _           
#               (_)                             (_)     | |   | |          
#   __ _ ___ ___ _  __ _ _ __   __   ____ _ _ __ _  __ _| |__ | | ___  ___ 
#  / _` / __/ __| |/ _` | '_ \  \ \ / / _` | '__| |/ _` | '_ \| |/ _ \/ __|
# | (_| \__ \__ \ | (_| | | | |  \ V / (_| | |  | | (_| | |_) | |  __/\__ \
#  \__,_|___/___/_|\__, |_| |_|   \_/ \__,_|_|  |_|\__,_|_.__/|_|\___||___/
#                   __/ |                                                  
#                  |___/                                                   
# Contact address handed to Bio.Entrez; it is left empty here.
# NOTE(review): NCBI presumably expects a real address for Entrez queries — confirm before use.
Entrez.email = ""
# ete4 taxonomy handle; get_lineage_string() calls ncbi.get_lineage() on it.
ncbi = NCBITaxa()
# Phenotype names; each is also used as a sub-directory name under dir_with_multiple_phenotype_outputs.
phenotypes = ["gram", "motility", "spore", "oxygen","temperature"]
# Metric labels and bar width; in the visible part of this notebook they are only
# referenced from commented-out plotting code.
metrics = ['Accuracy_data', 'AUC', 'F1', 'Precision', 'Recall', 'Matthews']
bar_width = 0.15 
# Classifier names; they appear verbatim in the output directory and file names read below.
classifiers = ["DecisionTreeClassifier", "GradientBoostingClassifier", "RandomForestClassifier"]
# Root of the per-phenotype output tree, resolved against the current working directory.
dir_with_multiple_phenotype_outputs = path.abspath("../data/output_MI")


In [None]:

def rgb_to_hex(rgb_color):
    """
    Convert an RGB colour to its hexadecimal string form.

    Parameters:
    - rgb_color: sequence of three integers (red, green, blue). Tuples,
      lists and other three-item iterables are accepted.

    Returns:
    - String of the form '#rrggbb' with lower-case hex digits.

    Raises:
    - TypeError if rgb_color does not hold exactly three integer values.
    """
    # The % operator only unpacks real tuples, so normalise the input first;
    # this lets lists (and similar sequences) be formatted as well.
    return '#%02x%02x%02x' % tuple(rgb_color)

def interpolate_color(color_start_rgb, color_end_rgb, t):
    """
    Blend two RGB colours channel by channel.

    Parameters:
    - color_start_rgb: Tuple of integers with the RGB colour used at t = 0.
    - color_end_rgb: Tuple of integers with the RGB colour used at t = 1.
    - t: Float interpolation factor, expected to lie between 0 and 1.

    Returns:
    - A tuple with the interpolated RGB colour; every channel is truncated
      to an integer.
    """
    blended_channels = []
    for channel_from, channel_to in zip(color_start_rgb, color_end_rgb):
        # Linear interpolation, truncated towards zero by int().
        blended_channels.append(int(channel_from + (channel_to - channel_from) * t))
    return tuple(blended_channels)


def create_gradient_image():
    """
    Draw a horizontal red-to-green gradient and save it as a PNG.

    The gradient is built as an array of 100 rows by 800 columns: the red
    channel falls from 1 to 0 from left to right, the green channel rises
    from 0 to 1, and blue stays 0. The figure is written to
    'red_to_green_gradient.png' in the current working directory.

    Returns:
    - None.
    """
    # Imported here because the notebook has no module-level numpy import;
    # without it every `np.` call below raises NameError.
    import numpy as np

    width = 800
    height = 100

    gradient = np.linspace(0, 1, width)
    gradient_rgb = np.zeros((height, width, 3))
    gradient_rgb[:, :, 0] = np.flip(gradient)  # red: high on the left
    gradient_rgb[:, :, 1] = gradient           # green: high on the right

    # Create the figure and display the gradient
    plt.figure(figsize=(8, 2))
    plt.imshow(gradient_rgb, aspect='auto')
    plt.axis('off')  # Turn off axis
    plt.savefig('red_to_green_gradient.png', bbox_inches='tight', pad_inches=0)

class TaxonomyFrom:
    """Map genome accessions to taxids using a tab-separated phenotype table."""

    def __init__(self, phenotype_file_path: str):
        """
        Load the accession-to-taxid mapping.

        Parameters:
        - phenotype_file_path: path to a tab-separated file that has the
          columns '?accession' and '?taxid'.
        """
        phenotype_pd = pd.read_csv(phenotype_file_path, delimiter="\t")
        # regex=False makes '.' a literal dot. On pandas versions where
        # str.replace defaults to regex matching, '.' would match every
        # character and wipe out the whole accession.
        phenotype_pd["?accession"] = phenotype_pd["?accession"].str.replace('.', '_', regex=False)
        self.genome_to_taxa = dict(zip(phenotype_pd['?accession'], phenotype_pd['?taxid']))

    def genome(self, genome_accession: str):
        """
        Return the taxid stored for a genome accession.

        Parameters:
        - genome_accession: accession with dots already replaced by
          underscores, matching the keys built in __init__.

        Returns:
        - The taxid, or None when the accession is unknown.
        """
        return self.genome_to_taxa.get(genome_accession)

class TreeWalk:
    """Helpers that combine a taxonomy tree with per-genome prediction data."""

    def __init__(self, tree):
        """
        Parameters:
        - tree: tree object exposing traverse(); its nodes are read for
          .rank, .children, .name and .taxid by the methods below.
        """
        self.tree = tree
        # Rank names, ordered from broadest to most specific.
        self.taxonomy = ["domain", "kingdom", "phylum", "class", "order", "family", "genus", "subgenus", "species group", "species", "subspecies"]
        # Filled by calculate_confidence_for_tree(); starts empty so that the
        # get_taxid_confidence property never raises AttributeError.
        self.taxid_confidence = {}

    def rank_limit(self, rank: str):
        """
        Call prune on the tree for every child of a node that has the given rank.

        Parameters:
        - rank: rank name compared against str(node.rank).
        """
        for node in self.tree.traverse("postorder"):
            if str(node.rank) == rank:
                # Iterate over a snapshot: prune() may change node.children.
                for child in list(node.children):
                    print(child.rank)
                    # Use the tree held by this instance, not a global `tree`.
                    # NOTE(review): confirm that prune(child.name) has the
                    # intended effect (keep vs. remove) for the tree library in use.
                    self.tree.prune(child.name)

    def calculate_confidence_for_tree(self, merged_df: pd.DataFrame) -> dict:
        """
        Average 'ConfidencePrediction' over every taxid found in the lineages.

        Parameters:
        - merged_df: DataFrame with a 'lineage' column (taxids joined by ';')
          and a 'ConfidencePrediction' column. Rows where either value is
          missing are ignored.

        Returns:
        - Dict mapping taxid (as string) to the mean confidence of all rows
          whose lineage contains that taxid. The same dict is stored on
          self.taxid_confidence.

        Raises:
        - AssertionError if a tree node's taxid has no confidence value.
        - ValueError if a lineage entry is not an integer.
        """
        # Step 1: collect all confidence values per taxid.
        confidences_by_taxid = {}
        for lineage, confidence in zip(merged_df["lineage"], merged_df["ConfidencePrediction"]):
            if pd.isna(lineage) or pd.isna(confidence):
                continue
            for taxid in lineage.split(";"):
                # Normalise through int() and store under a string key, so the
                # membership test and the stored key always agree.
                key = str(int(taxid))
                confidences_by_taxid.setdefault(key, []).append(confidence)

        # Step 2: reduce every list to its mean.
        taxid_confidence = {
            key: sum(values) / len(values)
            for key, values in confidences_by_taxid.items()
        }

        # Step 3: every node of the tree must be covered.
        for node in self.tree.traverse("postorder"):
            assert str(node.taxid) in taxid_confidence, f"no confidence value for taxid {node.taxid}"

        self.taxid_confidence = taxid_confidence
        return taxid_confidence

    def calculate_genomes_used(self, merged_df: pd.DataFrame) -> dict:
        """
        Count how many lineages contain each taxid.

        Parameters:
        - merged_df: DataFrame with a 'lineage' column (taxids joined by ';').
          Missing lineages are skipped.

        Returns:
        - Dict mapping taxid (as string) to the number of rows whose lineage
          lists it.

        Raises:
        - AssertionError if a tree node's taxid was never counted.
        """
        taxid_to_abundance = Counter()
        for lineage in merged_df['lineage']:
            if pd.notna(lineage):
                taxid_to_abundance.update(lineage.split(";"))

        # String keys keep lookups consistent with str(node.taxid) below.
        taxid_to_abundance_str_keys = {str(key): value for key, value in taxid_to_abundance.items()}

        for node in self.tree.traverse("postorder"):
            assert str(node.taxid) in taxid_to_abundance_str_keys, f"no genome count for taxid {node.taxid}"

        return taxid_to_abundance_str_keys

    @property
    def get_taxid_confidence(self):
        """Mean confidences from the last calculate_confidence_for_tree() call ({} before any call)."""
        return self.taxid_confidence

    @property
    def get_tree(self):
        """The tree object this walker was created with."""
        return self.tree

def get_lineage_string(taxid):
    """
    Build the lineage of a taxid as one semicolon-separated string.

    Parameters:
    - taxid: taxid handed to ncbi.get_lineage().

    Returns:
    - String with the lineage taxids (not names) joined by ';', in the order
      returned by ncbi.get_lineage().
    """
    return ";".join(map(str, ncbi.get_lineage(taxid)))


def make_sectional_table(df: pd.DataFrame, merge_column_1: str, merge_column_2: str):
    """
    Sorts a DataFrame by two specified columns and modifies it in place.
    Converts the DataFrame into a sectional table format where unique rows from
    the first column span multiple values in the second column.

    Example Input:
    # Col1  | Col2  | Col3
    # Gram  | DT    | 0.8
    # Gram  | RF    | 0.8
    # Motility | RF | 0.9

    Example Output:
          Col2    | Col3
    Gram
           DT      0.8
           RF      0.8
    Motility
           RF     0.9

    Parameters:
    df : pd.DataFrame
        The pandas DataFrame to be sorted. This input DataFrame is modified in place.

    merge_column_1 : str
        The name of the primary column to sort by.

    merge_column_2 : str
        The name of the secondary column to add hierarchical distinctions.
    """
    df.sort_values(by=[merge_column_1, merge_column_2], inplace=True)
    # Ensure merge col 1 and -2 are nex to each other in the first and second place.
    cols  = df.columns.to_list()
    cols.remove(merge_column_1)
    cols.remove(merge_column_2)

    # Create a new list of columns with merge_column_1 and merge_column_2 at the beginning
    cols_new = [merge_column_1, merge_column_2] + cols

    # Reorder the DataFrame columns
    df = df[cols_new]

    column_1_blocks = {}
    for n_row, row in enumerate(df.iterrows()):
        col_1_value = row[1][merge_column_1]
        if col_1_value not in column_1_blocks:
            column_1_blocks[col_1_value] = n_row

    new_row = [" "] * (df.columns.size - 1)
    new_list = []

    for n_row, row in enumerate(df.iterrows()):
        row = row[1]
        if n_row == column_1_blocks.get(row[merge_column_1]):
            new_row[0] = "\\rowcolor[HTML]{EFEFEF} \n " + row[merge_column_1]
            new_list.append( new_row.copy())

        row_list_new = row.tolist()[1:]
        row_list_new[0] = "\hspace{1em} " + str(row_list_new[0])
        new_list.append(row_list_new)

    new_df = pd.DataFrame(new_list)
    new_df.columns = df.columns[1:]

    latex_table = new_df.reset_index(drop=True).set_index(new_df.columns.to_list()[0]).to_latex()
    latex_table = re.sub(r"^(([0-9]| )*?&)", "", latex_table).replace("l"*len(new_df.columns), "l"*(len(new_df.columns) -1) ) # remove first row
    latex_table = latex_table.replace("nan", "-")
    return latex_table

In [None]:
# Working directory of the running kernel; the bare expression shows it as cell output.
cwd = getcwd()
cwd

### Functions

In [None]:
# Collect the test-set predictions of every phenotype / classifier / iteration
# into one long DataFrame.
prediction_columns = ["Genomes", "Observation", "ObservedString", "Prediction", "PredictedString", "ConfidencePrediction"]

dfs = []
for phenotype in phenotypes:
    for classifier in classifiers:
        # One test output file per iteration (1..5).
        test_outputs = [
            f"{dir_with_multiple_phenotype_outputs}/{phenotype}/iteration_{iteration}/protein_domains/{classifier}/{classifier}-test.tsv"
            for iteration in range(1, 6)
        ]

        for iteration, file_path in enumerate(test_outputs):
            # Keep only the prediction columns, then tag the rows with their
            # origin. enumerate() is zero-based, hence the + 1.
            df = pd.read_csv(file_path, sep='\t')[prediction_columns]
            df = df.assign(
                iteration=iteration + 1,
                phenotype=phenotype,
                classifier=classifier,
            )
            dfs.append(df)

# Stack everything into a single frame with a fresh running index.
combined_df = pd.concat(dfs, ignore_index=True)

# Shown as cell output; could also be written out with .to_csv("all_outputs.tsv", sep="\t").
combined_df

# Get the used genomes and their respective phenotypes.

In [None]:
from migenpro.ml.machine_learning_main import *

# For every phenotype: load the feature and phenotype matrices of iteration 1,
# restrict both to their shared genomes and print the number of genomes and the
# phenotype value counts.
# NOTE(review): FeatureMatrix and PhenotypeMatrix presumably come from the
# migenpro star import above; their exact semantics are not visible here.
for phenotype in phenotypes:
    feature_matrix_path = f"{dir_with_multiple_phenotype_outputs}/{phenotype}/iteration_1/observed_results_before_train_test_split.tsv"
    phenotype_matrix_path = f"{dir_with_multiple_phenotype_outputs}/{phenotype}/phenotype_matrix.tsv"
    # Built but not used anywhere else in this cell.
    phenotype_path = f"{dir_with_multiple_phenotype_outputs}/{phenotype}/phenotype.tsv"

    feature_matrix = FeatureMatrix(feature_matrix_path)
    feature_matrix.load_matrix()
    phenotype_matrix = PhenotypeMatrix(phenotype_matrix_path)
    phenotype_matrix.load_matrix()
    # Each matrix is asked for its overlap with the other one's file_df.
    intersect_genomes_phenotype = phenotype_matrix.get_intersected_genomes(feature_matrix.file_df)
    intersect_genomes_feature = feature_matrix.get_intersected_genomes(phenotype_matrix.file_df)
    feature_matrix_subset = feature_matrix.create_subset(intersect_genomes_feature)
    phenotype_matrix_subset = phenotype_matrix.create_subset(intersect_genomes_phenotype)

    # Automatically remove duplicate indexes, keeping the first instance
    phenotype_matrix_subset = phenotype_matrix_subset[~phenotype_matrix_subset.index.duplicated(keep='first')]

    # Number of genomes left after de-duplication.
    genomes = phenotype_matrix_subset.index.to_list()
    print(len(genomes))

    # Frequency of every phenotype value among those genomes.
    phenotype_frequencies = phenotype_matrix_subset.value_counts()
    print(phenotype_frequencies)
    


In [None]:
# Parse the per-iteration test summaries into one row per (phenotype,
# iteration, classifier) with the metric values as strings.
results = []
for phenotype in phenotypes:
    # Generate the list of summary file paths for the different iterations
    test_outputs = [f"{dir_with_multiple_phenotype_outputs}/{phenotype}/iteration_{iteration}/graphs/test-summary.tsv" for iteration in range(1, 6)]

    for iteration, file_path in enumerate(test_outputs):

        with open(file_path, "r") as f:
            # Look at every line exactly once. Mixing `for line in f` with
            # f.readline() would consume two lines per step, silently drop
            # every other line and fail at the end of files with an odd
            # number of lines.
            for raw_line in f:
                fields = raw_line.rstrip("\r\n").split(",")

                # Data lines have a path in the first field followed by six
                # metrics; anything else (header, blank line) is skipped.
                if len(fields) < 7 or "/" not in fields[0]:
                    continue

                # The classifier name is the parent directory in that path.
                classifier = fields[0].split("/")[-2]

                # NOTE(review): the "iteration:" key (with colon) and its
                # zero-based value are kept as they were so the resulting
                # column does not change — confirm whether both are intended.
                results.append({
                    "Phenotype": phenotype,
                    "iteration:": iteration,
                    "Classifier": classifier,
                    "F1 Score": fields[1],
                    "Accuracy": fields[2],
                    "Precision": fields[3],
                    "Recall": fields[4],
                    "MCC": fields[5],
                    "AUC": fields[6],
                })


metrics_df_all = pd.DataFrame(results)

In [None]:
import pandas as pd

# Metric columns that are aggregated below.
metric_cols = ["F1 Score", "Accuracy", "Precision", "Recall", "MCC", "AUC"]

# The metrics were read as text; convert them to numbers. Values that cannot
# be parsed become NaN (errors='coerce') instead of raising.
for metric_name in metric_cols:
    metrics_df_all[metric_name] = pd.to_numeric(metrics_df_all[metric_name], errors='coerce')

# Mean and standard deviation of every metric per phenotype / classifier pair.
aggregation_spec = {metric_name: ["mean", "std"] for metric_name in metric_cols}
grouped_metrics = metrics_df_all.groupby(["Phenotype", "Classifier"])
summary_df = grouped_metrics.agg(aggregation_spec).reset_index()

# Collapse the two-level column index: ("F1 Score", "mean") -> "F1 Score mean";
# the grouping columns have an empty second level and keep their plain name.
flat_column_names = []
for top_level, sub_level in summary_df.columns.values:
    if sub_level:
        flat_column_names.append(" ".join((top_level, sub_level)).strip())
    else:
        flat_column_names.append(top_level)
summary_df.columns = flat_column_names

# Shown as cell output.
summary_df

In [None]:
# Build a presentation table with one "mean±std" string per metric.
formatted_df = summary_df[["Phenotype", "Classifier"]].copy()

for metric in metric_cols:
    mean_col = f"{metric} mean"
    std_col = f"{metric} std"

    # Only format metrics for which both aggregate columns exist.
    if mean_col in summary_df.columns and std_col in summary_df.columns:
        # Use a real plus-minus sign; the former "Â±" was the UTF-8 encoding
        # of "±" mis-decoded as Latin-1 and ended up verbatim in the table.
        formatted_df[metric] = summary_df.apply(
            lambda x: f"{x[mean_col]:.2f}±{x[std_col]:.2f}", axis=1
        )

# Shown as cell output.
formatted_df

In [None]:
# Print the LaTeX source of the metrics table, sectioned by phenotype with one row per classifier.
print(make_sectional_table(formatted_df, "Phenotype", "Classifier"))

### Feature importance analysis

In [None]:
# Show the resolved base output directory as cell output.
dir_with_multiple_phenotype_outputs

In [None]:
 ## Get all feature importance metrics per phenotype
# Read the gini feature importance summaries of every phenotype, classifier and
# iteration. phenotype_feature_importance_df is reset for each phenotype, so
# after the loop it only holds the rows of the last phenotype.
for phenotype in phenotypes:
    phenotype_feature_importance_df = pd.DataFrame()
    for classifier in classifiers:
        feature_importance_df = pd.DataFrame()
        # One summary file per iteration (1..5).
        test_outputs = [f"{dir_with_multiple_phenotype_outputs}/{phenotype}/iteration_{iteration}/protein_domains/{classifier}/gini_feature_importance_summary_{classifier}.tsv" for iteration in range(1, 6)]
        # Wide table for the (commented-out) heatmap: one importance column per iteration.
        feature_importances_graph_ready = pd.DataFrame()
        for iteration, file_path in enumerate(test_outputs):
            iteration = iteration + 1 # enumerate() is zero-based; match the actual iteration number.
            feature_importance_df = pd.read_csv(file_path, delimiter="\t")
            # Give the importance column a name unique to this classifier and iteration.
            importance_column_name=  f"importance_{classifier}_{iteration}"
            feature_importance_df.rename(columns={'importance': importance_column_name}, inplace=True)

            # Append the rows of this file to the per-phenotype table; columns
            # that a file does not have are filled with NaN by concat.
            phenotype_feature_importance_df = (pd.concat([phenotype_feature_importance_df, feature_importance_df], ignore_index=True, sort=False))

            # For heatmap: keep only the rows that carry a value in this iteration's column.
            phenotype_specific_feature_importance_this_iteration = phenotype_feature_importance_df.loc[pd.notna(phenotype_feature_importance_df[f"importance_{classifier}_{iteration}"])]
            # Rows with an importance of at least 0.01; the result is not used further in this cell.
            filtered_df = phenotype_specific_feature_importance_this_iteration[phenotype_specific_feature_importance_this_iteration[importance_column_name] >= 0.01]
            phenotype_specific_feature_importance_this_iteration.set_index("feature_name", inplace=True)
            # Add this iteration's importances as a new column, aligned on feature_name.
            feature_importances_graph_ready = pd.concat([feature_importances_graph_ready, phenotype_specific_feature_importance_this_iteration[f"importance_{classifier}_{iteration}"]], axis = 1, join="outer")

        # Heatmap rendering, currently disabled:
        # heatmap = sns.heatmap(feature_importances_graph_ready, annot=True)
        # plt.title(f"Gini feature importance {classifier} for {phenotype}")
        # plt.xticks(ticks=range(len(feature_importances_graph_ready.columns)),
        #    labels=[f'Iteration {i}' for i in range(1, 6)],
        #    rotation=45)
        # plt.savefig(f"{phenotype}_{classifier}", dpi=300, bbox_inches='tight')
        # plt.close()

# Shown as cell output (rows of the last phenotype only, see above).
phenotype_feature_importance_df


In [None]:
# Rank-sum table of the feature importances for a single phenotype.
# (Use `for phenotype in phenotypes:` instead to process all of them.)
phenotype = "motility"
print(f"\n=== Processing {phenotype} ===")

for classifier in classifiers:
    print(f"\nClassifier: {classifier}")

    # Grows by one rank column per iteration.
    all_iterations_data = pd.DataFrame()

    # One importance summary per iteration (1..5).
    test_outputs = [f"{dir_with_multiple_phenotype_outputs}/{phenotype}/iteration_{iteration}/protein_domains/{classifier}/gini_feature_importance_summary_{classifier}.tsv" for iteration in range(1, 6)]

    for iteration, file_path in enumerate(test_outputs):
        iteration_num = iteration + 1  # enumerate() is zero-based
        rank_column = f'rank_iter_{iteration_num}'

        # A missing file is reported and skipped.
        try:
            feature_importance_df = pd.read_csv(file_path, delimiter="\t")
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            continue

        # Rank 1 = highest importance in this iteration.
        feature_importance_df = feature_importance_df.sort_values('importance', ascending=False).reset_index(drop=True)
        feature_importance_df[rank_column] = range(1, len(feature_importance_df) + 1)
        iteration_ranks = feature_importance_df[['feature_name', rank_column]]

        # The first usable iteration starts the table; later ones are
        # outer-joined so that features missing in some iteration are kept.
        if all_iterations_data.empty:
            all_iterations_data = iteration_ranks
        else:
            all_iterations_data = all_iterations_data.merge(iteration_ranks, on='feature_name', how='outer')

    # Nothing could be read for this classifier.
    if all_iterations_data.empty:
        continue

    # A feature that is absent from an iteration gets the rank 1000 there.
    rank_cols = [col for col in all_iterations_data.columns if col.startswith('rank_iter_')]
    all_iterations_data[rank_cols] = all_iterations_data[rank_cols].fillna(1000)

    # Sum of the ranks over all iterations; a lower sum is better.
    all_iterations_data['rank_sum'] = all_iterations_data[rank_cols].sum(axis=1)
    result_df = all_iterations_data[['feature_name', 'rank_sum']].sort_values('rank_sum')

    # Markdown-style table with the 20 best features.
    print("\n| Domain | Rank Sum |")
    print("|--------|----------|")
    for _, best_row in result_df.head(20).iterrows():
        print(f"| {best_row['feature_name']} | {int(best_row['rank_sum'])} |")

    # The complete ranking is also written to disk.
    result_df.to_csv(f"{phenotype}_{classifier}_rank_sums.tsv", sep='\t', index=False)

print("Analysis completed!")


In [None]:
# phenotype = phenotypes[1]
# print(phenotype)
# phenotype_feature_importance_df = pd.DataFrame()
# for classifier in classifiers:
#     feature_importance_df = pd.DataFrame()
#      # Per iteration
#     test_outputs = [f"{dir_with_multiple_phenotype_outputs}/{phenotype}_output/mloutput/iteration_{iteration}/gini_feature_importance_summary__iteration_{iteration}{classifier}_{phenotype}.tsv" for iteration in range(1, 6)]
#     for iteration, file_path in enumerate(test_outputs):
#         iteration = iteration + 1 # to max actual iteration number used.
#         print(iteration)
#         feature_importance_df = pd.read_csv(file_path, delimiter="\t")
#         feature_importance_df.rename(columns={'importance': f"importance_{classifier}_{iteration}"}, inplace=True)
#         phenotype_feature_importance_df = (pd.concat([phenotype_feature_importance_df, feature_importance_df], ignore_index=True, sort=False))


In [None]:
# warnings.filterwarnings('ignore')
# classifiers = ["DecisionTreeClassifier", "GradientBoostingClassifier", "RandomForestClassifier"]
# metrics = ['Accuracy_data', 'AUC', 'F1', 'Precision', 'Recall', 'Matthews']
#
# for phenotype in phenotypes:
#
#     bar_width = 0.15  # Width of the bars
#
#     taxonomy_from = TaxonomyFrom(os.path.join(dir_with_multiple_phenotype_outputs, f"{phenotype}", "phenotype.tsv"))
#     for classifier in classifiers:
#         #######################
#         ## Phylogenetic tree ##
#         #######################
#
#         # Get the list of taxids and create the taxid set with the entire lineage
#         # result = pd.concat([prediction_and_probability_df, genome_tax_df], axis=1)
#         summary_file =os.path.join(dir_with_multiple_phenotype_outputs, f"{phenotype}", "iteration_1", "protein_domains", classifier, f"{classifier}-scenario.tsv")
#         if not path.isfile(summary_file):
#             raise FileNotFoundError(summary_file)
#
#         prediction_and_probability_df = pd.read_csv(summary_file, delimiter="\t")
#         prediction_and_probability_df.drop("Unnamed: 0", inplace=True, axis=1)
#         genomes = prediction_and_probability_df["Genomes"].to_list()
#         prediction_and_probability_df.set_index("Genomes", inplace=True)
#         genome_tax = {}
#
#         for genome in genomes:
#             genome_tax[genome] = taxonomy_from.genome(genome)
#
#         genome_tax_df = pd.DataFrame.from_dict(genome_tax, orient='index', columns=["taxid"])
#         # result = pd.concat([prediction_and_probability_df, genome_tax_df], axis=1)
#
#         merged_df = prediction_and_probability_df.join(genome_tax_df)
#         total_counts = len(merged_df.index)
#
#         merged_df['lineage'] = merged_df.taxid.apply(get_lineage_string)
#         merged_df.to_csv("merged.csv")
#
#         taxids = merged_df.taxid.to_list()
#         tree = ncbi.get_topology(taxids, rank_limit="order")
#         taxid_set = set()
#         for taxid in taxids:
#             lineage = ncbi.get_lineage(taxid)
#             for lineage_taxid in lineage:
#                 taxid_set.add(lineage_taxid)
#
#         # Retrieve the taxonomic names for all relevant taxids
#         taxid_to_name = ncbi.get_taxid_translator(taxids=taxid_set)
#
#         ncbi.annotate_tree(tree, tax2name=taxid_to_name)
#
#         tw_pruned = TreeWalk(tree)
#         taxid_confidence = tw_pruned.calculate_confidence_for_tree(merged_df)
#         counts_per_taxid = tw_pruned.calculate_genomes_used(merged_df)
#
#         counts_df = pd.DataFrame(list(counts_per_taxid.items()), columns=["taxid", "genome_count"])
#         counts_df.to_csv("counts.csv", index=False)

In [None]:
import glob
import os
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One record per (phenotype, iteration, model) with the tuned hyperparameters.
hyperparameters = []

for phenotype in phenotypes:
    # Every iteration directory is expected to hold one optimised_params.json.
    pattern = os.path.join(dir_with_multiple_phenotype_outputs, phenotype, "iteration_*", "optimised_params.json")
    phenotype_optimal_params_json = glob.glob(pattern)
    if not phenotype_optimal_params_json:
        raise FileNotFoundError(f"No JSON files found for phenotype {phenotype}")

    for param_json_path in phenotype_optimal_params_json:
        with open(param_json_path, 'r') as f:
            params = json.load(f)

        # A classifier that is missing from the file yields an empty dict,
        # so its hyperparameters end up as None.
        dt_params = params.get("DecisionTreeClassifier", {})
        rt_params = params.get("RandomForestClassifier", {})
        gb_params = params.get("GradientBoostingClassifier", {})

        # Name of the directory that contains the JSON file, e.g. "iteration_3".
        iteration = os.path.basename(os.path.dirname(param_json_path))
        shared_fields = {"Phenotype": phenotype, "Iteration": iteration}

        hyperparameters.extend([
            {
                **shared_fields,
                "Model": "DT",
                "max_depth": dt_params.get("max_depth"),
            },
            {
                **shared_fields,
                "Model": "RF",
                "max_depth": rt_params.get("max_depth"),
                "n_estimators": rt_params.get("n_estimators"),
                "min_samples_leaf": rt_params.get("min_samples_leaf"),
            },
            {
                **shared_fields,
                "Model": "GB",
                "max_depth": gb_params.get("max_depth"),
                "n_estimators": gb_params.get("n_estimators"),
                "learning_rate": gb_params.get("learning_rate"),
            },
        ])

# Long-format table; hyperparameters a model does not have are left empty.
df = pd.DataFrame(hyperparameters)
df


In [None]:
# Descriptive statistics of every tuned hyperparameter per phenotype and model,
# rounded to two decimals.
hyperparameter_statistics = {
    hyperparameter_name: ["mean", "median", "min", "max", "std"]
    for hyperparameter_name in ("max_depth", "n_estimators", "min_samples_leaf", "learning_rate")
}
summary_stats = df.groupby(["Phenotype", "Model"]).agg(hyperparameter_statistics).round(2)

# Display summary statistics
print(summary_stats)


In [None]:
# Box plot of the tuned max_depth per phenotype, one box per model (hue).
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=df,
    x="Phenotype",
    y="max_depth",
    hue="Model",
    palette="Set2",
    width=0.6,
)
plt.title("Distribution of Optimal max_depth per Phenotype")
plt.xlabel("Phenotype")
plt.ylabel("max_depth")
plt.xticks(rotation=45)
plt.legend(title="Model", loc="upper right")
plt.tight_layout()
# Save before show(): the figure is written to the working directory as PNG.
plt.savefig("max_depth_per_phenotype.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Box plot of the tuned n_estimators per phenotype for the two ensemble models.
plt.figure(figsize=(14, 8))
sns.boxplot(
    # The random forest rows are labelled "RF" when the table is built;
    # filtering on "RT" silently dropped them from this plot.
    data=df[df["Model"].isin(["RF", "GB"])],
    x="Phenotype",
    y="n_estimators",
    hue="Model",
    palette="Set2",
    width=0.6,
)
plt.title("Distribution of Optimal n_estimators per Phenotype")
plt.xlabel("Phenotype")
plt.ylabel("n_estimators")
plt.xticks(rotation=45)
plt.legend(title="Model", loc="upper right")
plt.tight_layout()
# Save before show(): the figure is written to the working directory as PNG.
plt.savefig("n_estimators_per_phenotype.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Box plot of the tuned learning_rate per phenotype; only the "GB" rows carry this value.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=df[df["Model"] == "GB"],
    x="Phenotype",
    y="learning_rate",
    color="lightblue",
    width=0.6,
)
plt.title("Distribution of Optimal learning_rate per Phenotype (GB)")
plt.xlabel("Phenotype")
plt.ylabel("learning_rate")
plt.xticks(rotation=45)
plt.tight_layout()
# Save before show(): the figure is written to the working directory as PNG.
plt.savefig("learning_rate_per_phenotype.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Box plot of the tuned min_samples_leaf per phenotype for the random forest.
plt.figure(figsize=(14, 8))
sns.boxplot(
    # The random forest rows are labelled "RF" when the table is built;
    # filtering on "RT" matched nothing and produced an empty plot.
    data=df[df["Model"] == "RF"],
    x="Phenotype",
    y="min_samples_leaf",
    color="lightgreen",
    width=0.6,
)
plt.title("Distribution of Optimal min_samples_leaf per Phenotype (RF)")
plt.xlabel("Phenotype")
plt.ylabel("min_samples_leaf")
plt.xticks(rotation=45)
plt.tight_layout()
# Save before show(): the figure is written to the working directory as PNG.
plt.savefig("min_samples_leaf_per_phenotype.png", dpi=300, bbox_inches="tight")
plt.show()
