In [None]:
!pip install mljar-supervised



In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from supervised.automl import AutoML

# Load the dataset: the "GENUS" column is the target, every other column is a feature.
df = pd.read_csv("X_y.csv")
X = df.drop(columns="GENUS")
y = df["GENUS"]

# Encode the target labels as integers (inverse_transform on `le` recovers the names).
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified train/test split so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, stratify=y_encoded, random_state=42
)

# Initialize AutoML
automl = AutoML(
    mode="Compete",
    total_time_limit=180,   # seconds, i.e. 3 minutes in total
    eval_metric="f1",
    algorithms=["LightGBM", "Xgboost"],
    random_state=42
)

# Train
automl.fit(X_train, y_train)

# Evaluate on the held-out split. The scores AutoML logs while training are
# validation scores computed inside the training data, not test scores.
test_preds = automl.predict(X_test)
print("\nEvaluation on test set:")
print(f"macro F1: {f1_score(y_test, test_preds, average='macro'):.4f}")

# report() returns an HTML object; leaving it as the last expression lets the
# notebook render it (print() would only show the object's repr).
automl.report()

AutoML directory: AutoML_3
The task is multiclass_classification with evaluation metric f1
AutoML will use algorithms: ['LightGBM', 'Xgboost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.640052 trained in 3.39 seconds
Disable stacking for split validation
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 2 models
There was an error during 2_Default_LightGBM training.
Please check AutoML_3/errors.md for details.
3_Default_Xgboost f1 0.819704 trained in 7.87 seconds
* Step not_so_random will try to check up to 18 models
There was an error durin

In [None]:
# Show the leaderboard of trained models.
# automl.report() returns an HTML object, so print() on it only emits
# "<IPython.core.display.HTML object>". get_leaderboard() returns a DataFrame,
# which the notebook renders as a table when it is the cell's last expression.
leaderboard = automl.get_leaderboard()
leaderboard


<IPython.core.display.HTML object>


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


def add_engineered_features(frame):
    """Return a copy of `frame` with the derived features added and HT/DIA removed.

    The same function is applied to the training data and to the data being
    predicted on, so both go through exactly the same transformations.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'HT', 'DIA', 'LAT' and 'LON'. Columns whose
        name contains 'ECOSUBCD_' or 'PFT_' are summed row-wise into
        'ecosubcode_richness' and 'pft_type_count' respectively
        (presumably one-hot indicator columns -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    out = frame.copy()
    out['HT_log'] = np.log1p(out['HT'])
    out['DIA_log'] = np.log1p(out['DIA'])
    out['HT_DIA_ratio'] = out['HT'] / (out['DIA'] + 1)  # +1 keeps the denominator away from 0 when DIA is 0
    out['HT_minus_DIA'] = out['HT'] - out['DIA']
    out['HT_times_DIA'] = out['HT'] * out['DIA']
    out['LAT_LON_sum'] = out['LAT'] + out['LON']
    out['LAT_LON_diff'] = out['LAT'] - out['LON']
    out['ecosubcode_richness'] = out[[c for c in out.columns if 'ECOSUBCD_' in c]].sum(axis=1)
    out['pft_type_count'] = out[[c for c in out.columns if 'PFT_' in c]].sum(axis=1)
    return out.drop(columns=["HT", "DIA"])


# === LOAD YOUR TRAINING DATA ===
df = pd.read_csv("X_y.csv")
X = add_engineered_features(df.drop(columns="GENUS"))
y = df["GENUS"]

# === LABEL ENCODE THE TARGET ===
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# === TRAIN/TEST SPLIT ===
# Split BEFORE fitting the scaler so the held-out rows do not influence the
# scaling statistics (fitting on the full data leaks test information).
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, stratify=y_encoded, random_state=42)
X_train, X_test = X_train.copy(), X_test.copy()  # own the data before assigning columns

# === STANDARDIZE NUMERIC COLUMNS ===
numeric_cols = ['LAT', 'LON', 'HT_log', 'DIA_log', 'HT_DIA_ratio', 'HT_minus_DIA',
                'HT_times_DIA', 'LAT_LON_sum', 'LAT_LON_diff',
                'ecosubcode_richness', 'pft_type_count']
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# === RUN MLJAR AutoML ===
automl = AutoML(
    mode="Compete",
    total_time_limit=600,  # seconds, i.e. 10 minutes
    eval_metric="f1",
    algorithms=['Baseline', 'CatBoost', 'Decision Tree', 'Extra Trees', 'Nearest Neighbors', 'LightGBM', 'Linear', 'Neural Network', 'Random Forest', 'Xgboost'],
    random_state=42
)
automl.fit(X_train, y_train)

# === EVALUATE ===
# Score the held-out split explicitly; the leaderboard below only contains
# AutoML's internal validation scores.
test_preds = automl.predict(X_test)
print("Evaluation on test set:")
print(f"macro F1: {f1_score(y_test, test_preds, average='macro'):.4f}")
print(automl.get_leaderboard())

# === PREDICT ON NEW DATA ===
# Same feature engineering and the scaler fitted on the training split.
X_new = add_engineered_features(pd.read_csv("combined_df.csv"))
X_new[numeric_cols] = scaler.transform(X_new[numeric_cols])
X_new = X_new[X_train.columns]  # align column order (and drop any extra columns)

# Make predictions and map the integer classes back to GENUS names
preds = automl.predict(X_new)
pred_labels = label_encoder.inverse_transform(preds)

# Show prediction distribution (percent of rows per predicted GENUS)
dist = pd.Series(pred_labels).value_counts(normalize=True) * 100
print("\nPredicted GENUS Distribution:\n")
print(dist.sort_values(ascending=False))


Linear algorithm was disabled.
AutoML directory: AutoML_5
The task is multiclass_classification with evaluation metric f1
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Decision Tree', 'Extra Trees', 'Nearest Neighbors', 'LightGBM', 'Neural Network', 'Random Forest', 'Xgboost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.650998 trained in 2.67 seconds
Disable stacking for split validation
* Step simple_algorithms will try to check up to 3 models
2_Baseline f1 0.276883 trained in 0.83 seconds
3_DecisionTree f1 0.51642 trained in 0.82 seconds
4_DecisionTree f1 0.522859 trained in 1.13 seconds
* St