# Import Libraries

In [1]:
import optuna
import os
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [2]:
def read_file(*args,file_type="csv")->pd.DataFrame:
    """:parameter input directories in sequence
        :return DataFrame"""
    
    path=os.path.join(*args)
    if file_type == "tsv":
        return pd.read_csv(path, sep="\t")
    return pd.read_csv(path)

# Experiments

In [3]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

In [4]:
mlflow.set_experiment("DifferentModelsExperiments2")

<Experiment: artifact_location='/home/spynom/mlruns/14', creation_time=1729323387786, experiment_id='14', last_update_time=1729323387786, lifecycle_stage='active', name='DifferentModelsExperiments2', tags={}>

In [5]:
df=read_file("..","data","processed","cleaned.csv").dropna(how="any").drop_duplicates()

X=df.comment
y=df.category+1

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

ngram_range = (1, 3)  # Trigram setting
max_features = 10000  # Set max_features to 1000 for TF-IDF

# Vectorization using TF-IDF, fit on training data only
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
vectorizer.fit(X)  # Fit on all vocab data
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)  # Transform test data

In [6]:

class_weights = compute_class_weight('balanced', classes=np.array([0, 1, 2]), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

In [9]:
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model type
        mlflow.set_tag("mlflow.runName", f"{model_name}_BalanceWeight_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model
        model.fit(X_train, y_train)
        train_y_pred=model.predict(X_train)
        test_y_pred = model.predict(X_test)

        # Log accuracy
        train_accuracy = accuracy_score(y_train,train_y_pred)
        test_accuracy = accuracy_score(y_test, test_y_pred)
        
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("test_accuracy", test_accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, test_y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [10]:
# Step 6: Optuna objective function for LightGBM
def objective_lightgbm(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 10)

    model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42,class_weight=class_weights_dict)
    return accuracy_score(y_test, model.fit(X_train, y_train).predict(X_test))


# Step 7: Run Optuna for LightGBM, log the best model only
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_lightgbm, n_trials=30)

    # Get the best parameters and log only the best model
    best_params = study.best_params
    best_model = LGBMClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42,class_weight=class_weights_dict)

    # Log the best model with MLflow, passing the algo_name as "LightGBM"
    log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test)

# Run the experiment for LightGBM
run_optuna_experiment()

[I 2024-10-19 13:14:25,188] A new study created in memory with name: no-name-a5635b10-26db-4693-9b36-86a43be2a134


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.755771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:14:54,776] Trial 0 finished with value: 0.6370033112582781 and parameters: {'n_estimators': 286, 'learning_rate': 0.001889661700927994, 'max_depth': 10}. Best is trial 0 with value: 0.6370033112582781.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.633151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:15:04,097] Trial 1 finished with value: 0.5281456953642384 and parameters: {'n_estimators': 216, 'learning_rate': 0.00011892189705799174, 'max_depth': 4}. Best is trial 0 with value: 0.6370033112582781.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.499375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:15:14,032] Trial 2 finished with value: 0.5064845474613686 and parameters: {'n_estimators': 112, 'learning_rate': 0.0006683815139835946, 'max_depth': 3}. Best is trial 0 with value: 0.6370033112582781.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.915761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:15:32,621] Trial 3 finished with value: 0.6436258278145696 and parameters: {'n_estimators': 234, 'learning_rate': 0.005186165476646975, 'max_depth': 7}. Best is trial 3 with value: 0.6436258278145696.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.528729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:15:51,758] Trial 4 finished with value: 0.5590507726269316 and parameters: {'n_estimators': 203, 'learning_rate': 0.0010407194798604986, 'max_depth': 5}. Best is trial 3 with value: 0.6436258278145696.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.612474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:16:21,361] Trial 5 finished with value: 0.6368653421633554 and parameters: {'n_estimators': 207, 'learning_rate': 0.0026596295894336193, 'max_depth': 10}. Best is trial 3 with value: 0.6436258278145696.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.650497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:16:31,150] Trial 6 finished with value: 0.5491169977924945 and parameters: {'n_estimators': 60, 'learning_rate': 0.0017749569128772155, 'max_depth': 5}. Best is trial 3 with value: 0.6436258278145696.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.661543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:17:04,570] Trial 7 finished with value: 0.5816777041942605 and parameters: {'n_estimators': 280, 'learning_rate': 0.00018404266728357715, 'max_depth': 8}. Best is trial 3 with value: 0.6436258278145696.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.751940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:17:31,928] Trial 8 finished with value: 0.7644867549668874 and parameters: {'n_estimators': 198, 'learning_rate': 0.03372978708595947, 'max_depth': 10}. Best is trial 8 with value: 0.7644867549668874.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.763436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:17:58,406] Trial 9 finished with value: 0.6196192052980133 and parameters: {'n_estimators': 225, 'learning_rate': 0.002919031095872878, 'max_depth': 7}. Best is trial 8 with value: 0.7644867549668874.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.722231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:18:17,895] Trial 10 finished with value: 0.7713852097130243 and parameters: {'n_estimators': 145, 'learning_rate': 0.05618824518288721, 'max_depth': 9}. Best is trial 10 with value: 0.7713852097130243.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.788776 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:18:37,719] Trial 11 finished with value: 0.7784216335540839 and parameters: {'n_estimators': 142, 'learning_rate': 0.06503495725418992, 'max_depth': 9}. Best is trial 11 with value: 0.7784216335540839.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.687927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:18:55,106] Trial 12 finished with value: 0.7886313465783664 and parameters: {'n_estimators': 139, 'learning_rate': 0.08698553278295022, 'max_depth': 8}. Best is trial 12 with value: 0.7886313465783664.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.645765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:19:12,561] Trial 13 finished with value: 0.6790838852097131 and parameters: {'n_estimators': 138, 'learning_rate': 0.015154433963992686, 'max_depth': 8}. Best is trial 12 with value: 0.7886313465783664.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.707822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:19:25,229] Trial 14 finished with value: 0.7723509933774835 and parameters: {'n_estimators': 93, 'learning_rate': 0.09884037279368835, 'max_depth': 8}. Best is trial 12 with value: 0.7886313465783664.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.636113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:19:43,878] Trial 15 finished with value: 0.6663907284768212 and parameters: {'n_estimators': 155, 'learning_rate': 0.014105555542612747, 'max_depth': 6}. Best is trial 12 with value: 0.7886313465783664.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.472946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:19:58,432] Trial 16 finished with value: 0.6818432671081678 and parameters: {'n_estimators': 104, 'learning_rate': 0.0189417411986993, 'max_depth': 9}. Best is trial 12 with value: 0.7886313465783664.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.666150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:20:18,468] Trial 17 finished with value: 0.8064293598233996 and parameters: {'n_estimators': 170, 'learning_rate': 0.093899344957864, 'max_depth': 9}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.149466 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:20:39,100] Trial 18 finished with value: 0.7138520971302428 and parameters: {'n_estimators': 175, 'learning_rate': 0.028777571203363866, 'max_depth': 6}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.722517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:21:15,498] Trial 19 finished with value: 0.6659768211920529 and parameters: {'n_estimators': 252, 'learning_rate': 0.006151533694979623, 'max_depth': 8}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.927757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:21:35,288] Trial 20 finished with value: 0.6499724061810155 and parameters: {'n_estimators': 178, 'learning_rate': 0.007667581048504931, 'max_depth': 7}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.528256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:21:53,804] Trial 21 finished with value: 0.7927704194260485 and parameters: {'n_estimators': 134, 'learning_rate': 0.08899572615655597, 'max_depth': 9}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.508300 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:01,610] Trial 22 finished with value: 0.7864238410596026 and parameters: {'n_estimators': 122, 'learning_rate': 0.08555527749588246, 'max_depth': 9}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.467463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:06,767] Trial 23 finished with value: 0.720060706401766 and parameters: {'n_estimators': 81, 'learning_rate': 0.04638160288478234, 'max_depth': 9}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.415116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:15,865] Trial 24 finished with value: 0.742135761589404 and parameters: {'n_estimators': 169, 'learning_rate': 0.03325802835134618, 'max_depth': 8}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.424443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:22,916] Trial 25 finished with value: 0.7911147902869757 and parameters: {'n_estimators': 127, 'learning_rate': 0.09090460724669557, 'max_depth': 9}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.480985 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:30,902] Trial 26 finished with value: 0.6972958057395143 and parameters: {'n_estimators': 120, 'learning_rate': 0.019154591310912152, 'max_depth': 10}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.412923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:35,881] Trial 27 finished with value: 0.6321743929359823 and parameters: {'n_estimators': 50, 'learning_rate': 0.010018623543687483, 'max_depth': 9}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.469488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:42,954] Trial 28 finished with value: 0.7141280353200883 and parameters: {'n_estimators': 78, 'learning_rate': 0.03965608951879966, 'max_depth': 10}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.503872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-19 13:22:53,723] Trial 29 finished with value: 0.7941501103752759 and parameters: {'n_estimators': 187, 'learning_rate': 0.05971602490898236, 'max_depth': 10}. Best is trial 17 with value: 0.8064293598233996.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.415847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130918
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4418
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


2024/10/19 13:23:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run LightGBM_BalanceWeight_TFIDF_Trigrams at: http://127.0.0.1:5000/#/experiments/14/runs/90bd98917a584c2992fad5c2442ab659.
2024/10/19 13:23:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/14.
