In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
dataset = pd.read_csv('processed.csv')

# Drop rows with NaN values in 'clean_comment'
cleaned_dataset = dataset.dropna()

In [3]:
# Separate features and target
X_cleaned = cleaned_dataset['clean_comment']
y_cleaned = cleaned_dataset['category']

In [4]:
# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

In [5]:
# Apply TfidfVectorizer with trigram setting and max_features=1000
tfidf_cleaned = TfidfVectorizer(ngram_range=(1, 3), max_features=1000)

In [6]:
# Fit the vectorizer on the training data and transform both train and test sets
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned)
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned)

In [7]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Function to optimize LightGBM hyperparameters
def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        "objective": "multiclass",
        "num_class": 3,  # Assuming 3 categories (-1, 0, 1)
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "metric": "multi_logloss",
        "is_unbalance": True,
        "class_weight": "balanced",
    }

    # Define the LightGBM model with the trial parameters
    model = lgb.LGBMClassifier(**param)

    # Perform cross-validation
    scores = cross_val_score(model, X_train_tfidf_cleaned, y_train_cleaned, cv=3, scoring='accuracy')

    # Return the average score across folds
    return scores.mean()

In [9]:
# Create an Optuna study to optimize the hyperparameters
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-05 19:52:04,436] A new study created in memory with name: no-name-d3114a62-c277-4ddc-b1f9-e3804c94fb98


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:52:19,699] Trial 0 finished with value: 0.7825411459100621 and parameters: {'learning_rate': 0.026615516530715175, 'n_estimators': 386, 'max_depth': 15}. Best is trial 0 with value: 0.7825411459100621.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:52:26,321] Trial 1 finished with value: 0.7869213898776793 and parameters: {'learning_rate': 0.09677504621375667, 'n_estimators': 167, 'max_depth': 16}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:52:31,523] Trial 2 finished with value: 0.7556045751494035 and parameters: {'learning_rate': 0.04602298041019203, 'n_estimators': 130, 'max_depth': 10}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:52:40,631] Trial 3 finished with value: 0.7858176620997682 and parameters: {'learning_rate': 0.09506418608389319, 'n_estimators': 250, 'max_depth': 13}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:52:51,520] Trial 4 finished with value: 0.7805407189410368 and parameters: {'learning_rate': 0.0468566967655681, 'n_estimators': 366, 'max_depth': 8}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018438

[I 2024-11-05 19:53:03,719] Trial 5 finished with value: 0.7848519640860664 and parameters: {'learning_rate': 0.07722750698370884, 'n_estimators': 453, 'max_depth': 8}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018912

[I 2024-11-05 19:53:12,233] Trial 6 finished with value: 0.7864385265957021 and parameters: {'learning_rate': 0.07151682777652317, 'n_estimators': 199, 'max_depth': 15}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:53:24,916] Trial 7 finished with value: 0.7740566211461442 and parameters: {'learning_rate': 0.032348086042726744, 'n_estimators': 468, 'max_depth': 7}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021990 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:53:36,969] Trial 8 finished with value: 0.739359799131844 and parameters: {'learning_rate': 0.012188311868385274, 'n_estimators': 269, 'max_depth': 13}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018075 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:53:38,718] Trial 9 finished with value: 0.7007654608182046 and parameters: {'learning_rate': 0.06308223197916596, 'n_estimators': 69, 'max_depth': 5}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018065 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:53:46,815] Trial 10 finished with value: 0.7863005218129638 and parameters: {'learning_rate': 0.09785132609044482, 'n_estimators': 171, 'max_depth': 20}. Best is trial 1 with value: 0.7869213898776793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:53:55,811] Trial 11 finished with value: 0.7879560617324978 and parameters: {'learning_rate': 0.07501998453296234, 'n_estimators': 199, 'max_depth': 18}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:53:58,996] Trial 12 finished with value: 0.771573313051234 and parameters: {'learning_rate': 0.085107412058568, 'n_estimators': 60, 'max_depth': 19}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:54:05,771] Trial 13 finished with value: 0.7839552613261707 and parameters: {'learning_rate': 0.06403347445943118, 'n_estimators': 145, 'max_depth': 17}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017839 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:54:14,864] Trial 14 finished with value: 0.7857141959849209 and parameters: {'learning_rate': 0.0901728213532136, 'n_estimators': 210, 'max_depth': 17}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019692 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:54:28,658] Trial 15 finished with value: 0.7843690508411475 and parameters: {'learning_rate': 0.07995222687311124, 'n_estimators': 333, 'max_depth': 17}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018969 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:54:34,105] Trial 16 finished with value: 0.7772986343557751 and parameters: {'learning_rate': 0.059242270903776886, 'n_estimators': 107, 'max_depth': 19}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:54:45,770] Trial 17 finished with value: 0.7837481970515583 and parameters: {'learning_rate': 0.09754631534754078, 'n_estimators': 307, 'max_depth': 14}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:54:53,776] Trial 18 finished with value: 0.7846105181699516 and parameters: {'learning_rate': 0.07163516246623698, 'n_estimators': 223, 'max_depth': 11}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:55:01,192] Trial 19 finished with value: 0.7864384516512896 and parameters: {'learning_rate': 0.08587907249967001, 'n_estimators': 168, 'max_depth': 17}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:55:06,446] Trial 20 finished with value: 0.7757810742375084 and parameters: {'learning_rate': 0.0554120561010739, 'n_estimators': 105, 'max_depth': 20}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019447 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019369

[I 2024-11-05 19:55:14,588] Trial 21 finished with value: 0.7863005503632162 and parameters: {'learning_rate': 0.07047551492681323, 'n_estimators': 195, 'max_depth': 15}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:55:24,413] Trial 22 finished with value: 0.7858521864925329 and parameters: {'learning_rate': 0.07381629358784042, 'n_estimators': 242, 'max_depth': 15}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:55:37,232] Trial 23 finished with value: 0.7835413540414025 and parameters: {'learning_rate': 0.08365390255500763, 'n_estimators': 296, 'max_depth': 18}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:55:45,533] Trial 24 finished with value: 0.7861280925633444 and parameters: {'learning_rate': 0.06882267627647963, 'n_estimators': 185, 'max_depth': 16}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017599

[I 2024-11-05 19:55:51,288] Trial 25 finished with value: 0.7824376655200885 and parameters: {'learning_rate': 0.08858208632001703, 'n_estimators': 146, 'max_depth': 12}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:56:01,011] Trial 26 finished with value: 0.7793335393234048 and parameters: {'learning_rate': 0.04121036644591612, 'n_estimators': 223, 'max_depth': 14}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021371

[I 2024-11-05 19:56:05,889] Trial 27 finished with value: 0.7692280382893145 and parameters: {'learning_rate': 0.053573408789554244, 'n_estimators': 97, 'max_depth': 18}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019924 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019162

[I 2024-11-05 19:56:17,626] Trial 28 finished with value: 0.7848519854987558 and parameters: {'learning_rate': 0.07783918371714749, 'n_estimators': 276, 'max_depth': 16}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:56:33,874] Trial 29 finished with value: 0.7590535241575106 and parameters: {'learning_rate': 0.0163699442522455, 'n_estimators': 408, 'max_depth': 10}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:56:40,981] Trial 30 finished with value: 0.78609361456474 and parameters: {'learning_rate': 0.09362932304973583, 'n_estimators': 177, 'max_depth': 14}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:56:47,813] Trial 31 finished with value: 0.7856797465365689 and parameters: {'learning_rate': 0.08360194168055562, 'n_estimators': 158, 'max_depth': 16}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:56:56,857] Trial 32 finished with value: 0.7855417810104277 and parameters: {'learning_rate': 0.09962550156012528, 'n_estimators': 201, 'max_depth': 19}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017696

[I 2024-11-05 19:57:02,889] Trial 33 finished with value: 0.7861971092298398 and parameters: {'learning_rate': 0.09174836845109054, 'n_estimators': 129, 'max_depth': 18}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:57:13,052] Trial 34 finished with value: 0.7867144183913873 and parameters: {'learning_rate': 0.06550941213470879, 'n_estimators': 238, 'max_depth': 16}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:57:22,594] Trial 35 finished with value: 0.7868868619161331 and parameters: {'learning_rate': 0.06403960367340786, 'n_estimators': 252, 'max_depth': 13}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:57:32,469] Trial 36 finished with value: 0.7779194631638934 and parameters: {'learning_rate': 0.04016453602308126, 'n_estimators': 256, 'max_depth': 12}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:57:43,755] Trial 37 finished with value: 0.7856452364189304 and parameters: {'learning_rate': 0.06417957458988564, 'n_estimators': 334, 'max_depth': 11}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018102 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:57:53,513] Trial 38 finished with value: 0.7834379093392444 and parameters: {'learning_rate': 0.04962104619762131, 'n_estimators': 243, 'max_depth': 13}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026004 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018801 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:57:56,766] Trial 39 finished with value: 0.74994825266745 and parameters: {'learning_rate': 0.059615010483459206, 'n_estimators': 292, 'max_depth': 3}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018807

[I 2024-11-05 19:58:12,300] Trial 40 finished with value: 0.7832308771836661 and parameters: {'learning_rate': 0.07686530450181815, 'n_estimators': 386, 'max_depth': 15}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:58:21,345] Trial 41 finished with value: 0.7842656311204603 and parameters: {'learning_rate': 0.05802381030208648, 'n_estimators': 227, 'max_depth': 13}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019353

[I 2024-11-05 19:58:31,994] Trial 42 finished with value: 0.7863005396568717 and parameters: {'learning_rate': 0.06893515989395889, 'n_estimators': 269, 'max_depth': 15}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:58:40,660] Trial 43 finished with value: 0.7860936502525555 and parameters: {'learning_rate': 0.06492054579105995, 'n_estimators': 204, 'max_depth': 16}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017985 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:58:53,283] Trial 44 finished with value: 0.7865074790241295 and parameters: {'learning_rate': 0.04708421923061671, 'n_estimators': 322, 'max_depth': 14}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosin

[I 2024-11-05 19:59:07,050] Trial 45 finished with value: 0.7827481209651356 and parameters: {'learning_rate': 0.03266482723333494, 'n_estimators': 339, 'max_depth': 14}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:59:18,139] Trial 46 finished with value: 0.78081662858063 and parameters: {'learning_rate': 0.0445164040907315, 'n_estimators': 356, 'max_depth': 9}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:59:33,467] Trial 47 finished with value: 0.786576445727683 and parameters: {'learning_rate': 0.05041984840182025, 'n_estimators': 412, 'max_depth': 13}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025009 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] St

[I 2024-11-05 19:59:50,015] Trial 48 finished with value: 0.7858176656685498 and parameters: {'learning_rate': 0.053910979599232814, 'n_estimators': 451, 'max_depth': 13}. Best is trial 11 with value: 0.7879560617324978.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018431 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57751
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58038
[LightGBM] [Info] Number of data points in the train set: 19329, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017815

[I 2024-11-05 20:00:01,622] Trial 49 finished with value: 0.7699868361923556 and parameters: {'learning_rate': 0.030453031972177233, 'n_estimators': 428, 'max_depth': 7}. Best is trial 11 with value: 0.7879560617324978.


In [10]:
# Extract the best hyperparameters
best_params = study.best_params
best_params

{'learning_rate': 0.07501998453296234, 'n_estimators': 199, 'max_depth': 18}

In [11]:
best_model = lgb.LGBMClassifier(

    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance= True,
    class_weight= "balanced",
    reg_alpha= 0.1,  # L1 regularization
    reg_lambda= 0.1,  # L2 regularization
    learning_rate= 0.08,
    max_depth= 20,
    n_estimators=367
)

In [12]:
# Fit the model on the resampled training data
best_model.fit(X_train_tfidf_cleaned, y_train_cleaned)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80971
[LightGBM] [Info] Number of data points in the train set: 28994, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [13]:
# Predict on the train set
y_train_pred = best_model.predict(X_train_tfidf_cleaned)

In [14]:
# Calculate accuracy on the test set
accuracy_train = accuracy_score(y_train_cleaned, y_train_pred)
accuracy_train

0.8761467889908257

In [15]:
# Generate classification report
report_train = classification_report(y_train_cleaned, y_train_pred)
print(report_train)

              precision    recall  f1-score   support

          -1       0.87      0.81      0.84      6543
           0       0.80      0.98      0.88      9984
           1       0.97      0.83      0.89     12467

    accuracy                           0.88     28994
   macro avg       0.88      0.87      0.87     28994
weighted avg       0.89      0.88      0.88     28994



In [16]:
# Predict on the test set
y_pred = best_model.predict(X_test_tfidf_cleaned)

In [17]:
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

0.7874189543385295

In [18]:
# Generate classification report
report = classification_report(y_test_cleaned, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.70      0.65      0.68      1657
           0       0.75      0.93      0.83      2393
           1       0.87      0.75      0.81      3199

    accuracy                           0.79      7249
   macro avg       0.78      0.78      0.77      7249
weighted avg       0.79      0.79      0.79      7249



In [None]:
import re
import numpy as np

# Assuming you have pre-trained tfidf_vectorizer and lgbm_model loaded
# tfidf_vectorizer: Your trained TF-IDF vectorizer
# lgbm_model: Your trained LightGBM model

# Function to clean and preprocess a YouTube comment (same as used during training)
def preprocess_comment(comment):
    # Lowercasing
    comment = comment.lower()

    # Remove special characters, URLs, punctuation, and extra spaces
    comment = re.sub(r"http\S+|www\S+|https\S+", '', comment, flags=re.MULTILINE)  # Remove URLs
    comment = re.sub(r'\W', ' ', comment)  # Remove special characters
    comment = re.sub(r'\s+', ' ', comment).strip()  # Remove extra spaces and newlines

    return comment

# Prediction function
def predict_sentiment(comment, tfidf_vectorizer, lgbm_model):
    # Step 1: Preprocess the YouTube comment
    cleaned_comment = preprocess_comment(comment)

    # Step 2: Transform the comment using the trained TF-IDF vectorizer
    comment_tfidf = tfidf_vectorizer.transform([cleaned_comment])

    # Step 3: Use the trained LightGBM model to predict the sentiment
    prediction = lgbm_model.predict(comment_tfidf)
    prediction_proba = lgbm_model.predict_proba(comment_tfidf)

    # Step 4: Get the predicted sentiment (label) and probability
    sentiment_class = np.argmax(prediction_proba)
    sentiment_proba = np.max(prediction_proba)

    # Step 5: Return the sentiment label and confidence
    return {
        'sentiment_class': int(prediction[0]),  # -1, 0, or 1 depending on your labels
        'confidence': sentiment_proba
    }

# Example usage:
comment1 = "I absolutely hate this video!"
comment2 = "The explanations were confusing and the video quality was poor."
comment3 = "I didn’t learn anything useful. Really disappointed."
comment4 = "Wow, the explanation was so clear and helpful. Definitely subscribing!"
comment5 = "This is the worst video I’ve seen on this topic, very misleading"
comment6 = "Not much to say about this, just a standard video."
comment7 = "The video is okay, but I expected more depth in the content."
comment8 = "Superb content! Mazaa aa gaya dekh ke. Best video on this topic!"
comment9 = "Poor video quality aur explanation bhi weak tha."
comment10 = "Yeh video theek tha, but I was expecting more depth."

result = predict_sentiment(comment10, tfidf_cleaned, best_model)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")

Predicted Sentiment: 0, Confidence: 0.7668742382112842
