In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
import optuna
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import dagshub

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dagshub.init(repo_owner='satyajeetrai007', repo_name='Youtube-Comment-Sentiment-Analysis', mlflow=True)
mlflow.set_experiment("Exp 6 - lightGBM Detailed HyperParameter Tuning maxFeature = 10000 class_weights instead of SMOTE")

df = pd.read_csv('data_preprocessed.csv').dropna().drop_duplicates()
df.shape

(36243, 2)

In [3]:
# Step 1: Remap the class labels from [-1, 0, 1] to [2, 0, 1]
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

# Step 2: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

In [4]:

# Step 1: Train-test split (Applied FIRST to prevent data leakage)
# Splitting the raw text data before any processing.
X_text = df['clean_comment']
y = df['category']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: TF-IDF vectorizer setup
# The vectorizer is now fitted ONLY on the training data.
ngram_range = (1, 3)
max_features = 10000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train = vectorizer.fit_transform(X_train_text)
# The test data is transformed using the vocabulary learned from the training data.
X_test = vectorizer.transform(X_test_text)

# SMOTE step has been removed as requested.

# The log_mlflow function remains the same.
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params, trial_number):
    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"Trial_{trial_number}_{model_name}_ClassWeight_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")
        mlflow.log_param("algo_name", model_name)
        for key, value in params.items():
            mlflow.log_param(key, value)
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)
        
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        mlflow.sklearn.log_model(model, f"{model_name}_model")
        return accuracy

# Step 3: Optuna objective function for LightGBM
# Updated to use class_weight='balanced' instead of SMOTE
def objective_lightgbm(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        'class_weight': 'balanced'  # Use inbuilt class weighting
    }

    model = LGBMClassifier(**params, random_state=42)

    accuracy = log_mlflow("LightGBM", model, X_train, X_test, y_train, y_test, params, trial.number)
    return accuracy

# Step 4: Run Optuna for LightGBM
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_lightgbm, n_trials=100)

    best_params = study.best_params
    # Ensure the best model also uses the balanced class weight
    best_params['class_weight'] = 'balanced' 

    best_model = LGBMClassifier(**best_params, random_state=42)

    log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test, best_params, "Best")

    print("\nBest Parameters:")
    print(best_params)

    optuna.visualization.plot_param_importances(study).show()
    optuna.visualization.plot_optimization_history(study).show()

# Run the experiment
run_optuna_experiment()

[I 2025-08-27 20:46:17,636] A new study created in memory with name: no-name-570f5e5b-1170-4299-a813-ebc77815fe71


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.570983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 116043
[LightGBM] [Info] Number of data points in the train set: 28994, number of used features: 2826
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_0_LightGBM_ClassWeight_TFIDF_Trigrams at: https://dagshub.com/satyajeetrai007/Youtube-Comment-Sentiment-Analysis.mlflow/#/experiments/9/runs/c36c042f3e10441f96445d20f2a87ebf
🧪 View experiment at: https://dagshub.com/satyajeetrai007/Youtube-Comment-Sentiment-Analysis.mlflow/#/experiments/9


[W 2025-08-27 20:52:07,005] Trial 0 failed with parameters: {'n_estimators': 956, 'learning_rate': 0.0006241381246684774, 'max_depth': 4, 'num_leaves': 28, 'min_child_samples': 31, 'colsample_bytree': 0.503266512994806, 'subsample': 0.9554206577018688, 'reg_alpha': 0.0007781633504078582, 'reg_lambda': 0.2946732736284399} because of the following error: RestException("INTERNAL_ERROR: Response: {'error': 'unsupported endpoint, please contact support@dagshub.com'}").
Traceback (most recent call last):
  File "c:\Users\satya\OneDrive\Desktop\youtube-comment-senitment-analysis\.venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\satya\AppData\Local\Temp\ipykernel_9692\2745362617.py", line 62, in objective_lightgbm
    accuracy = log_mlflow("LightGBM", model, X_train, X_test, y_train, y_test, params, trial.number)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

RestException: INTERNAL_ERROR: Response: {'error': 'unsupported endpoint, please contact support@dagshub.com'}