<a href="https://colab.research.google.com/github/raj-coding1/comment-analysis/blob/main/Exp_5_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm --quiet


In [None]:
!pip install mlflow dvc dagshub optuna



In [None]:
import pandas as pd
# from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import mlflow
import optuna
import dvc
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the comments dataset and drop every row with a missing value.
df = pd.read_csv('/content/df.csv')
df.dropna(axis=0, inplace=True)

# TF-IDF settings: unigrams through trigrams, at most `max_feat` features.
ngram_choice = (1, 3)
max_feat = 1000
vectorizer = TfidfVectorizer(
    max_features=max_feat,
    ngram_range=ngram_choice,
)

# Relabel class -1 as 2.
# NOTE(review): presumably so the labels are non-negative integers for
# XGBoost; assumes the raw categories are -1/0/1 -- confirm.
df["category"] = df["category"].replace(-1, 2)
y = df['category']

# Pick the train/test rows BEFORE fitting the vectorizer so the vocabulary and
# IDF weights are learned from training comments only; fitting on the whole
# corpus would leak test-set statistics into the features.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
vectorizer.fit(df['cleaned_comment'].iloc[train_idx])

# Full feature matrix in the train-fitted vocabulary, then sliced per split.
# Positional indexing (`iloc`) is required because `dropna` leaves index gaps.
X = vectorizer.transform(df['cleaned_comment'])
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Point MLflow at the DagsHub-hosted tracking server and select the experiment.
import dagshub
mlflow.set_tracking_uri('https://dagshub.com/raj-coding1/youtube-comment-analysis.mlflow')
dagshub.init(repo_owner='raj-coding1', repo_name='youtube-comment-analysis', mlflow=True)

mlflow.set_experiment('EXP-10: 7_algorithm')


def log_ml_model(model_name, model, X_train,X_test,y_train,y_test):
    """Fit ``model``, evaluate it on the test split, and log the run to MLflow.

    Inside a single MLflow run this logs: the ``algo_name`` parameter, the
    test accuracy, every per-label / per-average metric from
    ``classification_report``, a confusion-matrix PNG, and a CSV snapshot of
    the module-level ``df``. The accuracy is also printed.

    Args:
        model_name: Label used in the run name and the ``algo_name`` param.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for every metric.

    Returns:
        None.
    """
    # NOTE(review): the run name mentions "smote", but no resampling happens
    # in this function -- confirm it is applied upstream or rename the run.
    run_name = f'{model_name}_tfidf_smote_trigram'

    # Pass the name to start_run: a tag keyed 'mlflow.runname' is not the key
    # MLflow reads ('mlflow.runName'), so the run kept an auto-generated name.
    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag("experiment_type", "algorithm comparison")
        mlflow.log_param("algo_name", model_name)

        # Train and score on the held-out split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Flatten the nested report into "<label>_<metric>" entries. The
        # top-level "accuracy" entry is a bare float, not a dict, so it is
        # skipped here (it is already logged above).
        class_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, info in class_rep.items():
            if isinstance(info, dict):
                for metric, value in info.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Draw the confusion matrix on its own figure so it cannot pick up
        # state left on the implicit current figure by earlier plotting.
        conf_matrix = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel("Prediction")
        ax.set_ylabel("Actual")
        ax.set_title("Confusion Matrix")

        # Log exactly the path that was written, rather than assuming the
        # working directory is /content.
        cm_path = "ConfusionMatrix.png"
        fig.savefig(cm_path)
        plt.close(fig)
        mlflow.log_artifact(cm_path)

        # Snapshot the dataset used for this run (reads the global `df`).
        data_path = "df1.csv"
        df.to_csv(data_path, index=False)
        mlflow.log_artifact(data_path)
        # mlflow.sklearn.log_model(model,f'{model_name}_model')
        print(f"accuracy : {accuracy}")


def objective(trial):
    """Optuna objective: validation accuracy of an XGBoost model for one trial.

    Samples ``n_estimators``, ``learning_rate`` (log scale) and ``max_depth``,
    fits on part of the module-level training split, and returns the accuracy
    on a validation hold-out carved from that same training split.

    The test split is deliberately not touched here: choosing hyperparameters
    on ``X_test`` would bias the accuracy later reported on it.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Accuracy on the validation hold-out (higher is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
    }

    # Fixed seed so every trial is scored on the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model = XGBClassifier(**params, random_state=42)
    model.fit(X_fit, y_fit)
    return accuracy_score(y_val, model.predict(X_val))

def run_optuna_exp(n_trials=30):
    """Tune XGBoost with Optuna, then train and log the best configuration.

    Runs a maximising study over ``objective``, rebuilds an ``XGBClassifier``
    from the best hyperparameters, and hands it to ``log_ml_model`` together
    with the module-level train/test split.

    Args:
        n_trials: Number of Optuna trials to run (defaults to 30).

    Returns:
        None.
    """
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    # Reuse the search's random_state so the logged model is the same
    # configuration that was scored during tuning.
    best_model = XGBClassifier(**study.best_params, random_state=42)
    log_ml_model('XGBClassifier', best_model, X_train, X_test, y_train, y_test)


# Entry point of this cell: run the Optuna search and log the best
# XGBoost model to MLflow.
run_optuna_exp()

[I 2025-11-30 11:49:13,389] A new study created in memory with name: no-name-99e53754-052a-4644-8688-c16966176c35
[I 2025-11-30 11:49:50,795] Trial 0 finished with value: 0.4331105959361789 and parameters: {'n_estimators': 51, 'learning_rate': 0.0010149688763275483, 'max_depth': 4}. Best is trial 0 with value: 0.4331105959361789.
[I 2025-11-30 11:52:31,076] Trial 1 finished with value: 0.6416200736397109 and parameters: {'n_estimators': 124, 'learning_rate': 0.0024368883307030894, 'max_depth': 8}. Best is trial 1 with value: 0.6416200736397109.
[I 2025-11-30 11:52:50,223] Trial 2 finished with value: 0.4331105959361789 and parameters: {'n_estimators': 86, 'learning_rate': 0.00014754080316191557, 'max_depth': 3}. Best is trial 1 with value: 0.6416200736397109.
[I 2025-11-30 11:59:19,496] Trial 3 finished with value: 0.4329742261011864 and parameters: {'n_estimators': 209, 'learning_rate': 0.0005104634309555274, 'max_depth': 10}. Best is trial 1 with value: 0.6416200736397109.
[I 2025-11

accuracy : 0.7932633301513705
🏃 View run funny-eel-267 at: https://dagshub.com/raj-coding1/youtube-comment-analysis.mlflow/#/experiments/9/runs/70b03668ec7f4e198664c4ebd6c7b9ae
🧪 View experiment at: https://dagshub.com/raj-coding1/youtube-comment-analysis.mlflow/#/experiments/9
