<a href="https://colab.research.google.com/github/raj-coding1/comment-analysis/blob/main/Exp_5_logisticregression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install mlflow dvc dagshub optuna



In [10]:
import pandas as pd
# from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import mlflow
import optuna
import dvc
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

In [11]:
import os
df = pd.read_csv('/content/df.csv')
df.head()
df.dropna(axis = 0,inplace = True)
ngram_choice= (1,3)
max_feat = 1000
vectorizer = TfidfVectorizer(
            max_features=max_feat,
            ngram_range=ngram_choice
        )
X = vectorizer.fit_transform(df['cleaned_comment'])
# df["category"] = df["category"].replace(-1, 2)
y = df['category']
# y.replace(-1,2)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
# import dagshub
#
import dagshub
mlflow.set_tracking_uri('https://dagshub.com/raj-coding1/youtube-comment-analysis.mlflow')
dagshub.init(repo_owner='raj-coding1', repo_name='youtube-comment-analysis', mlflow=True)

# dagshub.init(repo_owner='raj-coding1', repo_name='youtube-comment-analysis', mlflow=True)

mlflow.set_experiment('EXP-10: 7_algorithm')


def log_ml_model(model_name, model, X_train, X_test, y_train, y_test):

    try:
        with mlflow.start_run():

            # Tags
            mlflow.set_tag('mlflow.runname', f'{model_name}_tfidf_smote_trigram')
            mlflow.set_tag("experiment_type", "algorithm_comparison")
            # mlflow.set_tag("algo", model_name)
            mlflow.log_param("algo_name", model_name)

            # Train model
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            mlflow.log_metric("accuracy", accuracy)

            # Classification report
            class_rep = classification_report(y_test, y_pred, output_dict=True)
            for label, info in class_rep.items():
                if isinstance(info, dict):
                    for metric, value in info.items():
                        mlflow.log_metric(f"{label}_{metric}", value)

            # Confusion matrix
            conf_matrix = confusion_matrix(y_test, y_pred)
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
            plt.xlabel("Prediction")
            plt.ylabel("Actual")
            plt.title("Confusion Matrix")

            # Save artifact CORRECTLY
            plt.savefig("ConfusionMatrix.png")
            mlflow.log_artifact("ConfusionMatrix.png",artifact_path= "plots")  # FIXED
            plt.close()

            # Save dataset
            df.to_csv("df1.csv", index=False)
            mlflow.log_artifact("df1.csv")  # FIXED PATH
            mlflow.autolog(disable=True)

            # Log the model (Dagshub supports tracking model artifacts)
            # mlflow.sklearn.log_model(model, f"{model_name}_model")
            # mlflow.sklearn.log_model(model, artifact_path=f"models/{model_name}", registered_model_name=None)


            print(f"accuracy: {accuracy}")

    except Exception as e:
        print("MLflow logging failed:", e)


def objective(trial):

    # hyperparameters to tune
    C = trial.suggest_loguniform("C", 1e-3, 1e2)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    solver = trial.suggest_categorical("solver", ["liblinear", "saga"])
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    # To avoid incompatible combinations:
    if penalty == "l1" and solver not in ["liblinear", "saga"]:
        raise optuna.exceptions.TrialPruned()

    if penalty == "l2" and solver not in ["liblinear", "lbfgs", "saga"]:
        raise optuna.exceptions.TrialPruned()

    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        class_weight=class_weight,
        max_iter=2000
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    return acc

def run_optuna_exp():
  study = optuna.create_study(direction = 'maximize')
  study.optimize(objective,n_trials = 30)
  best = study.best_params

  best_model = LogisticRegression(
      C=best["C"],
      penalty=best["penalty"],
      solver=best["solver"],
      class_weight=best["class_weight"],
      max_iter=2000
    )



  log_ml_model('LogisticRegression', best_model, X_train,X_test,y_train,y_test)
run_optuna_exp()

[I 2025-11-30 11:27:46,444] A new study created in memory with name: no-name-36db9169-8f91-40f6-9234-ad9924c2654e
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-11-30 11:27:47,034] Trial 0 finished with value: 0.798854493386063 and parameters: {'C': 1.7504063688035354, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': None}. Best is trial 0 with value: 0.798854493386063.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-11-30 11:31:49,697] Trial 1 finished with value: 0.786035728896768 and parameters: {'C': 12.671225211722687, 'penalty': 'l1', 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.798854493386063.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-11-30 11:31:50,132] Trial 2 finished with value: 0.5727533069684986 and parameters: {'C': 0.00417803923587362, 'penalty': 'l2', 'solver': 'saga', 'class_weight': None}. Best is trial 0 with value: 0.798854493386063.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-11-30 11:31

accuracy: 0.8028092186008455
üèÉ View run mysterious-mouse-386 at: https://dagshub.com/raj-coding1/youtube-comment-analysis.mlflow/#/experiments/9/runs/6bead0257097439b88cd7c710f573dff
üß™ View experiment at: https://dagshub.com/raj-coding1/youtube-comment-analysis.mlflow/#/experiments/9
