<a href="https://colab.research.google.com/github/raj-coding1/comment-analysis/blob/main/Exp_5_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mlflow dvc dagshub optuna



In [None]:
import pandas as pd
# from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import mlflow
import optuna
import dvc
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the comment dataset from the Colab working directory.
df = pd.read_csv('/content/df.csv')
# The return value is not used here; it only displays when it is the last
# expression of a notebook cell.
df.head()
# Drop every row that contains a missing value in any column.
df.dropna(axis = 0,inplace = True)
# TF-IDF configuration: unigrams through trigrams, capped at 1000 features.
ngram_choice= (1,3)
max_feat = 1000
vectorizer = TfidfVectorizer(
            max_features=max_feat,
            ngram_range=ngram_choice
        )
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split below, so vocabulary and IDF statistics also see the test
# rows. Consider fitting on the training text only.
X = vectorizer.fit_transform(df['cleaned_comment'])
# Remap label -1 to 2 (presumably so class labels are non-negative for
# classifiers that require it -- confirm).
df["category"] = df["category"].replace(-1, 2)
y = df['category']
# The remap above is already applied on the DataFrame column, so `y` needs no
# further replacement.
# 80/20 split with a fixed seed for reproducibility (not stratified).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
import dagshub
# Point MLflow at the DagsHub-hosted tracking server for this repository.
mlflow.set_tracking_uri('https://dagshub.com/raj-coding1/youtube-comment-analysis.mlflow')
dagshub.init(repo_owner='raj-coding1', repo_name='youtube-comment-analysis', mlflow=True)

# All runs started after this call are recorded under this experiment.
mlflow.set_experiment('EXP-10: 7_algorithm')


def log_ml_model(model_name, model, X_train,X_test,y_train,y_test):
    """Train ``model``, evaluate it on the test split and log the run to MLflow.

    Inside a single MLflow run this fits the model, logs the overall accuracy
    and every per-label / averaged entry of the classification report as
    metrics, and uploads a confusion-matrix image plus a CSV snapshot of the
    module-level ``df`` as artifacts.

    Args:
        model_name: Short algorithm name; used in the run name and logged as
            the ``algo_name`` parameter.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. The accuracy is printed and logged to MLflow.
    """
    cm_path = "ConfusionMatrix.png"
    data_path = "df1.csv"

    # Name the run through start_run(run_name=...). The previous
    # 'mlflow.runname' tag did not match MLflow's case-sensitive
    # 'mlflow.runName' key, so it was stored as an ordinary tag and did not
    # name the run.
    with mlflow.start_run(run_name=f'{model_name}_tfidf_smote_trigram'):
        mlflow.set_tag("experiment_type", "algorithm comparison")
        mlflow.log_param("algo_name", model_name)

        # Train and evaluate.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Flatten the report into "<label>_<metric>" entries. Scalar entries
        # (the report's own "accuracy") are skipped because accuracy is
        # already recorded under its own key.
        class_rep = classification_report(y_test, y_pred, output_dict=True)
        metrics = {"accuracy": accuracy}
        for label, info in class_rep.items():
            if isinstance(info, dict):
                for metric, value in info.items():
                    metrics[f"{label}_{metric}"] = value
        # One batched call instead of one request per metric.
        mlflow.log_metrics(metrics)

        # Draw on a dedicated figure so leftovers from an earlier plot in the
        # notebook cannot bleed into the image, and always release it.
        conf_matrix = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots()
        try:
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_xlabel("Prediction")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            fig.savefig(cm_path)
        finally:
            plt.close(fig)
        # Log the same path that was written; the old hard-coded "/content/"
        # prefix only worked when the working directory was /content.
        mlflow.log_artifact(cm_path)

        # Snapshot of the dataset used for this run (reads the global ``df``).
        df.to_csv(data_path, index=False)
        mlflow.log_artifact(data_path)
        # mlflow.sklearn.log_model(model,f'{model_name}_model')
        print(f"accuracy : {accuracy}")


def objective(trial):
    """Optuna objective: fit an SVC with sampled hyperparameters and score it.

    Samples ``C`` (log scale, 1e-3..1e2) and the kernel (``linear`` or
    ``rbf``); ``gamma`` (log scale, 1e-4..1e-1) is sampled only for the rbf
    kernel, so trials with the linear kernel have no ``gamma`` parameter.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Accuracy of the fitted model on the module-level ``X_test``/``y_test``.

    NOTE(review): the score is computed on the test split, so hyperparameters
    are selected against the same data later used for the final report.
    Consider a separate validation split or cross-validation on the training
    data.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    C = trial.suggest_float("C", 1e-3, 1e2, log=True)
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])

    if kernel == "rbf":
        gamma = trial.suggest_float("gamma", 1e-4, 1e-1, log=True)
    else:
        gamma = "scale"  # Linear kernel ignores gamma

    model = SVC(C=C, kernel=kernel, gamma=gamma)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))
def run_optuna_exp():
    """Run a 30-trial Optuna study for the SVM and log the best model.

    Maximizes the value returned by ``objective``, rebuilds an ``SVC`` from the
    best parameters and hands it to ``log_ml_model`` for training, evaluation
    and MLflow logging.
    """
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=30)
    best_params = study.best_params

    best_model = SVC(
        C=best_params['C'],
        kernel=best_params['kernel'],
        # ``objective`` only suggests gamma for the rbf kernel, so the key is
        # absent when a linear trial wins; fall back to the same "scale"
        # value the objective uses instead of raising KeyError.
        gamma=best_params.get('gamma', 'scale'),
    )
    log_ml_model('SVM', best_model, X_train, X_test, y_train, y_test)
# Run the hyperparameter search and log the best SVM; this executes as soon
# as the cell runs.
run_optuna_exp()

[I 2025-11-30 10:39:17,604] A new study created in memory with name: no-name-2e1b2fbc-63ab-4bbf-8c59-fe29ed55f4c8
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-11-30 10:41:06,867] Trial 0 finished with value: 0.4434747033956089 and parameters: {'C': 0.005948321038869683, 'kernel': 'linear'}. Best is trial 0 with value: 0.4434747033956089.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-11-30 10:42:49,714] Trial 1 finished with value: 0.6594845220237283 and parameters: {'C': 0.024364453391820414, 'kernel': 'linear'}. Best is trial 1 with value: 0.6594845220237283.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
  gamma = trial.suggest_loguniform("gamma", 1e-4, 1e-1)
[I 2025-11-30 10:44:55,287] Trial 2 finished with value: 0.4331105959361789 and parameters: {'C': 0.3738178869604331, 'kernel': 'rbf', 'gamma': 0.0008694427645162942}. Best is trial 1 with value: 0.6594845220237283.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
  gamma = trial.suggest_loguniform("gamma", 1e-4