<a href="https://colab.research.google.com/github/rafaelperales05/Comment-Sentiment-Analysis/blob/main/experiment_5_diff_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mlflow boto3 awscli optuna xgboost imbalanced-learn

Collecting mlflow
  Downloading mlflow-3.2.0-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.40.5-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.42.5-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow-skinny==3.2.0 (from mlflow)
  Downloading mlflow_skinny-3.2.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-tracing==3.2.0 (from mlflow)
  Downloading mlflow_tracing-3.2.0-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.

In [None]:
# Run the AWS CLI `configure` command in a shell.
# NOTE(review): presumably this provides the AWS credentials the MLflow
# artifact store needs (boto3 is installed above) — confirm.
!aws configure

In [None]:
import mlflow
from google.colab import userdata

# Point MLflow at the tracking server whose URI is stored under the Colab
# user-data key 'MLFLOW_SERVER', so the address is not hardcoded in the notebook.
mlflow.set_tracking_uri(userdata.get('MLFLOW_SERVER'))


In [None]:
# Select the MLflow experiment that the runs started below are recorded under.
mlflow.set_experiment('Experiment_5_xgboost')

In [None]:
import optuna
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Load the preprocessed comments and discard every row that has a missing value.
df = pd.read_csv("/content/reddit_preprocessing.csv").dropna()
df.shape

(36662, 2)

In [None]:
## Recode the negative label -1 as 2 so the class labels are 0, 1, 2.
## `replace` only rewrites -1, so re-running this cell leaves the column
## unchanged. The earlier `map({-1: 2, 0: 0, 1: 1})` turned the already-recoded
## 2s into NaN on a second run, which then broke the stratified split.
df['category'] = df['category'].replace({-1: 2}).astype('int64')

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)
n_gram_range = (1, 3)
max_features = 10000

## Apply the optimized feature set: TF-IDF over 1- to 3-grams, with the
## vocabulary fitted on the training split only.
vectorizer = TfidfVectorizer(ngram_range=n_gram_range, max_features=max_features)
x_train_vec = vectorizer.fit_transform(X_train)
x_test_vec = vectorizer.transform(X_test)

## Oversample with SMOTE on the training vectors only; the test vectors are
## left as they are.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_vec, y_train)

def log_mlflow(model_name, model, X_train, x_test, y_train, y_test):
    """Fit ``model`` inside a fresh MLflow run and record its evaluation.

    Within one run this sets the run name and experiment-type tags, logs the
    algorithm name as a parameter, fits the model, logs the accuracy on the
    supplied test data, logs every per-label entry of the classification
    report as ``"<label>_<metric>"``, and finally logs the fitted model.

    Args:
        model_name: Label used in the run name, the ``algo_name`` parameter
            and the logged model's name.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features passed to ``model.fit``.
        x_test: Features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True labels the predictions are scored against.
    """
    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigram")
        mlflow.set_tag("experiment_type", "algo_comparison")
        mlflow.log_param("algo_name", model_name)

        model.fit(X_train, y_train)
        predictions = model.predict(x_test)
        mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))

        # The report mixes per-label dicts with scalar entries; only the
        # dict-valued entries are expanded into individual metrics.
        report = classification_report(y_test, predictions, output_dict=True)
        for label, scores in report.items():
            if not isinstance(scores, dict):
                continue
            for metric_name, metric_value in scores.items():
                mlflow.log_metric(f"{label}_{metric_name}", metric_value)

        # Store the fitted estimator with the run.
        mlflow.sklearn.log_model(model, f"{model_name}_model")
def objective_xgboost(trial):
    """Optuna objective: accuracy of an XGBoost classifier for one trial.

    Samples ``n_estimators`` (50-300), ``learning_rate`` (1e-4 to 1e-1,
    log scale) and ``max_depth`` (3-10) from ``trial``, fits a classifier on
    the module-level ``x_train_resampled`` / ``y_train_resampled`` and returns
    its accuracy on ``x_test_vec`` / ``y_test``.

    NOTE(review): the score is computed on the same test split that is later
    used to evaluate the final model, so the reported accuracy may be
    optimistic — consider a separate validation split.
    """
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
    }

    classifier = XGBClassifier(**sampled)
    classifier.fit(x_train_resampled, y_train_resampled)
    predictions = classifier.predict(x_test_vec)
    return accuracy_score(y_test, predictions)


def run_optuna_experiment():
    """Tune XGBoost with Optuna, then fit and log the best configuration.

    Runs 30 trials of ``objective_xgboost`` maximizing its return value,
    builds an ``XGBClassifier`` from the best trial's parameters and hands it
    to ``log_mlflow``, which fits it on the resampled training data and logs
    the run.
    """
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_xgboost, n_trials=30)

    # `best_params` was never assigned before, so the function raised
    # NameError after all trials had finished; read it from the study.
    best_params = study.best_params

    best_model = XGBClassifier(
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        random_state=42,
    )
    log_mlflow('XGBoost', best_model, x_train_resampled, x_test_vec, y_train_resampled, y_test)

# Run the hyperparameter search and log the resulting model to MLflow.
run_optuna_experiment()
# End any MLflow run that is still active.
mlflow.end_run()

[I 2025-08-08 16:53:01,811] A new study created in memory with name: no-name-713e4e04-5072-4c80-861c-77c38bb2d3e4
[I 2025-08-08 16:53:54,807] Trial 0 finished with value: 0.5659348152188736 and parameters: {'n_estimators': 98, 'learning_rate': 0.004851633994110567, 'max_depth': 4}. Best is trial 0 with value: 0.5659348152188736.
[I 2025-08-08 17:05:45,232] Trial 1 finished with value: 0.608073094231556 and parameters: {'n_estimators': 284, 'learning_rate': 0.0006569325207359122, 'max_depth': 9}. Best is trial 1 with value: 0.608073094231556.
[I 2025-08-08 17:11:33,555] Trial 2 finished with value: 0.6095731624164735 and parameters: {'n_estimators': 135, 'learning_rate': 0.0004041940094815761, 'max_depth': 9}. Best is trial 2 with value: 0.6095731624164735.
[I 2025-08-08 17:13:01,480] Trial 3 finished with value: 0.5702986499386336 and parameters: {'n_estimators': 73, 'learning_rate': 0.003932208764172781, 'max_depth': 6}. Best is trial 2 with value: 0.6095731624164735.
[I 2025-08-08 17

NameError: name 'best_params' is not defined