In [None]:
import logging
import os
import re
import string
import time
import warnings

import dagshub
import dotenv
import mlflow
import nltk
import pandas as pd
import scipy
from IPython.display import clear_output
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from xgboost import XGBClassifier

In [None]:
# Emit INFO-level (and above) log records prefixed with a timestamp and level.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Also set the root logger level explicitly: basicConfig is a no-op when the
# root logger already has handlers attached.
logging.getLogger().setLevel(logging.INFO)
# Silence warnings for the rest of the session. The second call installs a
# blanket "ignore" filter, so it covers the UserWarning-only filter as well.
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")


In [None]:
# Load environment variables via python-dotenv before they are read below.
dotenv.load_dotenv()

# Notebook-wide settings. The three DagsHub entries come from the environment
# and are None when the corresponding variable is not set.
CONFIG = dict(
    data_path="sample.csv",
    test_size=0.2,
    mlflow_tracking_uri=os.environ.get("DAGSHUB_URI"),
    dagshub_repo_owner=os.environ.get("DAGSHUB_USERNAME"),
    dagshub_repo_name=os.environ.get("DAGSHUB_REPO"),
    experiment_name="BoW vs TF-IDF",
)


In [None]:
# Fetch the NLTK data used by the preprocessing helpers.
# "punkt_tab" is the tokenizer resource that word_tokenize loads on recent NLTK
# releases (3.9+); "punkt" is kept so older NLTK versions keep working too.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

# English stop words as a set for fast membership tests, and a shared
# lemmatizer instance reused for every token.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


In [None]:
# Read the dataset from the path declared in CONFIG so the location is defined
# in exactly one place, and preview only the first rows.
df = pd.read_csv(CONFIG["data_path"])
df.head()

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` span replaced by a single space.

    The pattern is non-greedy, so each tag is matched and replaced on its own.
    """
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub(" ", text)


def remove_urls(text):
    """Replace URLs in ``text`` with a single space.

    A URL is a run of non-whitespace characters that starts, at a word
    boundary, with ``http://``, ``https://`` or ``www.``. Anchoring on the
    scheme separator / dot keeps ordinary words that merely contain "http"
    or "www" (e.g. "https", "awwww") from being deleted.

    Args:
        text: Input string.

    Returns:
        The string with each URL replaced by ``" "``.
    """
    return re.sub(r"\b(?:https?://|www\.)\S+", " ", text)


def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are dropped, not replaced, so ``"good,bad"`` becomes ``"goodbad"``.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)


def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens


def remove_stopwords(tokens):
    """Return, in order, the tokens that are not in the module-level ``stop_words``.

    The comparison is exact (case-sensitive) membership in ``stop_words``.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def lemmatize_tokens(tokens):
    """Return a list with each token passed through the module-level ``lemmatizer``.

    Every token is lemmatized with the lemmatizer's default arguments.
    """
    return list(map(lemmatizer.lemmatize, tokens))


In [None]:
def preprocess_text(text):
    """Normalise one review into a space-joined string of cleaned tokens.

    Steps, in order: lowercase, strip HTML tags, strip URLs, delete
    punctuation, tokenize, drop stop words, lemmatize, re-join with spaces.

    Args:
        text: A single cell value from the review column. Missing values
            (``None``/NaN, as pandas produces for empty cells) yield an empty
            string instead of raising; other non-string scalars are converted
            with ``str`` first.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    if not isinstance(text, str):
        # pandas represents missing text as float NaN / None, which has no
        # .lower(); treat it as an empty document.
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()
    text = remove_html(text=text)
    text = remove_urls(text=text)
    text = remove_punctuations(text=text)
    tokens = tokenize(text=text)
    tokens = remove_stopwords(tokens=tokens)
    tokens = lemmatize_tokens(tokens=tokens)
    return " ".join(tokens)


In [None]:
# Clean every review with the preprocessing pipeline.
df["review"] = df["review"].apply(preprocess_text)

# Encode the labels as integers. The guard makes the cell safe to re-run:
# mapping the already-encoded 0/1 values again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    df["sentiment"] = df["sentiment"].map({"negative": 0, "positive": 1})

df.head()

In [None]:
# Point MLflow at the tracking URI held in CONFIG (read from DAGSHUB_URI).
mlflow.set_tracking_uri(CONFIG["mlflow_tracking_uri"])
# Initialise the DagsHub integration for the configured repository with MLflow
# support enabled.
# NOTE(review): dagshub.init(mlflow=True) presumably configures the MLflow
# tracking URI/credentials itself - confirm whether the explicit
# set_tracking_uri call above is still required.
dagshub.init(
    repo_name=CONFIG["dagshub_repo_name"],
    repo_owner=CONFIG["dagshub_repo_owner"],
    mlflow=True,
)
# Make the named experiment the active one for all subsequent runs.
mlflow.set_experiment(CONFIG["experiment_name"])
# Clear whatever the setup calls printed into the cell output.
clear_output()

In [None]:
# Seed for the estimators that involve randomness, so repeated executions of
# the notebook produce comparable metrics (same value as the train/test split).
RANDOM_STATE = 42

# Text featurizers under comparison; both are capped at the 1000 most
# frequent terms.
vectorizers = {
    "Bag of Words": CountVectorizer(max_features=1000),
    "TF-IDF": TfidfVectorizer(max_features=1000),
}

# Classifiers under comparison, keyed by the name used in run names and in
# log_model_params. All other hyperparameters are left at library defaults.
models = {
    "LogisticRegression": LogisticRegression(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

In [None]:
def log_model_params(model_name, model):
    """Log the key hyperparameters of ``model`` to the active MLflow run.

    Which attributes are logged depends on ``model_name``; every model key used
    in this notebook (including ``BernoulliNB``) has an entry. An unrecognised
    name logs an empty parameter dict.

    Args:
        model_name: Key identifying the model family (e.g. ``"RandomForest"``).
        model: Estimator instance whose attributes are read.
    """
    # Attribute names to record for each model family.
    tracked_attributes = {
        "LogisticRegression": ("C",),
        "MultinomialNB": ("alpha",),
        "BernoulliNB": ("alpha",),
        "XGBoost": ("n_estimators", "learning_rate"),
        "RandomForest": ("n_estimators", "max_depth"),
        "GradientBoosting": ("n_estimators", "learning_rate", "max_depth"),
    }

    params_to_log = {
        attribute: getattr(model, attribute)
        for attribute in tracked_attributes.get(model_name, ())
    }

    mlflow.log_params(params_to_log)

In [None]:
def _run_single_experiment(df, model_name, model, vectorizer_name, vectorizer):
    """Train and evaluate one model/vectorizer pair inside the active MLflow run.

    Logs the run parameters, the model's hyperparameters, the test-set
    accuracy/precision/recall/F1 and the fitted model itself.

    Args:
        df: DataFrame with a ``review`` text column and a ``sentiment`` label column.
        model_name: Key of the model in ``models``; also used for parameter logging.
        model: Unfitted estimator template; a clone of it is trained.
        vectorizer_name: Key of the vectorizer in ``vectorizers``.
        vectorizer: Unfitted vectorizer template; a clone of it is fitted.
    """
    # Split the raw text *before* vectorizing so the vocabulary (and the IDF
    # weights for TF-IDF) are learned from training rows only; fitting on the
    # full dataset leaks test-set information into the features.
    reviews_train, reviews_test, y_train, y_test = train_test_split(
        df["review"],
        df["sentiment"],
        test_size=CONFIG["test_size"],
        random_state=42,
    )

    # Train on fresh copies so runs never share fitted state through the
    # module-level `models` / `vectorizers` dictionaries.
    vectorizer_instance = clone(vectorizer)
    model_instance = clone(model)

    X_train = vectorizer_instance.fit_transform(reviews_train)
    X_test = vectorizer_instance.transform(reviews_test)

    mlflow.log_params(
        {
            "Algorithm": model_name,
            "Vectorizer": vectorizer_name,
            "test_size": CONFIG["test_size"],
        }
    )

    model_instance.fit(X_train, y_train)
    y_hat = model_instance.predict(X_test)

    log_model_params(model_name=model_name, model=model_instance)

    mlflow.log_metrics(
        {
            "accuracy": accuracy_score(y_test, y_hat),
            "precision": precision_score(y_test, y_hat),
            "recall": recall_score(y_test, y_hat),
            "f1_score": f1_score(y_test, y_hat),
        }
    )

    # First few vectorized test rows, passed as-is (the previous conditional
    # returned the same slice from both of its branches).
    input_example = X_test[:5]

    mlflow.sklearn.log_model(model_instance, "model", input_example=input_example)


def run_experiments(df):
    """Run every model x vectorizer combination as nested MLflow runs.

    Opens one parent run named "All Experiments" and, for each pair drawn from
    the module-level ``models`` and ``vectorizers`` dictionaries, a nested
    child run. A failure in one combination is logged (and recorded on that
    run as an ``error`` parameter) without stopping the remaining ones.
    The total wall-clock time is logged at the end.

    Args:
        df: DataFrame with a ``review`` text column and a ``sentiment`` label column.
    """
    with mlflow.start_run(run_name="All Experiments"):
        t0 = time.time()
        for model_name, model in models.items():
            for vectorizer_name, vectorizer in vectorizers.items():
                run_name = f"{model_name} with {vectorizer_name}"
                with mlflow.start_run(run_name=run_name, nested=True):
                    logging.info(f"Starting run: {run_name}...")
                    try:
                        _run_single_experiment(
                            df=df,
                            model_name=model_name,
                            model=model,
                            vectorizer_name=vectorizer_name,
                            vectorizer=vectorizer,
                        )
                    except Exception as e:
                        logging.error(f"Unexpected error!: {e}", exc_info=True)
                        # Truncate: tracking backends cap parameter length, and
                        # an over-long message would raise inside this handler
                        # and abort all remaining runs.
                        mlflow.log_param("error", str(e)[:250])

                    logging.info(f"Run '{run_name}' execution complete!\n")
        t1 = time.time()
        logging.info(f"Execution time : {t1 - t0:.2f} sec")


# Train, evaluate and log every model/vectorizer combination on the frame.
run_experiments(df)
# Clear the cell output afterwards; note this also removes the log lines
# (including any logged errors) emitted while the experiments ran.
clear_output()