In [1]:
import os
import re
import nltk
import time
import scipy
import dotenv
import mlflow
import string
import dagshub
import logging
import warnings
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from IPython.display import clear_output
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

logging.getLogger().setLevel(logging.INFO)
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")


In [3]:
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


In [4]:
df = pd.read_csv("sample.csv")
df


Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative
...,...,...
11968,"I have seen tons of trash, in every language, ...",negative
11969,The core issues at play (God & Satan / Good & ...,negative
11970,There is no such a thing as perfect murder.Lie...,positive
11971,"The Cheesiest movie I've ever seen, Not scary,...",negative


In [5]:
def remove_html(text):
    return re.sub(r"<.*?>", " ", text)


def remove_urls(text):
    return re.sub(r"http\S+|www\S+", " ", text)


def remove_punctuations(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def tokenize(text):
    return word_tokenize(text=text)


def remove_stopwords(tokens):
    return [w for w in tokens if w not in stop_words]


def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(w) for w in tokens]


In [6]:
def preprocess_text(text):
    text = text.lower()
    text = remove_html(text=text)
    text = remove_urls(text=text)
    text = remove_punctuations(text=text)
    tokens = tokenize(text=text)
    tokens = remove_stopwords(tokens=tokens)
    tokens = lemmatize_tokens(tokens=tokens)
    return " ".join(tokens)


In [7]:
df["review"] = df["review"].apply(preprocess_text)
df["sentiment"] = df["sentiment"].map({"negative": 0, "positive": 1})
df

Unnamed: 0,review,sentiment
0,really liked summerslam due look arena curtain...,1
1,many television show appeal quite many differe...,1
2,film quickly get major chase scene ever increa...,0
3,jane austen would definitely approve one gwyne...,1
4,expectation somewhat high went see movie thoug...,0
...,...,...
11968,seen ton trash every language every topic ever...,0
11969,core issue play god satan good evil tremendous...,0
11970,thing perfect murderlieutenant columbo know th...,1
11971,cheesiest movie ive ever seen scary bad 1st mo...,0


In [8]:
dotenv.load_dotenv()

dagshub_uri = os.getenv("DAGSHUB_URI")
dagshub_repo = os.getenv("DAGSHUB_REPO")
dagshub_username = os.getenv("DAGSHUB_USERNAME")


In [9]:
mlflow.set_tracking_uri(dagshub_uri)
dagshub.init(
    repo_name=dagshub_repo,
    repo_owner=dagshub_username,
    mlflow=True,
)
mlflow.set_experiment("LoR Hyperparameter Tunning")
clear_output()

In [10]:
test_size = 0.2
max_iter = 1000

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["review"])
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

In [11]:
def run_experiments(df):
    param_grid = {
        "C": [0.1, 1, 10],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear"],
        "multi_class": ["ovr"],
    }

    logging.info("Starting MLFlow run...")
    with mlflow.start_run(run_name="All Experiments"):
        t0 = time.time()
        try:
            grid_search = GridSearchCV(
                estimator=LogisticRegression(max_iter=max_iter),
                param_grid=param_grid,
                cv=5,
                scoring="f1",
                n_jobs=-1,
            )

            grid_search.fit(X_train, y_train)

            for params, mean_score, std_score in zip(
                grid_search.cv_results_["params"],
                grid_search.cv_results_["mean_test_score"],
                grid_search.cv_results_["std_test_score"],
            ):
                mc_val = params.get("multi_class", "N/A")
                run_name = f"C={params.get('C', 'N/A')} Solver={params.get('solver', 'N/A')} MC={mc_val}"

                with mlflow.start_run(run_name=run_name, nested=True):
                    model = LogisticRegression(max_iter=max_iter, **params)
                    model.fit(X_train, y_train)

                    y_hat = model.predict(X_test)

                    mlflow.log_params(params)
                    mlflow.log_param("max_iter", max_iter)
                    mlflow.log_metrics(
                        {
                            "accuracy": accuracy_score(y_test, y_hat),
                            "precision": precision_score(y_test, y_hat),
                            "recall": recall_score(y_test, y_hat),
                            "f1_score": f1_score(y_test, y_hat),
                            "mean_cv_score": mean_score,
                            "std_cv_score": std_score,
                        }
                    )

                    input_example = (
                        X_test[:5] if not scipy.sparse.issparse(X_test) else X_test[:5]
                    )

                    mlflow.sklearn.log_model(
                        model, "model", input_example=input_example
                    )

            best_params = grid_search.best_params_
            best_model = grid_search.best_estimator_
            best_score = grid_search.best_score_

            best_y_hat = best_model.predict(X_test)

            mlflow.log_params(best_params)
            mlflow.log_metric("best_cv_f1_score", best_score)

            mlflow.log_metrics(
                {
                    "best_accuracy": accuracy_score(y_test, best_y_hat),
                    "best_precision": precision_score(y_test, best_y_hat),
                    "best_recall": recall_score(y_test, best_y_hat),
                    "best_f1_score": f1_score(y_test, best_y_hat),
                }
            )

            input_example = (
                X_test[:5] if not scipy.sparse.issparse(X_test) else X_test[:5]
            )

            mlflow.sklearn.log_model(
                best_model, "best_model", input_example=input_example
            )

        except Exception as e:
            logging.error(f"Unexpected error!: {e}", exc_info=True)
            mlflow.log_param("error", str(e))

        t1 = time.time()
        logging.info(f"Execution time : {t1 - t0:.2f} sec")
    logging.info("MLFlow run execution complete\n")


In [12]:
run_experiments(df)
clear_output()