In [1]:
import os
import re
import nltk
import time
import scipy
import dotenv
import mlflow
import string
import dagshub
import logging
import warnings
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from IPython.display import clear_output
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Root logger: "timestamp - LEVEL - message" records at INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# basicConfig() does nothing when the root logger already has handlers,
# so the level is also set on the root logger directly.
logging.root.setLevel(logging.INFO)

# Silence warnings for the rest of the session: first UserWarning, then
# every category (the second filter is a superset of the first).
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.filterwarnings(action="ignore")


In [3]:
# Fetch the NLTK resources used by the preprocessing helpers below.
# "punkt_tab" is the tokenizer data that word_tokenize() loads on recent NLTK
# releases; older releases use "punkt", so both are requested.
# NOTE(review): on an old NLTK that does not know "punkt_tab" the extra
# download is expected to fail softly (no exception) — confirm if pinned.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(_resource, quiet=True)

# Shared by remove_stopwords() and lemmatize_tokens().
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


In [5]:
# Load the review dataset from "imdb.csv" in the working directory; the cells
# below read its "review" (text) and "sentiment" (label) columns.
df = pd.read_csv("imdb.csv")
df


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
def remove_html(text):
    """Replace every ``<...>`` span in ``text`` with a single space.

    The match is non-greedy and stays on one line, because ``.`` does not
    match a newline here.
    """
    tag_pattern = r"<.*?>"
    return re.sub(tag_pattern, " ", text)


def remove_urls(text):
    """Replace each URL in ``text`` with a single space.

    A URL is a run of non-whitespace characters that starts, at a word
    boundary, with ``http://``, ``https://`` or ``www.``. Anchoring on those
    prefixes keeps ordinary words intact: the previous unanchored pattern
    (``http\\S+|www\\S+``) also deleted pieces of words such as "awwww" or
    "httpd".

    Args:
        text: Input string.

    Returns:
        The string with every matched URL replaced by ``" "``.
    """
    return re.sub(r"\b(?:https?://|www\.)\S+", " ", text, flags=re.IGNORECASE)


def remove_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are dropped, not replaced, so the text on either side of a
    removed character is joined together.
    """
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    return re.sub(punctuation_class, "", text)


def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text=text)
    return tokens


def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The order of the remaining tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def lemmatize_tokens(tokens):
    """Lemmatize each token with the module-level ``lemmatizer``.

    ``lemmatize`` is called with its default arguments; the result is a new
    list in the same order as ``tokens``.
    """
    return list(map(lemmatizer.lemmatize, tokens))


In [7]:
def preprocess_text(text):
    """Normalise one raw review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip HTML tags, strip URLs, delete
    punctuation, tokenize, drop stop words, lemmatize.

    NOTE(review): punctuation is deleted before stop-word filtering, so a
    contraction such as "wasn't" reaches the filter as "wasnt" and is
    presumably no longer matched by the stop-word list — confirm this is
    intended.
    """
    cleaned = text.lower()
    for text_step in (remove_html, remove_urls, remove_punctuations):
        cleaned = text_step(text=cleaned)

    tokens = tokenize(text=cleaned)
    for token_step in (remove_stopwords, lemmatize_tokens):
        tokens = token_step(tokens=tokens)

    return " ".join(tokens)


In [8]:
# Clean the review text and encode the labels (negative -> 0, positive -> 1).
#
# The guard makes this cell safe to re-run: pushing already-encoded labels
# through the mapping a second time would turn every label into NaN, and the
# (slow) text cleaning would be repeated on already-cleaned text. A numeric
# "sentiment" column is taken to mean the cell has already run.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    encoded_sentiment = df["sentiment"].map({"negative": 0, "positive": 1})

    # Any label outside the mapping (or a missing label) becomes NaN; fail here
    # with a clear message instead of deep inside model fitting. The frame is
    # only modified once validation has passed.
    unexpected_labels = df.loc[encoded_sentiment.isna(), "sentiment"].unique()
    if len(unexpected_labels):
        raise ValueError(
            f"Unexpected sentiment labels: {list(unexpected_labels)!r}"
        )

    df["review"] = df["review"].apply(preprocess_text)
    df["sentiment"] = encoded_sentiment
df

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stunni...,1
...,...,...
49995,thought movie right good job wasnt creative or...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary school nu...,0
49998,im going disagree previous comment side maltin...,0


In [9]:
# Load variables from a .env file (if one is found) into the environment.
dotenv.load_dotenv()

# NOTE: because of str(), an unset DAGSHUB_URI becomes the literal string
# "None" rather than None; the other two are None when unset.
dagshub_uri = str(os.getenv("DAGSHUB_URI"))
dagshub_repo = os.getenv("DAGSHUB_REPO")
dagshub_username = os.getenv("DAGSHUB_USERNAME")

# Surface missing settings here, where the cause is obvious, instead of
# leaving them to show up as an unrelated failure in a later cell. The values
# themselves are left exactly as before.
_missing_settings = [
    name
    for name in ("DAGSHUB_URI", "DAGSHUB_REPO", "DAGSHUB_USERNAME")
    if not os.getenv(name)
]
if _missing_settings:
    logging.warning(
        "Missing DagsHub setting(s): %s. Define them in the environment or in a .env file.",
        ", ".join(_missing_settings),
    )


In [10]:
# Point MLflow at the tracking URI read from DAGSHUB_URI.
mlflow.set_tracking_uri(dagshub_uri)
# Initialise DagsHub for the configured repository with its MLflow integration
# enabled.
# NOTE(review): this presumably also sets up tracking credentials/URI for the
# repo — confirm against the dagshub client documentation.
dagshub.init(
    repo_name=dagshub_repo,
    repo_owner=dagshub_username,
    mlflow=True,
)
# Runs started below are recorded under this experiment name.
# NOTE(review): "Tunning" looks like a typo for "Tuning"; the name is a runtime
# identifier, so renaming it would switch to a different experiment.
mlflow.set_experiment("LoR Hyperparameter Tunning")
# Hide whatever the setup calls above printed in this cell's output.
clear_output()

In [20]:
# Fraction of rows held out for testing, and the iteration cap passed to every
# LogisticRegression built in run_experiments().
test_size = 0.2
max_iter = 20000

y = df["sentiment"]

# Split the text BEFORE fitting TF-IDF so the vocabulary and IDF weights are
# learned from the training reviews only. Fitting the vectorizer on the whole
# corpus first lets information from the test reviews leak into the features
# and makes the test metrics optimistic. The split uses the same test_size and
# random_state as before.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    df["review"], y, test_size=test_size, random_state=42
)

vectorizer = TfidfVectorizer(max_features=35000)
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

# Full-corpus feature matrix, kept under its original name for any cell that
# still refers to it; it is encoded with the train-fitted vocabulary.
X = vectorizer.transform(df["review"])

In [21]:
def _classification_metrics(y_true, y_pred, prefix=""):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        prefix: String prepended to every metric name (e.g. ``"best_"``).

    Returns:
        Dict mapping ``<prefix>accuracy``, ``<prefix>precision``,
        ``<prefix>recall`` and ``<prefix>f1_score`` to their values.
    """
    return {
        f"{prefix}accuracy": accuracy_score(y_true, y_pred),
        f"{prefix}precision": precision_score(y_true, y_pred),
        f"{prefix}recall": recall_score(y_true, y_pred),
        f"{prefix}f1_score": f1_score(y_true, y_pred),
    }


def run_experiments(df):
    """Grid-search LogisticRegression and log every candidate to MLflow.

    A parent run ("All Experiments") wraps the whole search. Each parameter
    combination gets a nested run holding its parameters, its cross-validation
    mean/std score, its metrics on the held-out test split and the fitted
    model. The best estimator found by the grid search is then evaluated and
    logged on the parent run.

    The function reads the module-level ``X_train``, ``X_test``, ``y_train``,
    ``y_test`` and ``max_iter``.

    Args:
        df: Not used by the function body; kept so existing calls keep working.

    Returns:
        None. Any exception raised during the search is logged and recorded on
        the parent run instead of being propagated.
    """
    # NOTE(review): LogisticRegression's `multi_class` argument is deprecated
    # in recent scikit-learn releases — confirm against the installed version
    # before upgrading. It is kept so logged parameters stay comparable.
    param_grid = {
        "C": [0.1, 1, 10],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear"],
        "multi_class": ["ovr"],
    }

    logging.info("Starting MLFlow run...")
    with mlflow.start_run(run_name="All Experiments"):
        t0 = time.perf_counter()
        try:
            # A few test rows stored with each logged model as its input
            # example; computed once because it is the same for every model.
            input_example = X_test[:5]

            grid_search = GridSearchCV(
                estimator=LogisticRegression(max_iter=max_iter),
                param_grid=param_grid,
                cv=5,
                scoring="f1",
                n_jobs=-1,
            )
            grid_search.fit(X_train, y_train)

            cv_results = grid_search.cv_results_
            for params, mean_score, std_score in zip(
                cv_results["params"],
                cv_results["mean_test_score"],
                cv_results["std_test_score"],
            ):
                # The penalty is part of the name: the grid varies both C and
                # penalty, so without it two candidates would share a name.
                run_name = (
                    f"C={params.get('C', 'N/A')} "
                    f"Penalty={params.get('penalty', 'N/A')} "
                    f"Solver={params.get('solver', 'N/A')} "
                    f"MC={params.get('multi_class', 'N/A')}"
                )

                with mlflow.start_run(run_name=run_name, nested=True):
                    # The grid search only exposes its best refit estimator,
                    # so each candidate is refit on the full training split to
                    # get test metrics and a model to log.
                    model = LogisticRegression(max_iter=max_iter, **params)
                    model.fit(X_train, y_train)
                    y_hat = model.predict(X_test)

                    mlflow.log_params(params)
                    mlflow.log_param("max_iter", max_iter)

                    metrics = _classification_metrics(y_test, y_hat)
                    metrics["mean_cv_score"] = mean_score
                    metrics["std_cv_score"] = std_score
                    mlflow.log_metrics(metrics)

                    mlflow.sklearn.log_model(
                        model, "model", input_example=input_example
                    )

            # Summary of the winning configuration, logged on the parent run.
            best_model = grid_search.best_estimator_
            best_y_hat = best_model.predict(X_test)

            mlflow.log_params(grid_search.best_params_)
            mlflow.log_metric("best_cv_f1_score", grid_search.best_score_)
            mlflow.log_metrics(
                _classification_metrics(y_test, best_y_hat, prefix="best_")
            )
            mlflow.sklearn.log_model(
                best_model, "best_model", input_example=input_example
            )

        except Exception as e:
            logging.error(f"Unexpected error!: {e}", exc_info=True)
            # Recording the failure is best-effort: the message is truncated
            # because MLflow limits param value length, and a second failure
            # (e.g. the tracking server being the original problem) must not
            # mask the error logged above.
            try:
                mlflow.log_param("error", str(e)[:250])
            except Exception:
                logging.warning(
                    "Could not record the error on the MLflow run", exc_info=True
                )

        t1 = time.perf_counter()
        logging.info(f"Execution time : {t1 - t0:.2f} sec")
    logging.info("MLFlow run execution complete\n")


In [22]:
# Run the grid search; every candidate and the best model are logged to MLflow.
run_experiments(df)
# NOTE(review): clearing the output also removes the log lines written by
# run_experiments(), including any error it caught and logged — consider
# keeping the output while debugging.
clear_output()