In [1]:
import os
import re
import nltk
import time
import scipy
import dotenv
import mlflow
import string
import dagshub
import logging
import warnings
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from IPython.display import clear_output
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Timestamped "time - LEVEL - message" layout for every record.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# basicConfig does nothing when the root logger already has handlers,
# so the level is also set on the root logger explicitly.
logging.root.setLevel(logging.INFO)

# Silence warnings: UserWarning first, then every remaining category.
warnings.simplefilter("ignore", category=UserWarning)
warnings.filterwarnings(action="ignore")


In [3]:
# Fetch the NLTK resources used by the preprocessing helpers below.
# "punkt_tab" is requested alongside "punkt": recent NLTK releases load the
# models behind word_tokenize from "punkt_tab", and tokenize() raises
# LookupError there when only the legacy "punkt" package is present.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource, quiet=True)

# English stop words with the negation words taken back out, so that tokens
# such as "not" survive remove_stopwords().
stop_words = set(stopwords.words("english"))
for keep in ("not", "no", "nor", "n't"):
    # discard() is a no-op for entries that are not in the set.
    stop_words.discard(keep)

# Shared lemmatizer instance used by lemmatize_tokens().
lemmatizer = WordNetLemmatizer()


In [4]:
# Load the raw dataset from the working directory; later cells read its
# "review" and "sentiment" columns.
df = pd.read_csv("imdb.csv")


In [5]:
def remove_html(text):
    """Replace every ``<...>`` span (shortest match, single line) with one space."""
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub(" ", text)


def remove_urls(text):
    """Replace each run starting with ``http`` or ``www`` (up to whitespace) with one space."""
    url_pattern = re.compile(r"http\S+|www\S+")
    return url_pattern.sub(" ", text)


def remove_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed, not replaced, so ``"don't"`` becomes ``"dont"``.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


def remove_stopwords(tokens):
    """Return the tokens absent from the module-level ``stop_words`` set, order preserved."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def lemmatize_tokens(tokens):
    """Return a list with each token passed through the shared ``lemmatizer``."""
    return list(map(lemmatizer.lemmatize, tokens))


In [6]:
def preprocess_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case, strip HTML tags, strip URLs, delete
    punctuation, tokenize, drop stop words, lemmatize.
    """
    cleaned = text.lower()
    # String-level scrubbing happens before tokenisation.
    for scrub in (remove_html, remove_urls, remove_punctuations):
        cleaned = scrub(cleaned)

    words = lemmatize_tokens(remove_stopwords(tokenize(cleaned)))
    return " ".join(words)


In [7]:
# Replace each raw review with its cleaned, space-joined token string.
df["review"] = df["review"].apply(preprocess_text)
# Encode the labels as integers (negative -> 0, positive -> 1).
# NOTE(review): this cell overwrites its own inputs. Running it a second time
# without reloading `df` sends the already-encoded 0/1 values through the
# mapping again; they are not keys of the dict, so the column becomes NaN.
df["sentiment"] = df["sentiment"].map({"negative": 0, "positive": 1})

In [8]:
# Load variables from a .env file (when one is found) into the environment.
dotenv.load_dotenv()

# DagsHub connection settings. The URI goes through str(), so an unset
# DAGSHUB_URI ends up as the literal string "None" rather than None; the
# other two stay None when unset.
dagshub_uri = str(os.environ.get("DAGSHUB_URI"))
dagshub_repo = os.environ.get("DAGSHUB_REPO")
dagshub_username = os.environ.get("DAGSHUB_USERNAME")


In [9]:
# Point MLflow at the tracking server read from DAGSHUB_URI.
mlflow.set_tracking_uri(dagshub_uri)
# Connect to the DagsHub repository with its MLflow integration switched on.
dagshub.init(
    repo_name=dagshub_repo,
    repo_owner=dagshub_username,
    mlflow=True,
)
# NOTE(review): "Tunning" is presumably a typo for "Tuning". The string is left
# as-is because it is the name the runs are recorded under; renaming it would
# presumably start a separate experiment — confirm before changing.
mlflow.set_experiment("CoSim Hyperparameter Tunning")
# Clear whatever the setup calls above printed into the cell output.
clear_output()

In [10]:
class CosineCentroid(ClassifierMixin, BaseEstimator):
    """Nearest-centroid classifier that compares samples by cosine similarity.

    ``fit`` builds one mean feature vector per class, optionally
    soft-thresholds it toward zero, and L2-normalises it. ``predict`` assigns
    each sample to the class whose centroid is most cosine-similar to it.

    Parameters
    ----------
    shrink_threshold : float or None, default=None
        Soft-threshold applied element-wise to every class centroid:
        ``sign(c) * max(|c| - t, 0)``. ``None`` disables shrinkage. A threshold
        at or above the largest absolute centroid entry zeroes the centroid;
        it is then left un-normalised and scores 0 against every sample.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    centroids_ : ndarray of shape (n_classes, n_features)
        One processed centroid per entry of ``classes_``, in the same order.

    Notes
    -----
    The mixin is listed before ``BaseEstimator`` on purpose: scikit-learn's
    developer guide asks for mixins to come first, and with the reverse order
    newer releases may fail to recognise the estimator as a classifier
    (which affects, e.g., whether integer ``cv`` is stratified).
    """

    def __init__(self, shrink_threshold=None):
        self.shrink_threshold = shrink_threshold

    def fit(self, X, y):
        """Compute the per-class centroids from ``X`` (dense or sparse) and ``y``.

        Returns
        -------
        self
        """
        y = np.asarray(y)
        self.classes_ = np.unique(y)

        # The threshold is the same for every class, so convert it once.
        threshold = (
            None if self.shrink_threshold is None else float(self.shrink_threshold)
        )

        centroids = []
        for label in self.classes_:
            mean_vec = X[y == label].mean(axis=0)

            # The mean of a sparse block may come back as a sparse object or a
            # 2-D matrix; either way flatten it to a dense 1-D vector.
            if hasattr(mean_vec, "toarray"):
                mean_vec = mean_vec.toarray()
            centroid = np.asarray(mean_vec).ravel()

            # Shrinkage (soft threshold toward zero).
            if threshold is not None:
                centroid = np.sign(centroid) * np.maximum(
                    np.abs(centroid) - threshold, 0.0
                )

            # L2-normalise; an all-zero centroid is left untouched.
            norm = np.linalg.norm(centroid)
            if norm > 0:
                centroid = centroid / (norm + 1e-12)

            centroids.append(centroid)

        self.centroids_ = np.vstack(centroids)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the label of the most similar centroid.

        ``X`` may be dense or scipy-sparse. It is handed to
        ``cosine_similarity`` unchanged: that function L2-normalises both
        operands itself and accepts sparse input, so densifying the whole
        matrix and normalising it by hand beforehand is unnecessary.
        Ties (including all-zero similarity rows) resolve to the first class.
        """
        similarity = cosine_similarity(X, self.centroids_)
        return self.classes_[np.argmax(similarity, axis=1)]

In [11]:
test_size = 0.2
max_iter = 1000  # not referenced by any other visible cell
max_features = 1000

y = df["sentiment"]

# Split the cleaned text BEFORE vectorising, so that the TF-IDF vocabulary and
# IDF weights are learned from the training reviews only. Fitting the
# vectoriser on the full corpus first lets test-set statistics leak into the
# features the model is trained and evaluated on.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    df["review"], y, test_size=test_size, random_state=42
)

vectorizer = TfidfVectorizer(max_features=max_features)
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

# Feature matrix for the whole corpus, built with the train-fitted vectoriser.
# Kept only so that the name `X` stays defined for any cell that still reads it.
X = vectorizer.transform(df["review"])

In [12]:
def _test_set_metrics(y_true, y_pred, prefix=""):
    """Return accuracy, precision, recall and F1 for ``y_pred``, keys prefixed by ``prefix``."""
    return {
        f"{prefix}accuracy": accuracy_score(y_true, y_pred),
        f"{prefix}precision": precision_score(y_true, y_pred),
        f"{prefix}recall": recall_score(y_true, y_pred),
        f"{prefix}f1_score": f1_score(y_true, y_pred),
    }


def run_experiments(df):
    """Grid-search ``CosineCentroid.shrink_threshold`` and log the results to MLflow.

    One parent run ("All Experiments") wraps a 5-fold ``GridSearchCV``. Each
    candidate then gets a nested run in which a fresh model is fitted on the
    training split, scored on the test split, and logged together with its
    cross-validation mean/std. Finally the best estimator's parameters,
    CV score, test metrics and model are logged on the parent run.

    Parameters
    ----------
    df : pandas.DataFrame
        Accepted for call compatibility but not used; the function reads the
        module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Notes
    -----
    Any exception raised inside the parent run is logged (with traceback) and
    recorded as the ``error`` param instead of propagating.
    """
    scoring = "accuracy"
    param_grid = {
        "shrink_threshold": [None, 0.0, 0.001, 0.005, 0.01, 0.02, 0.05,
                             0.1, 0.2, 0.3, 0.5, 0.7, 1.0,
                             2.0, 3.0, 5.0, 10.0]
    }

    logging.info("Starting MLFlow run...")
    with mlflow.start_run(run_name="All Experiments"):
        # perf_counter is monotonic, unlike the wall clock, so the reported
        # duration cannot be skewed by system clock adjustments.
        t0 = time.perf_counter()
        try:
            grid_search = GridSearchCV(
                estimator=CosineCentroid(),
                param_grid=param_grid,
                cv=5,
                scoring=scoring,
                n_jobs=-1,
            )
            grid_search.fit(X_train, y_train)

            # The same five test rows accompany every logged model, so slice
            # them once. (The former sparse/dense conditional had two
            # identical branches.)
            input_example = X_test[:5]

            cv_results = grid_search.cv_results_
            for params, mean_score, std_score in zip(
                cv_results["params"],
                cv_results["mean_test_score"],
                cv_results["std_test_score"],
            ):
                run_name = f"ShrinkThreshold={params.get('shrink_threshold', 'N/A')}"

                with mlflow.start_run(run_name=run_name, nested=True):
                    model = CosineCentroid(**params)
                    model.fit(X_train, y_train)

                    metrics = _test_set_metrics(y_test, model.predict(X_test))
                    metrics["mean_cv_score"] = mean_score
                    metrics["std_cv_score"] = std_score

                    mlflow.log_params(params)
                    mlflow.log_metrics(metrics)
                    mlflow.sklearn.log_model(
                        model, "model", input_example=input_example
                    )

            best_model = grid_search.best_estimator_

            mlflow.log_params(grid_search.best_params_)
            # The search optimises `scoring`, so name the metric after it; it
            # was previously logged as "best_cv_f1_score" although it holds
            # the cross-validated accuracy.
            mlflow.log_metric(f"best_cv_{scoring}", grid_search.best_score_)
            mlflow.log_metrics(
                _test_set_metrics(y_test, best_model.predict(X_test), prefix="best_")
            )
            mlflow.sklearn.log_model(
                best_model, "best_model", input_example=input_example
            )

        except Exception as e:
            # Best effort: record the failure on the run rather than abort the cell.
            logging.error(f"Unexpected error!: {e}", exc_info=True)
            mlflow.log_param("error", str(e))

        t1 = time.perf_counter()
        logging.info(f"Execution time : {t1 - t0:.2f} sec")
    logging.info("MLFlow run execution complete\n")


In [None]:
# Run the grid search and MLflow logging defined above.
run_experiments(df)
# NOTE(review): run_experiments catches exceptions and only logs them, so
# clearing the cell output here may also erase the logged error. If results
# look incomplete, check the "error" param on the MLflow run.
clear_output()