In [1]:
import typing

import numpy as np
import pandas as pd

from datasets import load_dataset

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

import cltrier_nlp as nlp

In [2]:
# Number of examples taken from the head of each split (train and test).
SAMPLE_SIZE: int = 1_000
# Selects which entry of the ENCODERS mapping is used to embed the texts.
ENCODER_TYPE: typing.Literal["tfidf", "transformer"] = "tfidf"

# `typing.Literal` is only a static hint, so validate here and fail fast instead of
# hitting a KeyError (or a surprising slice) in a later cell.
assert SAMPLE_SIZE > 0, f"SAMPLE_SIZE must be positive, got {SAMPLE_SIZE!r}"
assert ENCODER_TYPE in ("tfidf", "transformer"), (
    f"ENCODER_TYPE must be 'tfidf' or 'transformer', got {ENCODER_TYPE!r}"
)

In [3]:
# Load the IMDB dataset and shuffle every split with a fixed seed, so that the
# head-of-split samples taken later (and therefore all reported metrics) are the
# same on every Restart & Run All.
# NOTE(review): the shuffle is presumably needed so the first SAMPLE_SIZE rows
# contain both labels — confirm against the dataset's row ordering.
DATASET = load_dataset("stanfordnlp/imdb").shuffle(seed=42)

In [4]:
# Both engines are constructed as soon as this cell runs, independent of ENCODER_TYPE.
tfidf = TfidfVectorizer()
transformer = nlp.encoder.Encoder()


def _tfidf_fit_embed(x):
    """Fit the TF-IDF vectorizer on ``x`` and return the transformed texts."""
    return tfidf.fit_transform(x)


def _tfidf_embed(x):
    """Transform ``x`` with the TF-IDF vectorizer without refitting it."""
    return tfidf.transform(x)


def _transformer_embed(x):
    """Encode ``x`` with the transformer engine and stack the pooled vectors into one array.

    NOTE(review): ``form="sent_cls"`` presumably yields one sentence-level vector
    per input text, and ``.detach().numpy()`` suggests torch tensors — confirm
    against the cltrier_nlp documentation.
    """
    pooled = nlp.encoder.EncoderPooler(transformer(x), form="sent_cls")
    return np.stack([vector.detach().numpy() for vector in pooled])


# Per encoder type: the underlying engine plus the callables used to embed the
# training texts and the test texts.
ENCODERS: typing.Dict[str, typing.Dict[str, typing.Callable]] = {
    "tfidf": {
        "engine": tfidf,
        "embed_train": _tfidf_fit_embed,
        "embed_test": _tfidf_embed,
    },
    "transformer": {
        "engine": transformer,
        # The transformer is not fitted here, so train and test share one code path.
        "embed_train": _transformer_embed,
        "embed_test": _transformer_embed,
    },
}



In [5]:
# Maps a display label to the estimator class; the evaluation cell instantiates
# each class with no arguments via `cls()`.
CLASSIFIERS: typing.Dict[str, typing.Callable] = dict(
    random_forest=RandomForestClassifier,
    ada_boost=AdaBoostClassifier,
    decision_tree=DecisionTreeClassifier,
    k_neighbors=KNeighborsClassifier,
    mlp=MLPClassifier,
)

In [6]:
# Pick the configured encoder once, then embed the first SAMPLE_SIZE texts of each split.
encoder = ENCODERS[ENCODER_TYPE]

train_texts = DATASET["train"][:SAMPLE_SIZE]["text"]
X_train_embed = encoder["embed_train"](train_texts)

test_texts = DATASET["test"][:SAMPLE_SIZE]["text"]
X_test_embed = encoder["embed_test"](test_texts)

In [9]:
# Seed applied to every classifier that exposes a `random_state` parameter, so the
# table below is reproducible for a given train/test sample.
RANDOM_STATE: int = 42

# Slice the split first and then select the column (same pattern as the text
# extraction), and do it once instead of once per classifier.
y_train = DATASET["train"][:SAMPLE_SIZE]["label"]
y_test = DATASET["test"][:SAMPLE_SIZE]["label"]


def evaluate_classifier(label: str, cls: typing.Callable) -> dict:
    """Fit one classifier on the embedded training sample and score it on the test sample.

    Args:
        label: Display name of the classifier; stored under the "classifier" key.
        cls: Zero-argument callable returning a scikit-learn style estimator.

    Returns:
        The `classification_report` dictionary extended with the "classifier"
        and "encoder" keys.
    """
    model = cls()
    # Not every estimator is stochastic (e.g. k-nearest neighbours has no
    # `random_state`), so only seed the ones that accept it.
    if "random_state" in model.get_params():
        model.set_params(random_state=RANDOM_STATE)

    predictions = model.fit(X_train_embed, y_train).predict(X_test_embed)
    report = classification_report(
        y_test,
        predictions,
        zero_division=1.,
        output_dict=True
    )
    return report | {"classifier": label, "encoder": ENCODER_TYPE}


# One row per classifier; `json_normalize` flattens the nested report into
# dot-separated columns such as "macro avg.f1-score".
(
    pd.json_normalize(
        data=[evaluate_classifier(label, cls) for label, cls in CLASSIFIERS.items()]
    )
    .set_index("classifier", drop=True)
    .filter(
        items=[
            "accuracy",
            "macro avg.f1-score",
            "weighted avg.f1-score"
        ]
    )
    .sort_values(by="accuracy", ascending=False)
)



Unnamed: 0_level_0,accuracy,macro avg.f1-score,weighted avg.f1-score
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mlp,0.844,0.843774,0.843869
ada_boost,0.77,0.769594,0.769748
random_forest,0.755,0.754024,0.753776
decision_tree,0.66,0.659864,0.659755
k_neighbors,0.653,0.652708,0.652869
