In [None]:
import functools
import typing

import numpy as np
import pandas as pd

from datasets import load_dataset

import tqdm
import requests

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

import cltrier_nlp as nlp

In [None]:
# Number of rows taken from the front of each split (train and test)
# when building the benchmark sample.
SAMPLE_SIZE: int = 2500

In [None]:
# Fixed seed for the shuffle: the notebook later takes the first SAMPLE_SIZE
# rows of each split, so an unseeded shuffle would evaluate a different
# sample on every "Restart & Run All" and make the scores incomparable.
SHUFFLE_SEED: int = 42

DATASET = load_dataset("stanfordnlp/imdb").shuffle(seed=SHUFFLE_SEED)

In [None]:
def remote_encoding(
    batch: typing.List[str],
    url: str = 'https://inf.cl.uni-trier.de/embed/',
    timeout: float = 60.0,
) -> typing.List[typing.Optional[np.ndarray]]:
    """Embed every text in ``batch`` by querying a remote embedding service.

    Each text is sent in its own POST request as ``{"prompt": <text>}`` and
    the ``"response"`` field of the JSON reply is converted to a numpy array.

    Args:
        batch: Texts to embed, one request per text.
        url: Endpoint of the embedding service.
        timeout: Seconds to wait for each request before giving up on it.
            Without a timeout a single stalled connection would block the
            whole run indefinitely.

    Returns:
        A list with exactly one entry per input text, in input order. An
        entry is ``None`` when the request for that text failed (network
        error, timeout, HTTP error status, malformed reply); the error is
        displayed and processing continues with the next text. Callers must
        filter out ``None`` entries before handing the result to an estimator.
    """
    embeds: typing.List[typing.Optional[np.ndarray]] = []

    # A single session lets the underlying connection be reused for all
    # requests instead of opening a new one per text.
    with requests.Session() as session:
        for value in tqdm.tqdm(batch):

            try:
                response = session.post(
                    url,
                    json={'prompt': value},
                    timeout=timeout,
                )
                # Surface HTTP 4xx/5xx as such rather than as a confusing
                # KeyError / JSON decoding error further down.
                response.raise_for_status()
                embed = np.array(response.json()["response"])

            except Exception as _e:
                # Deliberately best-effort: report the failure and keep the
                # output aligned with the input by storing a placeholder.
                display(_e)
                embed = None

            embeds.append(embed)

    return embeds

In [None]:
# Stateful engines shared between the train and test embedding functions.
tfidf = TfidfVectorizer()
transformer = nlp.encoder.Encoder()


def _tfidf_fit_and_embed(texts):
    """Fit the shared TF-IDF vectorizer on ``texts`` and return their vectors."""
    return tfidf.fit_transform(texts)


def _tfidf_embed(texts):
    """Vectorize ``texts`` with the already fitted TF-IDF vectorizer."""
    return tfidf.transform(texts)


def _local_transformer_embed(texts):
    """Encode ``texts`` with the local encoder and stack the pooled vectors.

    The encoder output is pooled with ``form="sent_cls"``; every pooled item
    is detached and converted to numpy before stacking into one array.
    """
    pooled = nlp.encoder.EncoderPooler()(transformer(texts), form="sent_cls")
    return np.stack([vector.detach().numpy() for vector in pooled])


def _remote_transformer_embed(texts):
    """Delegate to ``remote_encoding`` (looked up at call time)."""
    return remote_encoding(texts)


# Encoder registry: label -> {"engine" (optional), "embed_train", "embed_test"}.
# Only TF-IDF distinguishes train from test (fit + transform vs. transform);
# the transformer encoders embed both splits the same way.
ENCODERS: typing.Dict[str, typing.Dict[str, typing.Callable]] = {
    "tfidf": {
        "engine": tfidf,
        "embed_train": _tfidf_fit_and_embed,
        "embed_test": _tfidf_embed,
    },
    "tiny transformer (local)": {
        "engine": transformer,
        "embed_train": _local_transformer_embed,
        "embed_test": _local_transformer_embed,
    },
    "sota transformer (remote)": {
        "embed_train": _remote_transformer_embed,
        "embed_test": _remote_transformer_embed,
    },
}

In [None]:
# Seed shared by every stochastic estimator so that repeated runs of the
# notebook produce the same scores.
CLASSIFIER_SEED: int = 42

# Classifier registry: label -> zero-argument factory returning a fresh,
# unfitted estimator. ``functools.partial`` pre-binds the seed while still
# allowing callers to pass (or override) keyword arguments.
CLASSIFIERS: typing.Dict[str, typing.Callable] = {
    "random_forest": functools.partial(RandomForestClassifier, random_state=CLASSIFIER_SEED),
    "ada_boost": functools.partial(AdaBoostClassifier, random_state=CLASSIFIER_SEED),
    "decision_tree": functools.partial(DecisionTreeClassifier, random_state=CLASSIFIER_SEED),
    # KNeighborsClassifier takes no random_state, so it is registered as-is.
    "k_neighbors": KNeighborsClassifier,
    "mlp": functools.partial(MLPClassifier, random_state=CLASSIFIER_SEED),
}

In [None]:
def _drop_failed_embeddings(embeds, labels):
    """Remove samples whose embedding is ``None`` together with their labels.

    Only plain lists can carry per-sample ``None`` placeholders (as produced
    by ``remote_encoding`` for failed requests); any other container, such as
    a sparse TF-IDF matrix or a stacked array, is returned unchanged.

    Returns:
        The ``(embeds, labels)`` pair, filtered so both stay aligned.
    """
    if not isinstance(embeds, list):
        return embeds, labels

    kept = [idx for idx, embed in enumerate(embeds) if embed is not None]
    if len(kept) == len(embeds):
        return embeds, labels

    return [embeds[idx] for idx in kept], [labels[idx] for idx in kept]


# Slice each split once up front: texts and labels are identical for every
# encoder/classifier pair, so there is no need to re-read them per iteration.
train_sample = DATASET["train"][:SAMPLE_SIZE]
test_sample = DATASET["test"][:SAMPLE_SIZE]

# One DataFrame per encoder, indexed by (encoder, classifier).
results: typing.List[pd.DataFrame] = []

for encoder_label, encoder in ENCODERS.items():

    # Embed both splits once per encoder; samples whose embedding could not
    # be computed are dropped so the estimators never receive ``None``.
    embed_train, labels_train = _drop_failed_embeddings(
        encoder["embed_train"](train_sample["text"]),
        train_sample["label"],
    )
    embed_test, labels_test = _drop_failed_embeddings(
        encoder["embed_test"](test_sample["text"]),
        test_sample["label"],
    )

    # Fit a fresh estimator of every kind on the train embeddings and score
    # it on the test embeddings; tag each report with its origin.
    reports = [
        classification_report(
            labels_test,
            classifier().fit(embed_train, labels_train).predict(embed_test),
            zero_division=1.,
            output_dict=True
        ) | {"classifier": classifier_label, "encoder": encoder_label}
        for classifier_label, classifier in CLASSIFIERS.items()
    ]

    results.append(
        # json_normalize flattens the nested report, e.g. the "f1-score"
        # inside "macro avg" becomes the column "macro avg.f1-score".
        pd.json_normalize(data=reports)
        .set_index(["encoder", "classifier"], drop=True)
        .filter(
            items=[
                "accuracy",
                "macro avg.f1-score",
                "weighted avg.f1-score"
            ]
        )
        .sort_values(by="accuracy", ascending=False)
    )

In [None]:
# Combine the per-encoder tables into one leaderboard, best weighted F1 first.
(
    pd.concat(results)
    .sort_values(
        by="weighted avg.f1-score",
        ascending=False,
    )
)