In [None]:
import typing

import pandas
import numpy
import sentence_transformers

import rich.progress

  from tqdm.autonotebook import tqdm, trange


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

In [3]:
# Share of rows held out for evaluation; the remainder is used for training.
TEST_FRAC: float = 0.05

# Processed dataset, addressed relative to the notebook's working directory.
DATA_FILE: str = "../data/processed/DefaktS_Twitter.binary.csv"

In [None]:
# Read the CSV (first column becomes the index), swap the numeric values in
# `binary_label` for readable class names, then expose that column as `label`.
DATA: pandas.DataFrame = (
    pandas.read_csv(DATA_FILE, index_col=[0])
    .replace({"binary_label": {0.0: "neutral_post", 1.0: "possible_fake_news"}})
    .rename({"binary_label": "label"}, axis="columns")
)
DATA.head(5)

In [None]:
# Class balance: number of rows per label.
DATA.loc[:, "label"].value_counts()

In [None]:
# Fixed seed for the shuffle: without it every Restart & Run All draws a
# different train/test partition, so the reported scores are not reproducible.
SPLIT_SEED: int = 42

# Random sample of (1 - TEST_FRAC) of the rows for training ...
DATA_TRAIN = DATA.sample(frac=1.0 - TEST_FRAC, random_state=SPLIT_SEED)
# ... and every row whose index label was not drawn becomes the test split.
DATA_TEST = DATA.loc[DATA.index.difference(DATA_TRAIN.index)]

len(DATA_TRAIN), len(DATA_TEST)

In [None]:
# Sentence-embedding model used by `remote_encoding`; it is placed on the
# "cuda" device explicitly.
model = sentence_transformers.SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
    device="cuda",
)

def remote_encoding(data: typing.List[str]) -> typing.List:
    """Embed ``data`` with the module-level ``model``, one batch at a time.

    The texts are cut into consecutive batches of at most 64 items, each batch
    is passed to ``model.encode`` and the per-batch results are concatenated
    in input order while a ``rich`` progress bar tracks the batches.

    Args:
        data: Texts to embed.

    Returns:
        A flat list holding the items of every ``model.encode`` result, in
        batch order (presumably one embedding vector per input text; confirm
        against the model's ``encode`` contract).

    Raises:
        RuntimeError: If ``model.encode`` fails for a batch. The original
            exception is attached as ``__cause__``.
    """

    def batched(iterable, n=64):
        """Return consecutive slices of ``iterable`` with at most ``n`` items each."""
        # Slicing already clamps at the end of the sequence, so no explicit min().
        return [iterable[ndx:ndx + n] for ndx in range(0, len(iterable), n)]

    embeds: typing.List[numpy.ndarray] = []

    for batch_no, batch in enumerate(rich.progress.track(batched(data))):
        try:
            embed = model.encode(batch)
        except Exception as _e:
            # A failed batch cannot be skipped: the result would no longer line
            # up with the caller's labels. The previous fallback (`embed = None`)
            # crashed one line later with an unrelated TypeError in `extend`,
            # so fail here with the real cause attached instead.
            raise RuntimeError(
                f"model.encode failed on batch {batch_no} ({len(batch)} texts)"
            ) from _e

        embeds.extend(embed)

    return embeds

In [8]:
# One vectorizer instance shared by both TF-IDF callables below: it is fitted
# by "embed_train" and then reused, already fitted, by "embed_test".
tfidf = TfidfVectorizer()

# Text encoders under comparison. Each entry provides an "embed_train" and an
# "embed_test" callable that take a list of texts; the TF-IDF entry also
# exposes its vectorizer under "engine".
ENCODERS: typing.Dict[str, typing.Dict[str, typing.Callable]] = dict(
    transformer_embeds=dict(
        embed_train=lambda texts: remote_encoding(texts),
        embed_test=lambda texts: remote_encoding(texts),
    ),
    tfidf=dict(
        engine=tfidf,
        embed_train=lambda texts: tfidf.fit_transform(texts),
        embed_test=lambda texts: tfidf.transform(texts),
    ),
)

In [9]:
# Classifier classes under comparison, keyed by the label used in the result
# table. They are stored uninstantiated; the benchmark loop calls each one
# with no arguments to obtain a fresh estimator.
CLASSIFIERS: typing.Dict[str, typing.Callable] = dict(
    svc=LinearSVC,
    random_forest=RandomForestClassifier,
    ada_boost=AdaBoostClassifier,
    decision_tree=DecisionTreeClassifier,
    k_neighbors=KNeighborsClassifier,
    mlp=MLPClassifier,
)

In [None]:
# One score table per encoder; the tables are combined in the next cell.
results: typing.List[pandas.DataFrame] = []

for encoder_label, encoder in ENCODERS.items():

    # Embed the train split first so a stateful encoder (TF-IDF) is fitted
    # before it transforms the test split.
    embed_train = encoder["embed_train"](DATA_TRAIN["text"].tolist())
    embed_test = encoder["embed_test"](DATA_TEST["text"].tolist())

    reports = []
    for classifier_label, classifier in CLASSIFIERS.items():
        # Fresh estimator with default settings, trained on this encoder's features.
        fitted = classifier().fit(embed_train, DATA_TRAIN["label"].tolist())
        report = classification_report(
            DATA_TEST["label"].tolist(),
            fitted.predict(embed_test),
            zero_division=1.,
            output_dict=True,
        )
        # Tag the report so the rows can be indexed by (encoder, classifier).
        reports.append({**report, "classifier": classifier_label, "encoder": encoder_label})

    # Flatten the nested report dicts into dotted column names and keep only
    # the headline metrics.
    scores = pandas.json_normalize(data=reports)
    scores = scores.set_index(["encoder", "classifier"], drop=True)
    scores = scores.filter(
        items=["accuracy", "macro avg.f1-score", "weighted avg.f1-score"]
    )
    results.append(scores.sort_values(by="accuracy", ascending=False))

In [None]:
# Stack the per-encoder tables into a single leaderboard, best weighted F1 first.
(
    pandas.concat(results)
    .sort_values(by="weighted avg.f1-score", ascending=False)
)