In [32]:
import typing

import pandas
import numpy
import sentence_transformers

import rich.progress

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

In [34]:
DATA_FILE: str = "../data/processed/DefaktS_Twitter.binary.csv"
TEST_FRAC: float = 0.05

In [35]:
DATA: pandas.DataFrame = (
    pandas.read_csv(DATA_FILE, index_col=[0])
    .replace(dict(binary_label={0.0: "neutral_post", 1.0: "possible_fake_news"}))
    .rename(columns={"binary_label": "label"})

    # remove urls
    .pipe(lambda _df: _df.assign(text=(
        _df["text"].str
        # replace urls with special token
        .replace(r"https?://\S+|www\.\S+", "[URL]")
    )))

    # downsample to smallest category
    .pipe(lambda _df: (
        _df
        .groupby("label")
        .sample(n=min(_df["label"].value_counts()))
    ))
)
DATA.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
406778,Gut!\n\nDr. Gut: «Elon Musk und die Feinde der...,neutral_post
387950,#Brennholz kann in #Bayern viel an fossilen #E...,neutral_post
427946,Es besteht keine Notwendigkeit für Ihre Expert...,neutral_post
391007,BUNDESPRÄSIDENT EIN BRÜLLER😉? https://t.co/GAc...,neutral_post
429117,"Die Chancen stehen gut, dass #Bitcoin selbst d...",neutral_post


In [36]:
DATA["label"].value_counts()

label
neutral_post          8225
possible_fake_news    8225
Name: count, dtype: int64

In [37]:
DATA_TRAIN = DATA.sample(frac=1.0 - TEST_FRAC)
DATA_TEST = DATA.loc[DATA.index.difference(DATA_TRAIN.index)]

len(DATA_TRAIN), len(DATA_TEST)

(15628, 822)

In [38]:
model = sentence_transformers.SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", device="cuda")

def embed(data: typing.List[str]) -> typing.List:

    def batched(iterable, n=128):
        l = len(iterable)

        return [
            iterable[ndx:min(ndx + n, l)]
            for ndx in range(0, l, n)
        ]

    embeds: typing.List[numpy.ndarray] = []

    for batch in rich.progress.track(batched(data)):
    
        try: 
            embed = model.encode(batch)
            
        except Exception as _e:
            display(_e)
            embed = None
        
        embeds.extend(embed)

    return embeds

In [39]:
ENCODERS: typing.Dict[str, typing.Dict[str, typing.Callable]] = {
    "tfidf": {
        "engine": (tfidf := TfidfVectorizer()),
        "embed_train": lambda x: tfidf.fit_transform(x),
        "embed_test": lambda x: tfidf.transform(x),
    },
    "transformer_embeds": {
        "embed_train": lambda x: embed(x),
        "embed_test": lambda x: embed(x),
    },
}

In [40]:
CLASSIFIERS: typing.Dict[str, typing.Callable] = {
    # "svc": LinearSVC,
    # "random_forest": RandomForestClassifier,
    # "ada_boost": AdaBoostClassifier,
    "decision_tree": DecisionTreeClassifier,
    "k_neighbors": KNeighborsClassifier,
    # "mlp": MLPClassifier,
}

In [41]:
results: typing.List[pandas.DataFrame] = []

for encoder_label, encoder in ENCODERS.items():

    embed_train = encoder["embed_train"](DATA_TRAIN["text"].tolist())
    embed_test = encoder["embed_test"](DATA_TEST["text"].tolist())

    results.append(
        pandas.json_normalize(
            data=[
                classification_report(
                    DATA_TEST["label"].tolist(),
                    (
                        classifier()
                        .fit(
                                embed_train, 
                                DATA_TRAIN["label"].tolist()
                            )
                        .predict(embed_test)
                    ),
                    zero_division=1.,
                    output_dict=True
                ) | {"classifier": classifier_label, "encoder": encoder_label}
                for classifier_label, classifier in CLASSIFIERS.items()
            ]
        )
        .set_index(["encoder", "classifier"], drop=True)
        .filter(
            items=[
                "accuracy",
                "macro avg.f1-score",
                "weighted avg.f1-score"
            ]
        )
        .sort_values(by="accuracy", ascending=False)
    )

Output()

Output()

In [42]:
pandas.concat(results).sort_values(by="weighted avg.f1-score", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,macro avg.f1-score,weighted avg.f1-score
encoder,classifier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tfidf,decision_tree,0.779805,0.779083,0.77939
transformer_embeds,k_neighbors,0.774939,0.774696,0.774876
tfidf,k_neighbors,0.739659,0.739399,0.739599
transformer_embeds,decision_tree,0.641119,0.641076,0.641172
