In [1]:
import pandas
import sklearn.metrics

import cltrier_lib

In [2]:
# Relative path to the processed DefaktS Twitter CSV that holds the binary labels.
DATA_FILE: str = "../data/processed/DefaktS_Twitter.binary.csv"
# Number of rows randomly drawn from the file for this evaluation run.
N_SAMPLES: int = 500

In [3]:
# Fixed seed for the row sampling below, so that the evaluated subset (and every
# count and metric derived from it) is the same after Restart Kernel -> Run All.
SAMPLE_SEED: int = 42

# Load the CSV using its first column as the index, turn the numeric labels into
# class-name strings, and draw a reproducible random subset of N_SAMPLES rows.
dataset: pandas.DataFrame = (
    pandas.read_csv(DATA_FILE, index_col=[0])
    # 0.0 / 1.0 in the `binary_label` column become the two class names.
    .replace(dict(binary_label={0.0: "neutral_post", 1.0: "possible_fake_news"}))
    .sample(n=N_SAMPLES, random_state=SAMPLE_SEED)
)
dataset.head()

Unnamed: 0_level_0,text,binary_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
391568,»Es kann nicht die Aufgabe eines Politikers se...,neutral_post
385422,#Schönbohm wehrt sich gegen Arbeitsverbot http...,neutral_post
392193,Erster Gesetzesvorschlag der AfD wird umgesetz...,possible_fake_news
385675,Niedersachsen droht Impfgegnern mit Bußgeld un...,neutral_post
389160,Seit 2015 immer wieder Angriffe mitten in D! D...,possible_fake_news


In [4]:
# Number of sampled rows per class, to see how balanced the drawn subset is.
class_counts = dataset["binary_label"].value_counts()
class_counts

binary_label
neutral_post          309
possible_fake_news    191
Name: count, dtype: int64

In [5]:
# Show the complete, untruncated tweet texts for the first few rows only.
# Dumping every sampled row floods the cell output and bloats the saved notebook.
list(dataset.head().itertuples())

[Pandas(Index=391568, text='»Es kann nicht die Aufgabe eines Politikers sein, die öffentliche Meinung abzuklopfen und dann das Populäre zu tun. Aufgabe des Politikers ist es, das Richtige zu tun und es populär zu machen.« Das sagte einst Bundespräsident Walter Scheel.', binary_label='neutral_post'),
 Pandas(Index=385422, text='#Schönbohm wehrt sich gegen Arbeitsverbot https://t.co/yVSljB0SP4', binary_label='neutral_post'),
 Pandas(Index=392193, text='Erster Gesetzesvorschlag der AfD wird umgesetzt - Bundestag wird der AfD zustimmen. https://t.co/LYMIoCQVSq via @YouTube. Natürlich nicht direkt. Wurde nachweislich ( siehe Video) 1:1 abgeschrieben und als eigener  Vorschlag der Ampel verkauft… immerhin: AfD wirkt 💜💜💜😎', binary_label='possible_fake_news'),
 Pandas(Index=385675, text='Niedersachsen droht Impfgegnern mit Bußgeld und Arbeitsverbot https://t.co/JSSHxaCF0G', binary_label='neutral_post'),
 Pandas(Index=389160, text='Seit 2015 immer wieder Angriffe mitten in D! Die #Willkommensku

In [6]:
# Inference pipeline from cltrier_lib, configured with the model tag given below.
# NOTE(review): the keyword is spelled `nmodel` -- confirm against the
# cltrier_lib Pipeline signature that this is the intended parameter name and
# not a typo for `model`.
inference = cltrier_lib.inference.Pipeline(nmodel="llama3.1:70b-instruct-q6_K")

# Base chat that holds only the system prompt: the task description, a list of
# characteristics, the output rule ("Respond only with the classname") and four
# labelled example tweets.
# NOTE(review): the prompt text contains typos ("charactericts", "tweets
# exhibits"); they are left untouched here because the prompt is runtime input
# to the model and changing it changes the model's behaviour.
# NOTE(review): the last listed characteristic (pure opinion / comics / satire)
# reads like the description of a separate category, yet the rule after the list
# maps every listed characteristic to possible_fake_news -- confirm intended.
instruction = cltrier_lib.inference.schemas.Chat(messages=[
    cltrier_lib.inference.schemas.Message(
        role="system",
        content=\
"""You are a specialized content analyzer focused on identifying potential misinformation in social media posts. Your task is to classify tweets into two categories: neutral_post or possible_fake_news.

Fake News Characteristics: 

- Disinformation exhibits a higher degree of contentual inconsistencies like semantic contradictions or logic errors throughout the text.
- The body of unreliable articles adds relatively little new information, but serves to repeat and enhance the claims made at the beginning.
- Unreliable articles frequently narrate in terms of a clear friend-foe-distinction with regard to specific national, ethical, or religious groups or elites as foes or perpetrators. The opposing group (often framed in a common "we", "ourselves", "the government") takes the part of the victim who needs to be protected. 
- Unreliable sources incline to use a more emotionally persuasive language and touch more often sensible subjects (like children, death and burial).
- Fake articles tend to be written in a hyperbolic way to attract the reader's attention, i.e. with a high usage of all-caps-words, exclamation marks or a general sentiment wording.
- Legitimate sources tend to report about past events whereas fake articles focus on highly recent topics.
- Fake articles use a higher amount of hedging words (like 'possibly', 'usually', 'tend to be') to achieve a more indirect form of expression. Also they evoke a feeling of uncertainty by addressing the vagueness of information directly. 
- Content that calls on supposedly scientific research or reputable institutions without identifying concrete sources or by manipulating them to create a false theory.
- Stories that lack any factual ground or manipulated information or image. The intention is to deceive and cause harm. Could be text or visual media. 
- Real information is being presented in a false context. The recipient is aware that the information is true, but he does not realize that the context has been changed.
- Stories without factual basis which usually explain important events as secret plots by government or powerful individuals. By definition their truthfulness is difficult to verify. Evidence refuting the conspiracy is regarded as further proof of the conspiracy.
- Information that is created by a political entity to influence public opinion and gain support for a public figure, organization or government.
- Posts that are pure opinion, comics, satire, or any other posts that do not make a factual claim. This is also the category to use for posts that are of the "Like this if you think..." variety.

If the tweets exhibits one of the above described charactericts classify it possible_fake_news else classify it as neutral_post. Respond only with the classname. Omit justifications.

Examples:

Ich bin für ein Arbeitsverbot für Ungeimpfte und Impfgegner
neutral_post

Pädophile Regierung #IRGCterrorists
possible_fake_news

Hab noch schnell die Proben für den Giardientest zur Tierarztpraxis gebracht und wenn heute keine Katastrophen mehr passieren, ist für den Rest des Tages Ruhe angesagt
neutral_post

In Belgien hat nur die „#Partyszene“ wieder etwas gefeiert. \n\nKein Grund zur Aufregung!\n\nMachen Deutsche auch!\n\nHamburg #G7 und so.
possible_fake_news
"""
    )
])

In [10]:
# The two class names the model is instructed to answer with, and the class that
# is assumed whenever a reply cannot be mapped onto one of them.
VALID_CLASSES = ("neutral_post", "possible_fake_news")
FALLBACK_CLASS = "neutral_post"


def parse_response(raw):
    """Map a raw model reply onto a class name.

    Surrounding whitespace, quotes, backticks and periods are removed and the
    text is lower-cased before it is compared with VALID_CLASSES, so replies
    such as ' possible_fake_news\n' or '"Neutral_Post".' are still recognised.

    Args:
        raw: The content of the model's reply.

    Returns:
        The matching class name, or None when the reply is not a string or does
        not correspond to exactly one of the class names.
    """
    if not isinstance(raw, str):
        return None
    cleaned = raw.strip().strip("\"'`.").strip().lower()
    return cleaned if cleaned in VALID_CLASSES else None


labels = []
preds = []
# Raw replies that could not be parsed; kept so the amount of silent fallback
# (which pushes predictions towards FALLBACK_CLASS) can be inspected.
unparsed = []

for text, gold_label in zip(dataset["text"], dataset["binary_label"]):
    # NOTE(review): if Chat.add_message mutates `instruction` in place, every
    # request would also carry all previously classified tweets -- confirm
    # against cltrier_lib that it returns a fresh chat.
    raw_reply = inference(
        instruction.add_message(
            cltrier_lib.inference.schemas.Message(role="user", content=text)
        )
    )[-1].content

    prediction = parse_response(raw_reply)
    if prediction is None:
        unparsed.append(raw_reply)
        prediction = FALLBACK_CLASS

    labels.append(gold_label)
    preds.append(prediction)

print(f"{len(unparsed)} of {len(preds)} replies were not a valid class name and were counted as {FALLBACK_CLASS!r}")

In [11]:
# Per-class precision / recall / F1 of the collected predictions against the
# gold labels; zero_division=0.0 is passed so undefined scores are shown as 0.0.
report = sklearn.metrics.classification_report(labels, preds, zero_division=0.0)
print(report)

                    precision    recall  f1-score   support

      neutral_post       0.70      0.61      0.66       309
possible_fake_news       0.48      0.58      0.52       191

          accuracy                           0.60       500
         macro avg       0.59      0.60      0.59       500
      weighted avg       0.62      0.60      0.60       500

