In [1]:
import ast
import os

import pandas as pd

# Root directory of the dataset; the CSV files are expected in its "Data" subfolder.
our_dataset_path = '.'

# Each path component is passed separately so that os.path.join inserts the
# separator of the current platform. A hard-coded backslash inside a component
# only resolves on Windows.
posts_path = os.path.join(our_dataset_path, 'Data', 'posts.csv')
fact_checks_path = os.path.join(our_dataset_path, 'Data', 'fact_checks.csv')
fact_check_post_mapping_path = os.path.join(our_dataset_path, 'Data', 'pairs.csv')

# Fail early, with the offending path in the message, if a file is missing.
for path in [posts_path, fact_checks_path, fact_check_post_mapping_path]:
    print(f"Checking path: {path}")
    assert os.path.isfile(path), f"File not found: {path}"


Checking path: .\Data\posts.csv
Checking path: .\Data\fact_checks.csv
Checking path: .\Data\pairs.csv


In [2]:
def parse_col(s):
    """Turn the text of one CSV cell back into the Python literal it encodes.

    Falsy values (for example the empty string) are returned unchanged.
    Otherwise every raw newline character is replaced by the two-character
    escape sequence ``\\n`` and the result is evaluated with
    ``ast.literal_eval``.
    """
    if not s:
        return s
    escaped = s.replace('\n', '\\n')
    return ast.literal_eval(escaped)

# Fact checks: read the CSV, replace missing cells with '' and index the rows
# by 'fact_check_id'. The listed columns are then run through parse_col.
df_fact_checks = (
    pd.read_csv(fact_checks_path)
    .fillna('')
    .set_index('fact_check_id')
)
for col in ('claim', 'instances', 'title'):
    df_fact_checks[col] = df_fact_checks[col].apply(parse_col)


# Posts: same treatment, indexed by 'post_id'.
df_posts = (
    pd.read_csv(posts_path)
    .fillna('')
    .set_index('post_id')
)
for col in ('instances', 'ocr', 'verdicts', 'text'):
    df_posts[col] = df_posts[col].apply(parse_col)


# The pairs file is read as-is, without fillna or parsing.
df_fact_check_post_mapping = pd.read_csv(fact_check_post_mapping_path)

In [7]:
import csv

# Path to the CSV file
csv_datei = posts_path

# Open the CSV file and read the 'verdicts' column
with open(csv_datei, newline='', encoding='utf-8') as csvfile:
    # DictReader yields one dict per row, keyed by the header names
    reader = csv.DictReader(csvfile)

    # Distinct raw (unparsed) strings found in the 'verdicts' column
    verdicts_set = {row['verdicts'] for row in reader}

    # Print the distinct values in sorted order
    print("Mögliche 'verdicts'-Werte:")
    for verdict in sorted(verdicts_set):
        print(verdict)


Mögliche 'verdicts'-Werte:
[' Sensitive content ']
['Altered Photo/Video']
['Altered photo']
['Altered photo/video']
['Altered photo/video.']
['Altered video']
['False headline']
['False information and graphic content']
['False information', 'False information and graphic content']
['False information', 'False information.']
['False information', 'Missing Context']
['False information', 'Missing context']
['False information', 'Partly False']
['False information', 'Partly false information']
['False information', 'Partly false information.']
['False information']
['False information.', 'Partly false information']
['False information.']
['False', 'False information']
['False', 'Partly false information']
['False']
['Missing Context', 'Missing context']
['Missing Context']
['Missing context', 'Missing context.']
['Missing context', 'Partly false information']
['Missing context']
['Missing context.']
['Partly False', 'Partly false information']
['Partly False']
['Partly false information

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hyperparameters
batch_size = 64
num_epochs = 10
learning_rate = 0.001
input_size = 300  # size of one word embedding (e.g. GloVe 300d)
hidden_size = 512  # size of the hidden layer
max_length = 50  # maximum length of an input sequence (number of words)
vocab_size = 10000  # number of rows in the placeholder embedding table
seed = 42  # fixed seed so the placeholder embeddings are identical on every run

# Example word-embedding initialisation (stand-in for e.g. GloVe): uniform
# random values in [0, 1), drawn from a seeded generator for reproducibility.
embedding_matrix = np.random.default_rng(seed).random((vocab_size, input_size))

class FactCheckDataset(Dataset):
    """Dataset of (post, claim, label) triples served as stacked word embeddings.

    Each item is a dict with two entries:

    * ``'embedding'``: float32 tensor of shape ``(2 * max_length, embedding_dim)``.
      The first ``max_length`` rows are the post's token embeddings, the
      remaining rows the claim's. The tensor is two-dimensional; a consumer
      that expects one flat vector per sample has to flatten it.
    * ``'label'``: scalar tensor of dtype ``torch.long``.

    Args:
        posts: Sequence of post texts.
        claims: Sequence of claim texts, aligned with ``posts`` by position.
        labels: Sequence of integer labels, aligned with ``posts`` by position.
        tokenizer: Callable mapping a text to a sequence of integer row
            indices into ``embedding_matrix``.
        max_length: Number of tokens kept per text. Longer sequences are
            truncated, shorter ones are right-padded with index 0.
        embedding_matrix: Array-like of shape ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, posts, claims, labels, tokenizer, max_length, embedding_matrix):
        self.posts = posts
        self.claims = claims
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.embedding_matrix = embedding_matrix

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def _embed(self, text):
        """Tokenize ``text``, pad/truncate to ``max_length`` and look up the embeddings."""
        indices = list(self.tokenizer(text))[:self.max_length]
        # A non-positive multiplier yields an empty list, so full-length
        # sequences receive no padding.
        indices += [0] * (self.max_length - len(indices))
        # One vectorised row lookup instead of building a Python list of
        # per-token arrays, which torch.tensor converts slowly (and warns about).
        rows = np.asarray(self.embedding_matrix)[np.asarray(indices, dtype=np.intp)]
        return torch.as_tensor(rows, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return the embedding/label dict for the sample at position ``idx``."""
        post_embedding = self._embed(self.posts[idx])
        claim_embedding = self._embed(self.claims[idx])

        # Stack along the token axis: post rows first, claim rows second.
        combined_embedding = torch.cat((post_embedding, claim_embedding), dim=0)

        return {
            'embedding': combined_embedding,
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class FactCheckModel(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features of one flattened input sample.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # fully connected layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)  # output layer (one logit per class)

    def forward(self, x):
        """Return class logits for ``x``.

        Inputs whose last dimension already equals ``input_size`` are passed
        through unchanged. A batch with more than two dimensions whose last
        dimension does not match (e.g. ``(batch, tokens, embedding_dim)``) is
        flattened to ``(batch, -1)`` first; without this the first linear
        layer rejects it with a shape-mismatch error.
        """
        if x.dim() > 2 and x.size(-1) != self.fc1.in_features:
            x = x.flatten(start_dim=1)
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Simulated tokenizer: turns words into indices (a real tokenizer would be used in practice)
def simple_tokenizer(text):
    """Lower-case ``text``, split it on whitespace and map each word to an id.

    Words missing from the small example vocabulary are mapped to 0.
    """
    # Example vocabulary
    word_to_index = {
        "post": 1,
        "claim": 2,
        "fact": 3,
        "relevant": 4,
        "irrelevant": 5,
    }
    token_ids = []
    for token in text.lower().split():
        token_ids.append(word_to_index.get(token, 0))
    return token_ids

# Example data
posts = ["post text 1", "post text 2"]
claims = ["claim text 1", "claim text 2"]
labels = [1, 0]  # 1: relevant, 0: not relevant

# Build the dataset and its loader
dataset = FactCheckDataset(posts, claims, labels, simple_tokenizer, max_length, embedding_matrix)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialise the model; its first layer takes one flat vector of
# 2 * max_length * input_size features per sample.
model = FactCheckModel(input_size=2*max_length*input_size, hidden_size=hidden_size, num_classes=2)

# Optimiser and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()

        # The dataset yields (2 * max_length, input_size) per sample, so a
        # batch is 3-D. Flatten everything after the batch axis to match the
        # model's input layer (a no-op if the batch is already 2-D).
        embeddings = batch['embedding'].flatten(start_dim=1)
        # Separate name so the example `labels` list above is not overwritten.
        batch_labels = batch['label']

        # Forward pass
        outputs = model(embeddings)

        # Compute the loss
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimisation step
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss of this epoch
    print(f"Epoche {epoch + 1}/{num_epochs} - Verlust: {running_loss / max(len(train_loader), 1):.4f}")

# Function that computes the top-k most relevant claims for a given post
def get_top_k_relevant_claims(post, claims, k=10):
    """Rank ``claims`` by cosine similarity to ``post`` and return the best ``k``.

    Each text is tokenized with ``simple_tokenizer`` and represented by the
    mean of its token embeddings from ``embedding_matrix``. Mean pooling gives
    every text one vector of the same length, which is what a 2-D cosine
    similarity needs regardless of how many tokens each text has. A text
    without tokens is represented by the zero vector.

    Args:
        post: Post text.
        claims: Sequence of claim texts.
        k: Maximum number of claims to return.

    Returns:
        List of ``(claim, similarity)`` tuples, most similar first. The list
        is empty when ``k <= 0`` or ``claims`` is empty, and shorter than
        ``k`` when fewer claims are available.
    """
    # Negative slicing with k == 0 would select everything, so handle it here.
    if k <= 0 or len(claims) == 0:
        return []

    table = np.asarray(embedding_matrix)

    def _mean_embedding(text):
        """Return the mean token embedding of ``text`` (zeros if it has no tokens)."""
        indices = simple_tokenizer(text)
        if not indices:
            return np.zeros(table.shape[1], dtype=table.dtype)
        return table[np.asarray(indices, dtype=np.intp)].mean(axis=0)

    # cosine_similarity expects 2-D inputs: (1, dim) against (n_claims, dim).
    post_vector = _mean_embedding(post).reshape(1, -1)
    claim_vectors = np.vstack([_mean_embedding(claim) for claim in claims])
    similarities = cosine_similarity(post_vector, claim_vectors)[0]

    # Sort by similarity and keep the top-k indices, in descending order
    top_k_indices = np.argsort(similarities)[::-1][:k]
    return [(claims[idx], float(similarities[idx])) for idx in top_k_indices]

# Example: look up the most relevant claims for one post
# (k=2 because the example data contains only two claims)
post_example = "post text 1"
top_10_claims = get_top_k_relevant_claims(post=post_example, claims=claims, k=2)

print("Top 10 relevante Behauptungen:")
for ranked_pair in top_10_claims:
    claim, similarity = ranked_pair
    print(f"Behauptung: {claim} - Ähnlichkeit: {similarity:.4f}")
