In [27]:
import os
import pandas as pd

def load_datasets(basepath):
    """
    Load and clean individual fake news datasets from subfolders in basepath.
    Returns a dict of pandas DataFrames keyed by dataset name.
    """
    datasets = {}

    # 1. Aadya Singh_Fakenews: evaluation.csv, test (1).csv, train (2).csv
    folder = os.path.join(basepath, "Aadya Singh  _fake-and_real_news")
    files = ["evaluation.csv", "test (1).csv", "train (2).csv"]
    dfs = [pd.read_csv(os.path.join(folder, f), sep=';') for f in files]
    datasets['Aadya_Singh'] = pd.concat(dfs, ignore_index=True)

    # 2. Bhavik Jikadara - Fake News Detection
    folder = os.path.join(basepath, "Bhavik Jikadara - Fake News Detection")
    fake = pd.read_csv(os.path.join(folder, "fake.csv"))
    true = pd.read_csv(os.path.join(folder, "true.csv"))
    datasets['Bhavik_Jikadara'] = pd.concat([fake, true], ignore_index=True)

    # 3. clmentbisaillon_Fakenews
    folder = os.path.join(basepath, "clmentbisaillon_Fakenews")
    true = pd.read_csv(os.path.join(folder, "True.csv"))
    fake = pd.read_csv(os.path.join(folder, "Fake.csv"))
    df3 = pd.concat([true, fake], ignore_index=True)
    if 'text' in df3.columns:
        df3 = df3[df3['text'] != "[empty]"]
    datasets['clmentbisaillon'] = df3

    # 4. ErfanMoosaviMonazzah - fake-news-detection-dataset-English
    folder = os.path.join(basepath, "ErfanMoosaviMonazzah - fake-news-detection-dataset-English")
    parts = ["test.tsv", "train.tsv", "validation.tsv"]
    dfs = [pd.read_csv(os.path.join(folder, p), sep='\t') for p in parts]
    datasets['ErfanMoosaviMonazzah'] = pd.concat(dfs, ignore_index=True)

    # 5. GonzaloA - fake_news
    folder = os.path.join(basepath, "GonzaloA - fake_news")
    parts = ["test.csv", "train.csv", "evaluation.csv"]
    dfs = [pd.read_csv(os.path.join(folder, p), sep=';') for p in parts]
    datasets['GonzaloA'] = pd.concat(dfs, ignore_index=True)

    # 6. Hassan Amin - fake_or_real_news.csv
    datasets['Hassan_Amin'] = pd.read_csv(
        os.path.join(basepath, "Hassan Amin-fake_or_real_news.csv/fake_or_real_news.csv")
    )

    # 7. Meg Risdal_fake_only
    folder = os.path.join(basepath, "Meg Risdal_fake_only")
    df7 = pd.read_csv(os.path.join(folder, "fake.csv"))
    # Drop null titles and non-English
    if 'titel' in df7.columns:
        df7 = df7.dropna(subset=['titel'])
        df7 = df7.rename(columns={'titel': 'title'})
    if 'language' in df7.columns:
        df7 = df7[df7['language'].str.lower() == 'english']
    datasets['Meg_Risdal'] = df7

    # 8. Ruchi Bhatia_news_articles.csv
    df8 = pd.read_csv(os.path.join(basepath, "Ruchi Bhatia_news_articles.csv/news_articles.csv"))
    # Clean entries
    df8 = df8[~df8['title'].str.lower().isin(['no title', 'newsticker'])]
    df8 = df8[df8['text'].notna()]
    if 'language' in df8.columns:
        df8 = df8[df8['language'].str.lower() == 'english']
    datasets['Ruchi_Bhatia'] = df8

    # 9. Saurabh Shahane - Fake_News_Classification
    datasets['Saurabh_Shahane'] = pd.read_csv(
        os.path.join(basepath, "Saurabh Shahane - Fake_News_Classification", "WELFake_Dataset.csv")
    )

    return datasets

In [31]:
import os
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

base_path = '../../data'

datasets = load_datasets(base_path)

for name, df in datasets.items():
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} columns")
    
# Berechne Übereinstimmungsmatrix
result = pd.DataFrame(index=datasets.keys(), columns=datasets.keys(), dtype=float)
dataset_names = list(datasets.keys())

for i, a in enumerate(dataset_names):
    for b in dataset_names[i+1:]:
        titles_a = datasets[a]
        titles_b = datasets[b]

        if len(titles_a) == 0:
            score_ab = 0.0
        else:
            inter_ab = len(titles_a.intersection(titles_b))
            score_ab = (inter_ab / len(titles_a)) * 100

        if len(titles_b) == 0:
            score_ba = 0.0
        else:
            inter_ba = len(titles_b.intersection(titles_a))
            score_ba = (inter_ba / len(titles_b)) * 100

        result.loc[a, b] = score_ab
        result.loc[b, a] = score_ba

for name in dataset_names:
    result.loc[name, name] = 100.0

result = result.round(2)

# Erstelle Annotationen mit Prozent und Anzahl nicht-übereinstimmender Einträge
annot = result.copy().astype(str)

for i, a in enumerate(dataset_names):
    for j, b in enumerate(dataset_names):
        if a == b:
            annot.loc[a, b] = f"100%\n0"
        else:
            titles_a = datasets[a]
            titles_b = datasets[b]
            inter_ab = titles_a.intersection(titles_b)
            unmatched_count = len(titles_a - titles_b)  # Aus Sicht von Dataset a
            percent = result.loc[a, b]
            annot.loc[a, b] = f"{percent:.2f}%\n{unmatched_count}"

# Plot
plt.figure(figsize=(15, 10))
sns.heatmap(result, annot=annot.values, fmt="", cmap='Blues', cbar_kws={'label': 'Übereinstimmung in %'})
plt.title('Vergleich der Titel-Übereinstimmung zwischen Datasets\n(Zahl unter Prozent: Anzahl nicht-übereinstimmender Titel)')
plt.xlabel('Dataset')
plt.ylabel('Dataset')
plt.tight_layout()
plt.show()


Aadya_Singh: 40587 rows, 4 columns
Bhavik_Jikadara: 44898 rows, 4 columns
clmentbisaillon: 44898 rows, 4 columns
ErfanMoosaviMonazzah: 44267 rows, 6 columns
GonzaloA: 40587 rows, 4 columns
Hassan_Amin: 6335 rows, 4 columns
Meg_Risdal: 12403 rows, 20 columns
Ruchi_Bhatia: 1786 rows, 12 columns
Saurabh_Shahane: 72134 rows, 4 columns


AttributeError: 'DataFrame' object has no attribute 'intersection'