# OCC One-class classification
Laden von Datensatz WELFake

In [27]:
import pandas as pd
import numpy as np
import os

# One-class models
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



# Location of the WELFake CSV, relative to the notebook's working directory.
path = os.path.join("../src/data", "Saurabh Shahane - Fake_News_Classification", "WELFake_Dataset.csv")

# Build the working frame in a single chain. Every step returns a new frame,
# so no column is assigned on a filtered slice (which is what triggers pandas'
# SettingWithCopy warning).
df = (
    pd.read_csv(path)
    .rename(columns={'Title': 'title', 'Text': 'text', 'Label': 'label'})
    .dropna(subset=['text', 'title'])  # keep rows that have both text and title
    # Flip labels: 0 → 1, 1 → 0 (evaluate_model reports 0 as "Fake", 1 as "Real")
    .assign(label=lambda frame: 1 - frame['label'].astype(int))
    .loc[:, ['title', 'text', 'label']]
    .sample(frac=0.2, random_state=42)  # 20 % subsample, fixed seed
)

# One document per article: title and body joined by a newline.
X = df['title'] + '\n' + df['text']
y = df['label']

# 80/20 train/validation split, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Vectorize text: the vocabulary is learned on the training split only and
# then applied unchanged to the validation split.
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec   = vectorizer.transform(X_val)

def evaluate_model(name, preds, y_true):
    """Print a per-class classification report for one model.

    Args:
        name: Label printed in the header line above the report.
        preds: Predicted labels, encoded as 0 (Fake) / 1 (Real).
        y_true: Ground-truth labels in the same encoding.

    Returns:
        None; the report is written to stdout.
    """
    print(f"== {name} ==")
    # labels=[0, 1] ties the two target names to their label values, so the
    # report still works when a model predicts only one class.
    print(classification_report(y_true, preds, labels=[0, 1], target_names=["Fake", "Real"]))

# Share of rows labelled 0 in the sampled frame, capped at 0.5.
# NOTE(review): this is computed on all of `df` (train + validation), not on
# the training split alone — confirm that this is intended.
fake_share = 1 - df['label'].mean()
contamination = min(fake_share, 0.5)

## Isolation forest

In [28]:
# Fit the isolation forest on the TF-IDF training matrix.
model_if = IsolationForest(
    n_estimators=100,
    contamination=contamination,
    random_state=42,
).fit(X_train_vec)

# Map the detector output to the notebook's label encoding:
# a prediction of 1 becomes 1, everything else becomes 0.
raw_if = model_if.predict(X_val_vec)
preds_if = np.where(raw_if == 1, 1, 0)

evaluate_model('IsolationForest', preds_if, y_val)

== IsolationForest ==
              precision    recall  f1-score   support

        Fake       0.45      0.43      0.44      7302
        Real       0.42      0.44      0.43      7006

    accuracy                           0.43     14308
   macro avg       0.43      0.43      0.43     14308
weighted avg       0.44      0.43      0.43     14308



## Local Outlier Factor

In [29]:
# novelty=True makes predict() available for data not seen during fit().
model_lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=contamination,
    novelty=True,
)
model_lof.fit(X_train_vec)

# A raw prediction of 1 is kept as 1; any other value is encoded as 0.
lof_is_inlier = model_lof.predict(X_val_vec) == 1
preds_lof = lof_is_inlier.astype(int)

evaluate_model('LocalOutlierFactor', preds_lof, y_val)

== LocalOutlierFactor ==
              precision    recall  f1-score   support

        Fake       0.53      0.51      0.52      7302
        Real       0.51      0.52      0.51      7006

    accuracy                           0.52     14308
   macro avg       0.52      0.52      0.52     14308
weighted avg       0.52      0.52      0.52     14308



## Nearest Neighbors

In [30]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# 1) Fit KNN on the sparse TF-IDF matrix with the cosine metric.
#    The metric has to be passed explicitly: without it NearestNeighbors
#    uses its default (Minkowski / Euclidean) distance.
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X_train_vec)

# 2) Distances from every validation document to its 5 nearest training documents
distances, _ = knn.kneighbors(X_val_vec)

# 3) Average neighbour distance as anomaly score (larger = more unusual)
avg_dist = distances.mean(axis=1)

# 4) Flag the 20% of validation documents with the largest scores as outliers.
#    NOTE(review): the cut-off is the 80th percentile of the validation scores
#    themselves, and 20% differs from `contamination` used by the other
#    detectors — confirm both choices.
thresh_knn = np.percentile(avg_dist, 80)
preds_knn  = (avg_dist < thresh_knn).astype(int)

# 5) Evaluate
evaluate_model('KNN Distance', preds_knn, y_val)


== KNN Distance ==
              precision    recall  f1-score   support

        Fake       0.59      0.23      0.33      7302
        Real       0.51      0.83      0.63      7006

    accuracy                           0.52     14308
   macro avg       0.55      0.53      0.48     14308
weighted avg       0.55      0.52      0.48     14308



# Local Outlier Factor (LOF) Hyperparametertuning
## Local Outlier Factor

### TODO Erklärung LOF

# Grid Search

In [32]:
# 1. Imports
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class classifier and evaluation metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import LocalOutlierFactor
import os

from scipy.stats import randint, uniform
from sklearn.experimental import enable_halving_search_cv  # Needed to enable
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Location of the WELFake CSV, relative to the notebook's working directory.
path = os.path.join("../src/data", "Saurabh Shahane - Fake_News_Classification", "WELFake_Dataset.csv")

# 1) Load and prepare the data in one chain. Each step returns a new frame,
#    so the label column is never assigned on a filtered slice (which is what
#    triggers pandas' SettingWithCopy warning).
df = (
    pd.read_csv(path)
    .sample(frac=0.1, random_state=42)  # 10 % subsample, drawn before filtering
    .rename(columns={'Title': 'title', 'Text': 'text', 'Label': 'label'})
    .dropna(subset=['text', 'title'])   # keep rows that have both text and title
    # Flip labels: 0 → 1, 1 → 0
    .assign(label=lambda frame: 1 - frame['label'].astype(int))
    .loc[:, ['title', 'text', 'label']]
)

# One document per article: title and body joined by a newline.
X = df['title'] + '\n' + df['text']
y = df['label']

# 2) Split into train / temp (30%) then val/test (each 15%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42
)

# 3) Build the pipeline
# NOTE(review): `pipeline` is not referenced by the manual grid search in the
# following cells, which builds its own vectorizer + LOF — confirm whether it
# is still needed.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # novelty=True lets LOF be used for prediction on unseen data
    ('lof', LocalOutlierFactor(novelty=True))
])

# 4) Evaluation function
def evaluate_lof_params(params):
    """Fit TF-IDF + LOF with one parameter set and score it on the validation split.

    Reads the notebook-level `X_train`, `X_val` and `y_val`.

    Args:
        params: dict with the keys 'tfidf__max_features', 'tfidf__ngram_range',
            'lof__n_neighbors' and 'lof__contamination'.

    Returns:
        Accuracy of the 0/1 predictions against `y_val`.
    """
    # build vectorizer + LOF (novelty mode, so predict() works on unseen data)
    vec = TfidfVectorizer(
        max_features=params['tfidf__max_features'],
        ngram_range=params['tfidf__ngram_range'],
        stop_words='english'
    )
    lof = LocalOutlierFactor(
        n_neighbors=params['lof__n_neighbors'],
        contamination=params['lof__contamination'],
        novelty=True
    )

    # Fit on the whole training split.
    # NOTE(review): X_train is not filtered by label here, so the detector is
    # fitted on every training document — confirm whether fitting on the
    # "Real" class only was intended.
    # The sparse TF-IDF matrix is passed as-is: LOF accepts sparse input, and
    # densifying a matrix with tens of thousands of columns only costs memory.
    lof.fit(vec.fit_transform(X_train))

    # transform validation with the vocabulary learned on the training split
    raw = lof.predict(vec.transform(X_val))     # +1=inlier→Real, -1=outlier→Fake
    y_pred = np.where(raw == 1, 1, 0)

    return accuracy_score(y_val, y_pred)

In [33]:
# Candidate values for the manual grid search; the keys use the
# `<step>__<parameter>` naming that evaluate_lof_params() reads.
param_grid = dict(
    tfidf__max_features=[20_000, 30_000],
    lof__n_neighbors=[20, 40],
    lof__contamination=[0.4, 0.5],
)

In [None]:
from itertools import product

# Every (max_features, n_neighbors, contamination) combination, in the order
# three nested loops over the same lists would visit them.
candidates = list(product(
    param_grid['tfidf__max_features'],
    param_grid['lof__n_neighbors'],
    param_grid['lof__contamination'],
))
# Counting the candidates keeps the progress total correct even if
# param_grid gains keys that this search does not iterate over.
total = len(candidates)

best_score  = -1
best_params = None

for i, (mf, nn, cont) in enumerate(candidates, start=1):
    p = {
        'tfidf__max_features': mf,
        'tfidf__ngram_range': (1,1),  # fixed to unigrams; not part of the grid
        'lof__n_neighbors': nn,
        'lof__contamination': cont
    }
    score = evaluate_lof_params(p)
    print(f"[{i}/{total}] {p} → Val acc: {score:.4f}")
    # Strict '>' keeps the first of several equally good parameter sets.
    if score > best_score:
        best_score, best_params = score, p.copy()

print("\nBest validation accuracy:", best_score)
print("Best parameters:", best_params)

[1/8] {'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 20, 'lof__contamination': 0.4} → Val acc: 0.5205
[2/8] {'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 20, 'lof__contamination': 0.5} → Val acc: 0.5419
[3/8] {'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 40, 'lof__contamination': 0.4} → Val acc: 0.5493
[4/8] {'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 40, 'lof__contamination': 0.5} → Val acc: 0.5549
[5/8] {'tfidf__max_features': 30000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 20, 'lof__contamination': 0.4} → Val acc: 0.5242


# TODO: Halving Search und Random Search ergänzen und alle Verfahren nach der Cross-Validation benennen/erklären

# Übergabe zu Vladi
# TODO: Platzhalter einfügen, damit weiß ist

# Auswahl der Datensätze zur Validierung (TODO: Diagramme anpassen, damit sie gut zu lesen sind)

In [None]:
import os
import pandas as pd

from datasets import load_dataset
import os
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
def load_datasets(basepath):
    """Load the fake-news corpora stored under ``basepath``.

    Args:
        basepath: Directory that contains the dataset folders read below.

    Returns:
        dict mapping the keys 'A'..'G' to one pandas DataFrame each:
        A - Aadya Singh: three semicolon-separated CSVs, concatenated.
        B - clmentbisaillon: True.csv + Fake.csv, rows whose text is
            "[empty]" removed.
        C - Hassan Amin: fake_or_real_news.csv, unmodified.
        D - Meg Risdal: fake.csv without null titles, English rows only.
        E - Ruchi Bhatia: news_articles.csv without placeholder titles and
            null texts, English rows only.
        F - Saurabh Shahane: WELFake_Dataset.csv, unmodified.
        G - bigFakeNews/dataFiltered.csv, unmodified.

    Raises:
        Whatever ``pd.read_csv`` raises for a missing or unreadable file, and
        KeyError if dataset E lacks a 'title' or 'text' column.
    """
    datasets = {}

    # A: Aadya Singh - evaluation.csv, test (1).csv, train (2).csv
    folder = os.path.join(basepath, "Aadya Singh  _fake-and_real_news")
    files = ["evaluation.csv", "test (1).csv", "train (2).csv"]
    dfs = [pd.read_csv(os.path.join(folder, f), sep=';') for f in files]
    datasets['A'] = pd.concat(dfs, ignore_index=True)

    # B: clmentbisaillon_Fakenews
    folder = os.path.join(basepath, "clmentbisaillon_Fakenews")
    true = pd.read_csv(os.path.join(folder, "True.csv"))
    fake = pd.read_csv(os.path.join(folder, "Fake.csv"))
    df3 = pd.concat([true, fake], ignore_index=True)
    if 'text' in df3.columns:
        df3 = df3[df3['text'] != "[empty]"]
    datasets['B'] = df3

    # C: Hassan Amin - fake_or_real_news.csv (the folder name ends in ".csv")
    datasets['C'] = pd.read_csv(
        os.path.join(basepath, "Hassan Amin-fake_or_real_news.csv", "fake_or_real_news.csv")
    )

    # D: Meg Risdal_fake_only
    folder = os.path.join(basepath, "Meg Risdal_fake_only")
    df7 = pd.read_csv(os.path.join(folder, "fake.csv"))
    # Drop null titles and non-English rows. A misspelled 'titel' column is
    # normalised first, so the null-title filter applies under either name.
    if 'titel' in df7.columns:
        df7 = df7.rename(columns={'titel': 'title'})
    if 'title' in df7.columns:
        df7 = df7.dropna(subset=['title'])
    if 'language' in df7.columns:
        df7 = df7[df7['language'].str.lower() == 'english']
    datasets['D'] = df7

    # E: Ruchi Bhatia - news_articles.csv (the folder name ends in ".csv")
    df8 = pd.read_csv(
        os.path.join(basepath, "Ruchi Bhatia_news_articles.csv", "news_articles.csv")
    )
    # Remove placeholder titles, rows without text, and non-English rows
    df8 = df8[~df8['title'].str.lower().isin(['no title', 'newsticker'])]
    df8 = df8[df8['text'].notna()]
    if 'language' in df8.columns:
        df8 = df8[df8['language'].str.lower() == 'english']
    datasets['E'] = df8

    # F: Saurabh Shahane - Fake_News_Classification (WELFake)
    datasets['F'] = pd.read_csv(
        os.path.join(basepath, "Saurabh Shahane - Fake_News_Classification", "WELFake_Dataset.csv")
    )

    # G: andyP/fake_news_en_opensources, read from bigFakeNews/dataFiltered.csv
    datasets['G'] = pd.read_csv(
        os.path.join(basepath, "bigFakeNews", "dataFiltered.csv")
    )

    return datasets

base_path = '../../data'
datasets = load_datasets(base_path)

# Normalised (stripped, lower-cased) title set per dataset; a dataset without
# a 'title' column gets an empty set. The comprehension keeps its loop
# variables local, so the notebook-level `df` is not overwritten.
title_sets = {
    name: (
        set(frame['title'].dropna().astype(str).str.strip().str.lower())
        if 'title' in frame.columns
        else set()
    )
    for name, frame in datasets.items()
}
dataset_names = list(title_sets)

# result.loc[a, b] = percentage of a's titles that also occur in b.
# The matrix is not symmetric because each row is normalised by its own size.
result = pd.DataFrame(index=dataset_names,
                      columns=dataset_names,
                      dtype=float)
for row_name in dataset_names:
    row_titles = title_sets[row_name]
    for col_name in dataset_names:
        if row_name == col_name:
            # Diagonal is fixed at 100%
            result.loc[row_name, col_name] = 100.0
        elif row_titles:
            shared = len(row_titles & title_sets[col_name])
            result.loc[row_name, col_name] = shared / len(row_titles) * 100
        else:
            # No titles in the row dataset: report 0% instead of dividing by zero
            result.loc[row_name, col_name] = 0.0

result = result.round(2)

# Cell annotations: overlap percentage on the first line, number of the row
# dataset's titles that are missing from the column dataset on the second.
annot = pd.DataFrame(index=dataset_names, columns=dataset_names)
for row_name in dataset_names:
    for col_name in dataset_names:
        if row_name == col_name:
            annot.loc[row_name, col_name] = "100%\n0"
        else:
            unmatched = len(title_sets[row_name] - title_sets[col_name])
            annot.loc[row_name, col_name] = f"{result.loc[row_name, col_name]:.2f}%\n{unmatched}"

# Plot heatmap
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(result.astype(float), annot=annot.values, fmt="",
            cmap='Blues', cbar_kws={'label': 'Übereinstimmung in %'}, ax=ax)
ax.set_title('Vergleich der Titel-Übereinstimmung zwischen Datasets\n'
             '(Zahl unter Prozent: Anzahl nicht-übereinstimmender Titel)')
ax.set_xlabel('Dataset')
ax.set_ylabel('Dataset')
fig.tight_layout()
plt.show()



## Entfernung der Datensätze A, B, C wegen Überschneidungen

In [None]:
# Datasets excluded from the comparison because their titles overlap.
remove_keys = ['A', 'B', 'C']

# errors='ignore' makes the cell safe to re-run: once the keys are gone, a
# second drop is a no-op instead of a KeyError.
result = result.drop(index=remove_keys, columns=remove_keys, errors='ignore')
annot  = annot.drop(index=remove_keys, columns=remove_keys, errors='ignore')

# Same heatmap as for the full matrix, now on the reduced one.
plt.figure(figsize=(15, 10))
sns.heatmap(result.astype(float), annot=annot.values, fmt="",
            cmap='Blues', cbar_kws={'label': 'Übereinstimmung in %'})
plt.title('Vergleich der Titel-Übereinstimmung zwischen Datasets\n'
          '(Zahl unter Prozent: Anzahl nicht-übereinstimmender Titel)')
plt.xlabel('Dataset')
plt.ylabel('Dataset')
plt.tight_layout()
plt.show()