# OCC One-class classification

### Laden von Datensatz WELFake

In [1]:
import pandas as pd
import numpy as np
import os

# One-class models
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



path = os.path.join("../../../data", "Saurabh Shahane - Fake_News_Classification", "WELFake_Dataset.csv")

df = pd.read_csv(path)
df = df.rename(columns={'Title': 'title', 'Text': 'text', 'Label': 'label'})
df = df[df['text'].notna() & df['title'].notna()]
df['label'] = 1 - df['label'].astype(int)  # Flip labels: 0 → 1, 1 → 0
df = df[['title', 'text', 'label']]

X = df['title'] + '\n' + df['text']
y = df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Vectorize text
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec   = vectorizer.transform(X_val)

def evaluate_model(name, preds, y_true):
    print(f"== {name} ==")
    print(classification_report(y_true, preds, target_names=["Fake", "Real"]))

contamination = min(1 - df['label'].mean(), 0.5)

In [2]:
model_if = IsolationForest(n_estimators=100, contamination=contamination, random_state=42)
model_if.fit(X_train_vec)

preds_if = model_if.predict(X_val_vec)
preds_if = (preds_if == 1).astype(int)

evaluate_model('IsolationForest', preds_if, y_val)

== IsolationForest ==
              precision    recall  f1-score   support

        Fake       0.45      0.43      0.44      7302
        Real       0.42      0.44      0.43      7006

    accuracy                           0.43     14308
   macro avg       0.43      0.43      0.43     14308
weighted avg       0.44      0.43      0.43     14308



In [3]:
model_lof = LocalOutlierFactor(n_neighbors=20, contamination=contamination, novelty=True)
model_lof.fit(X_train_vec)
preds_lof = model_lof.predict(X_val_vec)
preds_lof = (preds_lof == 1).astype(int)
evaluate_model('LocalOutlierFactor', preds_lof, y_val)

== LocalOutlierFactor ==
              precision    recall  f1-score   support

        Fake       0.53      0.51      0.52      7302
        Real       0.51      0.52      0.51      7006

    accuracy                           0.52     14308
   macro avg       0.52      0.52      0.52     14308
weighted avg       0.52      0.52      0.52     14308



In [5]:
from sklearn.decomposition import PCA
from scipy.sparse import issparse

# 1. Fit PCA on dense TF‑IDF 
X_train_dense = X_train_vec.toarray() if issparse(X_train_vec) else X_train_vec
pca = PCA(n_components=0.95, random_state=42)
pca.fit(X_train_dense)

# 2. Compute reconstruction error on validation 
X_val_dense = X_val_vec.toarray() if issparse(X_val_vec) else X_val_vec
X_val_rec   = pca.inverse_transform(pca.transform(X_val_dense))
mse_val     = np.mean((X_val_dense - X_val_rec)**2, axis=1)

# 3. Threshold (e.g. bottom 80% = inliers)
thresh_pca  = np.percentile(mse_val, 80)
preds_pca   = (mse_val < thresh_pca).astype(int)

# 4. Evaluate
evaluate_model('PCA Reconstruction', preds_pca, y_val)

MemoryError: Unable to allocate 11.9 GiB for an array with shape (1600140000,) and data type float64

In [None]:
from sklearn.neighbors import NearestNeighbors

# 1. Fit KNN on dense TF‑IDF
X_train_dense = X_train_vec.toarray() if issparse(X_train_vec) else X_train_vec
knn = NearestNeighbors(n_neighbors=5)
knn.fit(X_train_dense)

# 2. Compute avg dist to 5 neighbors for validation
X_val_dense = X_val_vec.toarray() if issparse(X_val_vec) else X_val_vec
distances, _ = knn.kneighbors(X_val_dense)
avg_dist = distances.mean(axis=1)

# 3. Threshold (e.g. bottom 80% of distances considered inlier)
thresh_knn = np.percentile(avg_dist, 80)
preds_knn  = (avg_dist < thresh_knn).astype(int)

# 4. Evaluate
evaluate_model('KNN Distance', preds_knn, y_val)