# One-Class Fake News Detection with Multiple Anomaly Models

In [2]:
!pip install tensorflow



In [3]:
# 1. Imports
import pandas as pd
import numpy as np
from scipy.sparse import issparse

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class models
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.mixture import GaussianMixture
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K

# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [4]:
# 2. Paths and Data Loading
# All three splits live in the same dataset folder, so the folder is spelled
# out once and the individual file paths are derived from it.
DATA_DIR = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English"
train_path = f"{DATA_DIR}/train.tsv"
val_path   = f"{DATA_DIR}/validation.tsv"
test_path  = f"{DATA_DIR}/test.tsv"

# Read every split in full (no sub-sampling happens here). Files are
# tab-separated and the "date" column is parsed into datetimes with
# dayfirst=False.
df_train = pd.read_csv(train_path, sep='\t', parse_dates=["date"], dayfirst=False)
df_val   = pd.read_csv(val_path,   sep='\t', parse_dates=["date"], dayfirst=False)
df_test  = pd.read_csv(test_path,  sep='\t', parse_dates=["date"], dayfirst=False)

# Merge title and text
def merge_text(row):
    """Combine one record's title and body into a single string.

    ``row`` is anything indexable by the keys ``'title'`` and ``'text'``.
    The title comes first, followed by a space, a newline, and the body.
    """
    title, body = row['title'], row['text']
    return "{} \n{}".format(title, body)

# Add the merged title/body column to each split (the frames are modified
# in place).
for df in (df_train, df_val, df_test):
    df['input_text'] = df.apply(merge_text, axis=1)

# One-class setup: the training texts are restricted to rows whose label
# equals 1 (treated as real news); validation and test keep every row and
# their labels for evaluation.
X_train = df_train.loc[df_train['label'].eq(1), 'input_text']

X_val = df_val['input_text']
y_val = df_val['label']

X_test = df_test['input_text']
y_test = df_test['label']


In [5]:
# 3. Vectorize text
# TF-IDF over unigrams and bigrams with English stop words removed and the
# vocabulary capped at 20,000 features. The vectorizer is fitted on the
# training texts only and then reused, unchanged, for validation and test.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=20000,
)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (vectorizer.transform(texts) for texts in (X_val, X_test))

In [6]:
# Share of training rows whose label is not 1 (labels are 0/1, so this is
# one minus the mean label), capped at 0.5. Used below as the
# `contamination` / `nu` hyper-parameter of the detectors.
# NOTE(review): computed on the full training split, while the detectors are
# fitted on label-1 rows only — confirm this is the intended setting.
fake_fraction = 1 - df_train['label'].mean()
contamination = min(fake_fraction, 0.5)


In [7]:
def to_dense_if_needed(X):
    """Return ``X`` as a dense array when it is a SciPy sparse matrix.

    Anything that is not sparse is handed back untouched (same object);
    sparse input is expanded with ``toarray()``.
    """
    if not issparse(X):
        return X
    return X.toarray()

In [8]:
def fit_model(name, model, X_train, epochs=10, batch_size=32):
    """Fit a one-class model on densified training features and return it.

    Parameters
    ----------
    name : str
        Model label. Only ``'Autoencoder'`` is special-cased; every other
        name (``'GaussianMixture'`` included) uses the one-argument fit.
    model : object
        Estimator exposing ``fit``. For ``'Autoencoder'`` it must accept
        ``fit(x, y, epochs=..., batch_size=..., verbose=...)``.
    X_train : array-like or scipy sparse matrix
        Training features; sparse input is converted to a dense array via
        ``to_dense_if_needed`` before fitting.
    epochs, batch_size : int, optional
        Forwarded to ``model.fit`` on the ``'Autoencoder'`` path only. The
        defaults (10 and 32) are the values that used to be hard-coded.

    Returns
    -------
    object
        The same ``model`` instance that was passed in, after fitting.
    """
    X_dense = to_dense_if_needed(X_train)

    if name == 'Autoencoder':
        # Reconstruction objective: the input doubles as the target.
        model.fit(X_dense, X_dense, epochs=epochs, batch_size=batch_size, verbose=0)
    else:
        # All remaining models share the plain single-argument fit, so no
        # per-model branch is needed.
        model.fit(X_dense)

    return model

In [9]:
import numpy as np

def predict_validation(name, model, X_val, outlier_percent=20):
    """Return 0/1 predictions for ``X_val`` (1 = inlier, 0 = outlier).

    Parameters
    ----------
    name : str
        Model label selecting the scoring rule:

        * ``'Autoencoder'`` — per-row mean squared error between ``X_val``
          and ``model.predict(X_val)``; rows strictly below the
          ``100 - outlier_percent`` percentile of that error are inliers.
        * ``'GaussianMixture'`` — ``model.score_samples(X_val)``; rows
          strictly above the ``outlier_percent`` percentile are inliers.
        * anything else — ``model.predict(X_val)``, mapped to 1 where the
          prediction equals 1 and to 0 otherwise.
    model : object
        Fitted model exposing the method required by the chosen rule.
    X_val : array-like or scipy sparse matrix
        Features to score; sparse input is densified first.
    outlier_percent : float, optional
        Percentage (0-100) of the scored batch to flag as outliers on the
        two percentile-based paths. Defaults to 20, i.e. the previous
        hard-coded 80/20 cut-offs. Values outside 0-100 are rejected by
        ``np.percentile``.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one per row of ``X_val``.

    Notes
    -----
    On the percentile paths the threshold is computed from the scores of
    ``X_val`` itself, so roughly ``outlier_percent`` percent of the batch is
    labelled 0 regardless of how many outliers it really contains.
    """
    X_val = to_dense_if_needed(X_val)

    if name == 'Autoencoder':
        recon = model.predict(X_val)
        # Higher reconstruction error = more anomalous.
        mse = np.mean(np.power(X_val - recon, 2), axis=1)
        threshold = np.percentile(mse, 100 - outlier_percent)
        preds = (mse < threshold).astype(int)
    elif name == 'GaussianMixture':
        # Lower score = more anomalous, hence the mirrored percentile.
        scores = model.score_samples(X_val)
        threshold = np.percentile(scores, outlier_percent)
        preds = (scores > threshold).astype(int)
    else:
        # Collapse the model's own predictions to 1 (== 1) / 0 (anything else).
        preds = model.predict(X_val)
        preds = (preds == 1).astype(int)

    return preds


In [10]:

def evaluate_model(name, preds, y_true):
    """Print a header line and a classification report for one model.

    Parameters
    ----------
    name : str
        Label shown in the ``== name ==`` header.
    preds : array-like
        Predicted 0/1 labels.
    y_true : array-like
        Ground-truth 0/1 labels.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    print(f"== {name} ==")
    # target_names is positional, so pin the label order explicitly:
    # 0 -> "Fake", 1 -> "Real". Without `labels` the label set is inferred
    # from the data and the call fails if a class is absent from both inputs.
    print(classification_report(y_true, preds, labels=[0, 1], target_names=["Fake", "Real"]))

In [11]:
# Isolation Forest: fit on the real-news TF-IDF matrix, predict on the
# validation split, then print the report.
model_if = fit_model(
    'IsolationForest',
    IsolationForest(n_estimators=100, contamination=contamination, random_state=42),
    X_train_vec,
)
preds_if = predict_validation('IsolationForest', model_if, X_val_vec)
evaluate_model('IsolationForest', preds_if, y_val)


== IsolationForest ==
              precision    recall  f1-score   support

        Fake       0.56      0.60      0.58      3089
        Real       0.54      0.51      0.53      2911

    accuracy                           0.56      6000
   macro avg       0.55      0.55      0.55      6000
weighted avg       0.55      0.56      0.55      6000



In [None]:
from sklearn.svm import OneClassSVM

# One-Class SVM with an RBF kernel; `nu` reuses the contamination value.
model_ocsvm = fit_model(
    'OneClassSVM',
    OneClassSVM(kernel='rbf', gamma='auto', nu=contamination),
    X_train_vec,
)
preds_ocsvm = predict_validation('OneClassSVM', model_ocsvm, X_val_vec)
evaluate_model('OneClassSVM', preds_ocsvm, y_val)


In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor; novelty=True is set so the fitted model can be asked
# to predict on the separate validation split.
model_lof = fit_model(
    'LocalOutlierFactor',
    LocalOutlierFactor(n_neighbors=20, contamination=contamination, novelty=True),
    X_train_vec,
)
preds_lof = predict_validation('LocalOutlierFactor', model_lof, X_val_vec)
evaluate_model('LocalOutlierFactor', preds_lof, y_val)


In [None]:
from sklearn.decomposition import PCA

# PCA reconstruction-error detector.
# 1. Learn the principal subspace from the dense real-news TF-IDF matrix
#    (n_components=0.95, fixed random_state).
X_train_dense = to_dense_if_needed(X_train_vec)
pca = PCA(n_components=0.95, random_state=42).fit(X_train_dense)

# 2. Project the validation rows into that subspace and back, then take the
#    per-row mean squared reconstruction error.
X_val_dense = to_dense_if_needed(X_val_vec)
X_val_rec   = pca.inverse_transform(pca.transform(X_val_dense))
mse_val     = np.square(X_val_dense - X_val_rec).mean(axis=1)

# 3. Rows with error strictly below the 80th percentile are labelled 1
#    (inlier); the rest are labelled 0.
#    NOTE(review): the cut-off is taken from the validation errors
#    themselves, so about 20% of rows are always flagged.
thresh_pca  = np.percentile(mse_val, 80)
preds_pca   = (mse_val < thresh_pca).astype(int)

# 4. Print the classification report.
evaluate_model('PCA Reconstruction', preds_pca, y_val)


In [None]:
from sklearn.neighbors import NearestNeighbors

# k-nearest-neighbour distance detector.
# 1. Index the dense real-news TF-IDF rows.
X_train_dense = to_dense_if_needed(X_train_vec)
knn = NearestNeighbors(n_neighbors=5)
knn.fit(X_train_dense)

# 2. Mean distance from each validation row to its 5 nearest training rows.
#    kneighbors returns (distances, indices); take element [0] rather than
#    unpacking the indices into `_`, which at notebook top level would
#    rebind IPython's last-output variable.
X_val_dense = to_dense_if_needed(X_val_vec)
distances = knn.kneighbors(X_val_dense)[0]
avg_dist = distances.mean(axis=1)

# 3. Rows whose mean distance is strictly below the 80th percentile are
#    labelled 1 (inlier); the rest are labelled 0.
#    NOTE(review): the cut-off is taken from the validation distances
#    themselves, so about 20% of rows are always flagged.
thresh_knn = np.percentile(avg_dist, 80)
preds_knn  = (avg_dist < thresh_knn).astype(int)

# 4. Print the classification report.
evaluate_model('KNN Distance', preds_knn, y_val)
