In [5]:
# 0. Imports
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class classifier and evaluation metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline

In [7]:
# 1. Load data
# The three splits are tab-separated files sitting side by side in the dataset folder.
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"

# Read every split with identical options; the "date" column is parsed to datetimes.
# Unpacking consumes the generator in order: train, then validation, then test.
df_train, df_val, df_test = (
    pd.read_csv(split_path, sep='\t', parse_dates=["date"])
    for split_path in (train_path, val_path, test_path)
)

def merge_text(row):
    """Combine a record's title and body into a single string.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'title'`` and ``'text'`` (a DataFrame row,
        a dict, ...). Both fields are expected to be scalars.

    Returns
    -------
    str
        The title, followed by a space and a line break, followed by the text.
        A missing field (``None`` / ``NaN``) contributes an empty string rather
        than being formatted as the literal word ``nan`` / ``None``, which
        would otherwise end up as a spurious token in the text features.
    """
    def _or_empty(value):
        # pd.isna returns a plain bool for scalar input.
        return "" if pd.isna(value) else value

    return f"{_or_empty(row['title'])} \n{_or_empty(row['text'])}"
# Attach the merged title/body column to each split (adds a column to the frame in place).
for df in (df_train, df_val, df_test):
    df['input_text'] = df.apply(merge_text, axis='columns')

In [9]:

# 2. Prepare data and labels
# For each split: the merged text is the model input, the `label` column is the target.
X_train_raw, y_train = df_train['input_text'], df_train['label']
X_val_raw, y_val = df_val['input_text'], df_val['label']
X_test_raw, y_test = df_test['input_text'], df_test['label']

In [11]:
# 3. Vectorize text with TF-IDF
# Vocabulary is capped at 10,000 terms and English stop words are dropped.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)

# The vectorizer is fitted on the training split only; validation and test
# are projected onto that already-fitted vocabulary.
X_train = vectorizer.fit_transform(X_train_raw)
X_val, X_test = (
    vectorizer.transform(raw_split) for raw_split in (X_val_raw, X_test_raw)
)

In [16]:
from sklearn.neighbors import LocalOutlierFactor
# 4. Train Local Outlier Factor
# Set contamination to fraction of fake samples in the training set
def compute_contamination(y):
    """Return the fraction of entries in ``y`` that are equal to 1.

    Parameters
    ----------
    y : array-like
        One-dimensional collection of labels (Series, ndarray, list, ...).

    Returns
    -------
    float
        ``count(y == 1) / len(y)``, a value in ``[0.0, 1.0]``. The value is not
        clamped; scikit-learn's LocalOutlierFactor documents ``contamination``
        as a float in ``(0, 0.5]``, so a result outside that range will be
        rejected by the estimator.

    Raises
    ------
    ValueError
        If ``y`` is empty (the fraction is undefined).
    """
    labels = np.asarray(y)
    if labels.size == 0:
        raise ValueError("compute_contamination requires at least one label")
    # Vectorized count instead of summing booleans one element at a time.
    return float(np.count_nonzero(labels == 1) / labels.size)
# Share of training labels equal to 1, used as the expected outlier proportion.
# NOTE(review): this treats label 1 as the "fake" class — confirm against the
# dataset's label encoding.
contamination = compute_contamination(y_train)

# novelty=True is what makes predict() usable on unseen data (validation/test).
# NOTE(review): the model is fitted on every training row regardless of label;
# confirm that fitting on mixed-class data is intended for a novelty detector.
lof = LocalOutlierFactor(
    novelty=True,
    n_neighbors=20,
    contamination=contamination,
)
lof.fit(X_train)


In [18]:
# 5. Predict on validation and test sets
def predict_labels(model, X):
    """Convert a detector's inlier/outlier verdicts into 0/1 labels.

    Parameters
    ----------
    model : object
        Fitted estimator whose ``predict(X)`` yields -1 for outliers and 1
        for inliers (the LOF convention).
    X : array-like
        Feature matrix forwarded unchanged to ``model.predict``.

    Returns
    -------
    list of int
        1 (fake) wherever the verdict is -1, 0 (real) for anything else.
    """
    # int(True) == 1 and int(False) == 0, so the comparison itself is the label.
    return [int(verdict == -1) for verdict in model.predict(X)]

# Score the held-out splits with the fitted detector (validation first, then test).
y_val_pred, y_test_pred = (
    predict_labels(lof, features) for features in (X_val, X_test)
)

In [19]:

# 6. Evaluate performance
# Same two-line report for each held-out split: overall accuracy, then the
# per-class breakdown.
# NOTE(review): target_names assumes label 0 is "Real" and label 1 is "Fake" —
# confirm against the dataset's label encoding.
for heading, y_true, y_pred in (
    ("Validation Accuracy:", y_val, y_val_pred),
    ("Test Accuracy:", y_test, y_test_pred),
):
    print(heading, accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=['Real','Fake']))

Validation Accuracy: 0.5038333333333334
              precision    recall  f1-score   support

        Real       0.52      0.52      0.52      3089
        Fake       0.49      0.49      0.49      2911

    accuracy                           0.50      6000
   macro avg       0.50      0.50      0.50      6000
weighted avg       0.50      0.50      0.50      6000

Test Accuracy: 0.5011491472118059
              precision    recall  f1-score   support

        Real       0.52      0.51      0.51      4284
        Fake       0.48      0.49      0.49      3983

    accuracy                           0.50      8267
   macro avg       0.50      0.50      0.50      8267
weighted avg       0.50      0.50      0.50      8267

