# One-Class Fake News Classification with Hyperparameter Tuning

In [23]:
# 1. Imports
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class classifier and evaluation metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline


In [25]:
# 2. Paths to dataset splits
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"

# Read datasets
df_train = pd.read_csv(train_path, sep='\t', parse_dates=["date"], dayfirst=False)
df_val   = pd.read_csv(val_path,   sep='\t', parse_dates=["date"], dayfirst=False)
df_test  = pd.read_csv(test_path,  sep='\t', parse_dates=["date"], dayfirst=False)

# Merge title and text into a single feature
def merge_text(row):
    return f"{row['title']} \n{row['text']}"

for df in [df_train, df_val, df_test]:
    df['input_text'] = df.apply(merge_text, axis=1)

# Prepare data splits
X_train_raw, y_train = df_train['input_text'], df_train['label']
X_val_raw,   y_val   = df_val['input_text'],   df_val['label']
X_test_raw,  y_test  = df_test['input_text'],  df_test['label']

X_train_real_raw = X_train_raw[y_train == 1]

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import accuracy_score
import numpy as np

# 3) Define grid
param_grid = {
    'tfidf__max_features': [5_000, 10_000, 20_000],
    'tfidf__ngram_range' : [(1,1), (1,2)],
    'lof__n_neighbors'   : [5, 10, 20],
    'lof__contamination' : [0.01, 0.05, 0.1]
}

In [29]:

def evaluate_lof_params(params):
    # build vectorizer + LOF
    vec = TfidfVectorizer(
        max_features=params['tfidf__max_features'],
        ngram_range=params['tfidf__ngram_range'],
        stop_words='english'
    )
    lof = LocalOutlierFactor(
        n_neighbors=params['lof__n_neighbors'],
        contamination=params['lof__contamination'],
        novelty=True
    )
    # FIT on real-only data
    Xtr = vec.fit_transform(X_train_real_raw)
    lof.fit(Xtr.toarray())
    
    # EVAL on mixed val set
    Xv = vec.transform(X_val_raw)
    raw = lof.predict(Xv.toarray())        # +1=inlier (real), -1=outlier (fake)
    y_pred = np.where(raw == 1, 1, 0)
    
    return accuracy_score(y_val, y_pred)

In [None]:
# 5) brute‐force search
best_score  = -1
best_params = None
total = np.prod([len(v) for v in param_grid.values()])
i = 0

for mf in param_grid['tfidf__max_features']:
    for ngr in param_grid['tfidf__ngram_range']:
        for nn in param_grid['lof__n_neighbors']:
            for cont in param_grid['lof__contamination']:
                i += 1
                p = {
                    'tfidf__max_features': mf,
                    'tfidf__ngram_range': ngr,
                    'lof__n_neighbors': nn,
                    'lof__contamination': cont
                }
                score = evaluate_lof_params(p)
                print(f"[{i}/{total}] {p} → Val acc: {score:.4f}")
                if score > best_score:
                    best_score, best_params = score, p.copy()

print("\n✅ Best validation accuracy:", best_score)
print("📋 Best parameters:", best_params)

In [21]:

# --- final model retrain ---
final_vec = TfidfVectorizer(
    max_features=best_params['tfidf__max_features'],
    ngram_range=best_params['tfidf__ngram_range'],
    stop_words='english'
)
X_real_combined = pd.concat([X_train_real_raw, X_val_raw[y_val == 1]])  # only real from train+val
X_combined_vec = final_vec.fit_transform(X_real_combined)

final_clf = LocalOutlierFactor(
    n_neighbors=best_params['lof__n_neighbors'],
    contamination=best_params['lof__contamination'],
    novelty=True
)
final_clf.fit(X_combined_vec.toarray())

# PREDICT on test
X_test_vec = final_vec.transform(X_test_raw)
raw_test = final_clf.predict(X_test_vec.toarray())
test_preds = np.where(raw_test == 1, 1, 0)

print("Test Accuracy:", accuracy_score(y_test, test_preds))
print(classification_report(y_test, test_preds, target_names=['Fake','Real']))

MemoryError: Unable to allocate 1.23 GiB for an array with shape (8267, 20000) and data type float64