# One-Class Fake News Classification with Hyperparameter Tuning

In [3]:
# 1. Imports
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class classifier and evaluation metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline


In [5]:
# 2. Paths to dataset splits
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"

# Read datasets
df_train = pd.read_csv(train_path, sep='\t', parse_dates=["date"], dayfirst=False)
df_val   = pd.read_csv(val_path,   sep='\t', parse_dates=["date"], dayfirst=False)
df_test  = pd.read_csv(test_path,  sep='\t', parse_dates=["date"], dayfirst=False)

# Merge title and text into a single feature
def merge_text(row):
    return f"{row['title']} \n{row['text']}"

for df in [df_train, df_val, df_test]:
    df['input_text'] = df.apply(merge_text, axis=1)

# Prepare data splits
y_train = df_train['input_text'][df_train['label'] == 1]  # only real news for one-class training
X_val, y_val = df_val['input_text'], df_val['label']
X_test, y_test = df_test['input_text'], df_test['label']



In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import accuracy_score
import numpy as np

# 3) Define grid
param_grid = {
    'tfidf__max_features': [5_000, 10_000, 20_000],
    'tfidf__ngram_range' : [(1,1), (1,2)],
    'lof__n_neighbors'   : [5, 10, 20],
    'lof__contamination' : [0.01, 0.05, 0.1]
}

In [13]:
# 4) Evaluation function
def evaluate_lof_params(params):
    # build vectorizer + LOF (novelty mode)
    vec = TfidfVectorizer(
        max_features=params['tfidf__max_features'],
        ngram_range=params['tfidf__ngram_range'],
        stop_words='english'
    )
    lof = LocalOutlierFactor(
        n_neighbors=params['lof__n_neighbors'],
        contamination=params['lof__contamination'],
        novelty=True
    )
    
    # fit on real‐only training
    Xtr = vec.fit_transform(y_train)            # X_train = real+fake ∪ but LOF novelty uses only X_train
    lof.fit(Xtr.toarray())                      # LOF requires dense when novelty=True
    
    # transform validation
    Xv = vec.transform(X_val)
    raw = lof.predict(Xv.toarray())             # +1=inlier→Real, -1=outlier→Fake
    y_pred = np.where(raw==1, 1, 0)
    
    return accuracy_score(y_val, y_pred)

In [15]:
# 5) brute‐force search
best_score  = -1
best_params = None
total = np.prod([len(v) for v in param_grid.values()])
i = 0

for mf in param_grid['tfidf__max_features']:
    for ngr in param_grid['tfidf__ngram_range']:
        for nn in param_grid['lof__n_neighbors']:
            for cont in param_grid['lof__contamination']:
                i += 1
                p = {
                    'tfidf__max_features': mf,
                    'tfidf__ngram_range': ngr,
                    'lof__n_neighbors': nn,
                    'lof__contamination': cont
                }
                score = evaluate_lof_params(p)
                print(f"[{i}/{total}] {p} → Val acc: {score:.4f}")
                if score > best_score:
                    best_score, best_params = score, p.copy()

print("\n✅ Best validation accuracy:", best_score)
print("📋 Best parameters:", best_params)

[1/54] {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 5, 'lof__contamination': 0.01} → Val acc: 0.4933
[2/54] {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 5, 'lof__contamination': 0.05} → Val acc: 0.5108
[3/54] {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 5, 'lof__contamination': 0.1} → Val acc: 0.5472
[4/54] {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 10, 'lof__contamination': 0.01} → Val acc: 0.4893
[5/54] {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 10, 'lof__contamination': 0.05} → Val acc: 0.5030
[6/54] {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 10, 'lof__contamination': 0.1} → Val acc: 0.5318
[7/54] {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'lof__n_neighbors': 20, 'lof__contamination': 0.01} → Val acc: 0.4850
[8/54] {'tfidf__max_features': 5000, 'tfidf__n

In [21]:
from sklearn.neighbors import LocalOutlierFactor

# 5. Final model training with best params on train+val and evaluation on test
# Combine train (real) and val (all) to retrain
combined_texts = pd.concat([y_train, X_val])

# Vectorizer with best parameters
final_vec = TfidfVectorizer(
    max_features=20000,
    ngram_range=best_params['tfidf__ngram_range'],
    stop_words='english'
)

# LocalOutlierFactor with best parameters (in novelty mode)
final_clf = LocalOutlierFactor(
    n_neighbors=best_params['lof__n_neighbors'],
#    contamination=best_params['lof__contamination'],
    contamination=0.5,
    novelty=True  # required for .predict() on new data
)

# Fit vectorizer on combined data
X_combined_vec = final_vec.fit_transform(combined_texts)

# LOF requires dense input when novelty=True
final_clf.fit(X_combined_vec.toarray())

# Transform and predict on test data
X_test_vec = final_vec.transform(X_test)
raw_test_pred = final_clf.predict(X_test_vec.toarray())  # +1=inlier, -1=outlier
test_preds = np.where(raw_test_pred == 1, 1, 0)  # 1=Real, 0=Fake

# Evaluation
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print(classification_report(y_test, test_preds, target_names=['Fake','Real']))
print("Confusion Matrix:\n", confusion_matrix(y_test, test_preds))


MemoryError: Unable to allocate 1.23 GiB for an array with shape (8267, 20000) and data type float64