# One-Class Fake News Classification with Hyperparameter Tuning

In [2]:
# 1. Imports
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class classifier and evaluation metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline


In [3]:
# 2. Paths to dataset splits
# Locations of the three TSV splits of the dataset.
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"


def _load_split(path):
    """Read one tab-separated split, parsing the ``date`` column as datetimes."""
    return pd.read_csv(path, sep='\t', parse_dates=["date"], dayfirst=False)


# Load every split with identical reader settings.
df_train, df_val, df_test = (
    _load_split(split_path) for split_path in (train_path, val_path, test_path)
)

# Merge title and text into a single feature
def merge_text(row):
    """Combine a record's title and body into one text feature.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``'title'`` and
            ``'text'`` entries.

    Returns:
        The string ``"<title> \\n<text>"``. A missing field (NaN/None) is
        replaced by an empty string so that the literal word "nan" or "None"
        does not leak into the text and become a spurious token downstream.
    """
    title = "" if pd.isna(row['title']) else row['title']
    text = "" if pd.isna(row['text']) else row['text']
    return f"{title} \n{text}"

# Add the merged title+text feature column to every split (in place).
for df in (df_train, df_val, df_test):
    df['input_text'] = df.apply(merge_text, axis=1)

# Prepare data splits.
# NOTE: despite its `y_` prefix, `y_train` holds the *texts* of the training
# rows whose label is 1 (real news) -- the only data used for one-class training.
y_train = df_train.loc[df_train['label'] == 1, 'input_text']

# Validation and test keep every row, together with the labels.
X_val = df_val['input_text']
y_val = df_val['label']
X_test = df_test['input_text']
y_test = df_test['label']



In [4]:
# 3. Hyperparameter grid definitions
# Search space: keys prefixed `tfidf__` configure the TF-IDF vectorizer,
# keys prefixed `clf__` configure the IsolationForest.
param_grid = dict(
    tfidf__max_features=[5000, 10000, 20000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__n_estimators=[50, 100, 200],
    clf__contamination=[0.1, 0.2, 0.3],
)


In [None]:
# 4. Manual grid search over validation set
from functools import lru_cache


@lru_cache(maxsize=1)
def _vectorize_train_val(max_features, ngram_range):
    """Fit TF-IDF on the real-news training texts and vectorize train and validation.

    The result depends only on the two vectorizer settings, so it is memoised:
    consecutive calls that change only the classifier settings reuse the same
    matrices instead of re-fitting the vectorizer. Only the most recent result
    is kept (``maxsize=1``) to bound memory use.

    NOTE: the cache is keyed on the arguments only; it reads the module-level
    `y_train` and `X_val`, so re-run this definition after reloading the data.

    Returns:
        Tuple ``(X_train_vec, X_val_vec)`` of TF-IDF matrices.
    """
    vec = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words='english'
    )
    X_train_vec = vec.fit_transform(y_train)
    X_val_vec = vec.transform(X_val)
    return X_train_vec, X_val_vec


def evaluate_params(params):
    """Train a one-class model with `params` and score it on the validation set.

    Args:
        params: Dict with keys ``'tfidf__max_features'``,
            ``'tfidf__ngram_range'``, ``'clf__n_estimators'`` and
            ``'clf__contamination'``.

    Returns:
        Validation accuracy of the predictions against `y_val`.
    """
    # tuple(...) keeps the cache key hashable even if a list was supplied.
    X_train_vec, X_val_vec = _vectorize_train_val(
        params['tfidf__max_features'],
        tuple(params['tfidf__ngram_range']),
    )
    clf = IsolationForest(
        n_estimators=params['clf__n_estimators'],
        contamination=params['clf__contamination'],
        random_state=42
    )
    # Fit on training (real news only) and predict on validation.
    raw_pred = clf.fit(X_train_vec).predict(X_val_vec)
    # Map the forest's output onto the dataset labels: a prediction of 1
    # becomes label 1 (real), anything else becomes label 0 (fake).
    y_pred = np.where(raw_pred == 1, 1, 0)
    return accuracy_score(y_val, y_pred)

from itertools import product

# Iterate grid: every combination of the values in `param_grid`, with the
# last key varying fastest (same order as nesting one loop per key).
grid_keys = list(param_grid)
grid_combos = list(product(*param_grid.values()))
total = len(grid_combos)

# Start below any attainable score so the first evaluated combination is
# always recorded, even if its accuracy is 0.
best_score = -np.inf
best_params = None
count = 0
for count, combo in enumerate(grid_combos, start=1):
    params = dict(zip(grid_keys, combo))
    score = evaluate_params(params)
    print(f"[{count}/{total}] Params: {params} -> Val Accuracy: {score:.4f}")
    # Strict '>' keeps the earliest combination on ties.
    if score > best_score:
        best_score = score
        best_params = params.copy()

print("\nBest validation score:", best_score)
print("Best parameters:", best_params)


[1/54] Params: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'clf__n_estimators': 50, 'clf__contamination': 0.1} -> Val Accuracy: 0.5120
[2/54] Params: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'clf__n_estimators': 50, 'clf__contamination': 0.2} -> Val Accuracy: 0.5388
[3/54] Params: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'clf__n_estimators': 50, 'clf__contamination': 0.3} -> Val Accuracy: 0.5508
[4/54] Params: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'clf__n_estimators': 100, 'clf__contamination': 0.1} -> Val Accuracy: 0.5230
[5/54] Params: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'clf__n_estimators': 100, 'clf__contamination': 0.2} -> Val Accuracy: 0.5477
[6/54] Params: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'clf__n_estimators': 100, 'clf__contamination': 0.3} -> Val Accuracy: 0.5683
[7/54] Params: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'clf__n_estimato

In [None]:
# 5. Final model training with best params on train+val and evaluation on test
# Combine the real-news training texts with the *real* validation texts only.
# The one-class model must see only the "normal" class; adding fake validation
# articles would contaminate it and would no longer match the data composition
# under which the hyperparameters were tuned.
combined_texts = pd.concat([y_train, X_val[y_val == 1]])

# Vectorizer and classifier with best parameters
final_vec = TfidfVectorizer(
    max_features=best_params['tfidf__max_features'],
    ngram_range=best_params['tfidf__ngram_range'],
    stop_words='english'
)
final_clf = IsolationForest(
    n_estimators=best_params['clf__n_estimators'],
    contamination=best_params['clf__contamination'],
    random_state=42
)

# Fit on combined real news (train + validation)
X_combined_vec = final_vec.fit_transform(combined_texts)
final_clf.fit(X_combined_vec)

# Evaluate on test
X_test_vec = final_vec.transform(X_test)
raw_test_pred = final_clf.predict(X_test_vec)
# A prediction of 1 maps to label 1 (real); anything else maps to label 0 (fake).
test_preds = np.where(raw_test_pred == 1, 1, 0)

# Pin the label order so the report and matrix keep their shape (and
# `target_names` stays aligned) even if one class is never predicted.
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print(classification_report(y_test, test_preds, labels=[0, 1], target_names=['Fake','Real']))
print("Confusion Matrix:\n", confusion_matrix(y_test, test_preds, labels=[0, 1]))



In [None]:
# 6. Assemble the prediction table and inspect misclassified test samples
# (this cell only builds the frames; nothing is written to disk here).
submission = df_test[['text']].copy()
submission['predicted_label'] = test_preds

# Pair each test article with its true and predicted label, then keep only
# the rows where the two disagree.
misclassified = (
    df_test[['text']]
    .assign(true_label=y_test, predicted_label=test_preds)
    .loc[lambda frame: frame['true_label'] != frame['predicted_label']]
)

print("First 5 misclassified samples:")
print(misclassified.head(5))
