# One-Class Fake News Classification - Jupyter Notebook

In [15]:
# 1. Imports
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class classifier and evaluation metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline

In [17]:
# 2. Locate and load the three dataset splits
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"


def _read_split(path):
    """Read one tab-separated split, parsing the `date` column as datetimes."""
    return pd.read_csv(path, sep='\t', parse_dates=["date"], dayfirst=False)


# Every split is read with the same options
df_train, df_val, df_test = (
    _read_split(path) for path in (train_path, val_path, test_path)
)

# Sanity check: (rows, columns) of each split
for caption, frame in (
    ("Train shape:", df_train),
    ("Validation shape:", df_val),
    ("Test shape:", df_test),
):
    print(caption, frame.shape)

Train shape: (30000, 6)
Validation shape: (6000, 6)
Test shape: (8267, 6)


In [18]:
# 3. Merge title and text into a single feature

def merge_text(row):
    """Combine a record's title and body into a single text feature.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing
            'title' and 'text' entries.

    Returns:
        str: The title, a space and a newline, then the body text.
        A missing entry (None / NaN) contributes an empty string, so
        the literal words "nan" or "None" never leak into the text
        that is later vectorized.
    """
    title, body = (
        "" if pd.isna(value) else value
        for value in (row['title'], row['text'])
    )
    return f"{title} \n{body}"

# Add the merged title+text column to every split (the frames are modified in place)
for df in (df_train, df_val, df_test):
    merged_column = df.apply(merge_text, axis=1)  # row-wise merge
    df['input_text'] = merged_column

In [19]:
# 4. Select the one-class training corpus and the evaluation inputs
# Only rows labelled 1 (real news) are kept for fitting the one-class model
is_real = df_train['label'] == 1
real_train = df_train.loc[is_real, 'input_text']

# Validation and test: merged text as input, dataset labels as ground truth
X_val = df_val['input_text']
y_val = df_val['label']
X_test = df_test['input_text']
y_test = df_test['label']

In [20]:
# 5. Configure the TF-IDF vectorizer
tfidf_options = dict(
    stop_words='english',   # built-in English stop-word list
    ngram_range=(1,2),      # unigrams and bigrams
    max_features=20_000,    # cap on vocabulary size
)
vectorizer = TfidfVectorizer(**tfidf_options)

In [27]:
# 6. Build pipeline: TF-IDF + IsolationForest
# Share of fake articles in the full training split (labels: 1 = real, 0 = fake)
raw_cont = 1 - df_train['label'].mean()

# IsolationForest accepts a float contamination only in (0, 0.5], or 'auto'.
# Cap the upper end, and fall back to 'auto' when the split has no fakes
# (raw_cont == 0) or is empty (raw_cont is NaN); either value would
# otherwise be rejected when the model is fitted.
contamination = min(raw_cont, 0.5) if raw_cont > 0 else 'auto'

# NOTE(review): contamination is documented as the proportion of outliers in
# the data the forest is fitted on, but this pipeline is fitted on real news
# only. A value near 0.5 therefore presumably places about half of the real
# training articles on the outlier side of the threshold. Consider a small
# value, or tuning the threshold on the validation split. TODO confirm.
pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', IsolationForest(
        n_estimators=100,
        contamination=contamination,
        random_state=42     # fixed seed for reproducible trees
    ))
])

In [29]:
# 7. Fit on real news only
# Fits both pipeline steps on `real_train`: the TF-IDF step learns its
# vocabulary from these articles and the IsolationForest is trained on the
# resulting vectors. No labels are passed to fit.
pipeline.fit(real_train)

In [31]:
# 8. Score the validation split
# IsolationForest.predict returns 1 for inliers and -1 for outliers
val_raw = pipeline.predict(X_val)
# Recode to the dataset's labels: inlier -> real (1), outlier -> fake (0)
val_preds = (val_raw == 1).astype(int)

# Metrics against the ground-truth labels
val_accuracy = accuracy_score(y_val, val_preds)
val_report = classification_report(y_val, val_preds, target_names=['Fake','Real'])
val_confusion = confusion_matrix(y_val, val_preds)

print("Validation Accuracy:", val_accuracy)
print(val_report)
print("Confusion Matrix:\n", val_confusion)

Validation Accuracy: 0.5553333333333333
              precision    recall  f1-score   support

        Fake       0.56      0.60      0.58      3089
        Real       0.54      0.51      0.53      2911

    accuracy                           0.56      6000
   macro avg       0.55      0.55      0.55      6000
weighted avg       0.55      0.56      0.55      6000

Confusion Matrix:
 [[1854 1235]
 [1433 1478]]


In [32]:
# 9. Score the held-out test split
test_raw = pipeline.predict(X_test)
# Same recoding as for validation: inlier (1) -> real (1), outlier (-1) -> fake (0)
test_preds = (test_raw == 1).astype(int)

test_accuracy = accuracy_score(y_test, test_preds)
test_report = classification_report(y_test, test_preds, target_names=['Fake','Real'])

print("Test Accuracy:", test_accuracy)
print(test_report)

Test Accuracy: 0.5492923672432564
              precision    recall  f1-score   support

        Fake       0.56      0.60      0.58      4284
        Real       0.53      0.50      0.51      3983

    accuracy                           0.55      8267
   macro avg       0.55      0.55      0.55      8267
weighted avg       0.55      0.55      0.55      8267



In [None]:
# 10. Collect test predictions and inspect misclassifications
# NOTE(review): both tables are only built in memory; nothing in this cell
# writes them to disk.

# Prediction table: raw article text with the predicted label (0=fake, 1=real)
submission = pd.DataFrame({
    'text': df_test['text'],
    'predicted_label': test_preds
})

# Text, ground truth and prediction side by side ...
labelled = pd.DataFrame({
    'text': df_test['text'],
    'true_label': y_test,
    'predicted_label': test_preds
})
# ... keeping only the rows where the two labels disagree
disagrees = labelled['true_label'] != labelled['predicted_label']
misclassified = labelled[disagrees]

print("First 5 misclassified samples:")
first_errors = misclassified.head(5)
print(first_errors)

First 5 misclassified samples:
                                                text  true_label  \
0  Donald Trump isn t exactly a stranger to makin...           0   
1  Donald Trump’s U.S. election victory may creat...           1   
2  A couple of quick questions come to mind when ...           0   
6  Johnny Carson must be rolling over in his grav...           0   

   predicted_label  
0                1  
1                0  
2                1  
4                1  
6                1  
