# Random Forest - GonzaloA

In [2]:
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifier and evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities (optional)
from sklearn.pipeline import Pipeline

In [3]:
# Paths to your new dataset splits
train_path = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv"
val_path   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv"
test_path  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"

# Read with tab separator and parse the `date` column
df_train = pd.read_csv(train_path, sep='\t', parse_dates=["date"], dayfirst=False)
df_val   = pd.read_csv(val_path,   sep='\t', parse_dates=["date"], dayfirst=False)
df_test  = pd.read_csv(test_path,  sep='\t', parse_dates=["date"], dayfirst=False)

# Quick check
print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)

# Inspect columns
print(df_train.columns.tolist())


Train shape: (30000, 6)
Validation shape: (6000, 6)
Test shape: (8267, 6)
['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label']


In [4]:
# Display shape and columns of each DataFrame
print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)
print()
print("Train columns:", df_train.columns.tolist())

# Display the first 5 rows of the training set
print(df_train.head(5))

Train shape: (30000, 6)
Validation shape: (6000, 6)
Test shape: (8267, 6)

Train columns: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label']
   Unnamed: 0                                              title  \
0        2619  Ex-CIA head says Trump remarks on Russia inter...   
1       16043  YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...   
2         876  Federal Reserve governor Powell's policy views...   
3       19963  SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...   
4       10783  NANCY PELOSI ARROGANTLY DISMISSES Questions on...   

                                                text          subject  \
0  Former CIA director John Brennan on Friday cri...     politicsNews   
1  How did this man come to OWN this store? There...  Government News   
2  President Donald Trump on Thursday tapped Fede...     politicsNews   
3  Hillary Clinton ally David Brock is offering t...        left-news   
4  Pleading ignorance is a perfect ploy for Nancy...         politics   

  

In [5]:
# Merge text fields
def merge_text(row):
    return f"{row['title']} \n{row['text']}"

for df in [df_train, df_val, df_test]:
    df['input_text'] = df.apply(merge_text, axis=1)

# Define X and y
X_train, y_train = df_train['input_text'], df_train['label']
X_val,   y_val   = df_val['input_text'],   df_val['label']
X_test,  y_test  = df_test['input_text'],  df_test['label']  # y_test only for evaluation

In [6]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=20_000,    # Keep only the top 20,000 most frequent terms to limit dimensionality
    ngram_range=(1,2),      # Include both unigrams (single words) and bigrams (pairs of words)
    stop_words='english'    # Remove common English stop words (e.g., 'the', 'and')
)

# Fit the vectorizer on the training data and transform texts to sparse matrices
# fit_transform learns the vocabulary from X_train and then transforms X_train
X_train_tfidf = vectorizer.fit_transform(X_train)

# Use the already-fit vectorizer to transform validation and test sets
# transform uses the learned vocabulary to convert new texts into vectors
X_val_tfidf   = vectorizer.transform(X_val)
X_test_tfidf  = vectorizer.transform(X_test)

In [7]:
pipeline = Pipeline([
    ('tfidf', vectorizer),                  # transform raw text to TF-IDF features
    ('clf', RandomForestClassifier(
        n_estimators=100,                   # number of trees in the forest
        max_depth=None,                      # allow trees to grow until all leaves are pure
        random_state=42                      # ensure reproducibility
    ))
])

# Train the Random Forest pipeline on the training set
pipeline.fit(X_train, y_train)

In [8]:
# Predict
val_preds = pipeline.predict(X_val)

# Metrics
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds, target_names=['Real','Fake']))
print("Confusion Matrix:\n", confusion_matrix(y_val, val_preds))

Validation Accuracy: 0.9863333333333333
              precision    recall  f1-score   support

        Real       0.99      0.98      0.99      3089
        Fake       0.98      0.99      0.99      2911

    accuracy                           0.99      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       0.99      0.99      0.99      6000

Confusion Matrix:
 [[3036   53]
 [  29 2882]]


In [9]:
# Generate predictions on the test set using our trained pipeline
# `test_preds` will be an array of predicted labels: 0 = real, 1 = fake
test_preds = pipeline.predict(X_test)

# If the true labels for the test split (`y_test`) are available, evaluate performance:
# Print overall accuracy to see the fraction of correct predictions
print("Test Accuracy:", accuracy_score(y_test, test_preds))

# Print a detailed classification report (precision, recall, F1-score) for each class
print(classification_report(
    y_test,             # True labels
    test_preds,         # Predicted labels
    target_names=['Real','Fake']
))

Test Accuracy: 0.9879037135599371
              precision    recall  f1-score   support

        Real       0.99      0.99      0.99      4284
        Fake       0.98      0.99      0.99      3983

    accuracy                           0.99      8267
   macro avg       0.99      0.99      0.99      8267
weighted avg       0.99      0.99      0.99      8267



In [10]:
# Create a DataFrame pairing each test article with its predicted label
submission = pd.DataFrame({
    'text': df_test['text'],            # Raw text from the test set
    'predicted_label': test_preds       # Model output: 0 = real, 1 = fake
})
# Export the predictions to CSV for review or submission (no index column)
submission.to_csv('fake_news_predictions.csv', index=False)

# ---
# Create a DataFrame pairing each test article with its true and predicted labels
misclassified = pd.DataFrame({
    'text': df_test['text'],        # Raw text from the test set
    'true_label': y_test,           # True labels: 0 = real, 1 = fake
    'predicted_label': test_preds   # Predicted labels from the model
})
# Filter only the misclassified examples
misclassified = misclassified[misclassified['true_label'] != misclassified['predicted_label']]

# Save these misclassified samples to a separate CSV
misclassified.to_csv('misclassified_samples.csv', index=False)

# Print the first 5 misclassified samples directly
print("Misclassified samples:")
print(misclassified.head(5))

Misclassified samples:
                                                  text  true_label  \
138  The  Great Firewall of China    the world s mo...           1   
186  Liberian President Ellen Johnson Sirleaf said ...           1   
235  Britain s Prince Harry and U.S. actress Meghan...           1   
373  President-elect Donald Trump is throwing a pri...           0   
398  William Jackson remembers the exodus vividly, ...           1   

     predicted_label  
138                0  
186                0  
235                0  
373                1  
398                0  
