# Fake News Analysis and Classification (Google Colab edition - TSV Dataset)

This notebook performs exploratory data analysis (EDA) on the LIAR fake news dataset and evaluates different classification models.
It is designed to run on Google Colab so that training can use GPU acceleration, which matters most for the DistilBERT transformer model.

In [None]:
# Clone the repository
!git clone https://github.com/smaoui-me/fake-news-predictor.git

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Make the cloned repository importable (path to the repo root)
sys.path.append('/content/fake-news-predictor')

# Project helpers: dataset loader and the two model wrappers used below
from models import load_data, LogisticModel, DistilBertModel

# Report whether a CUDA device is visible and, if so, which one
gpu_available = torch.cuda.is_available()
print(f"GPU available: {gpu_available}")
if gpu_available:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

## Upload the TSV Dataset Files

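Upload `train.tsv`, `test.tsv`, and `valid.tsv` with the file picker opened by the next cell. The files must have exactly these names; uploaded files with any other name are not saved to the data directory.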

In [None]:
from google.colab import files

# Directory the dataset is loaded from, and the exact file names it must contain
data_dir = '/content/fake-news-predictor/data'
expected_files = ['train.tsv', 'test.tsv', 'valid.tsv']

# Create data directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)

# Upload the TSV files (opens the Colab file picker)
uploaded = files.upload()

# Save the recognised files to the data directory. Anything else is reported
# instead of being dropped silently, so a misnamed upload (for example a
# renamed duplicate such as "train (1).tsv") is easy to spot.
for filename, content in uploaded.items():
    if filename in expected_files:
        with open(f'{data_dir}/{filename}', 'wb') as f:
            f.write(content)
        print(f'Saved {filename} to /content/fake-news-predictor/data/')
    else:
        print(f'Skipped {filename}: expected one of {expected_files}')

# Check the resulting state of the data directory rather than just this upload,
# so files saved by an earlier run of this cell still count. This only warns
# (no exception) so the cell can simply be re-run to add the missing files.
missing_files = [
    name for name in expected_files
    if not os.path.exists(os.path.join(data_dir, name))
]
if missing_files:
    print(f'WARNING: still missing from {data_dir}: {missing_files} - re-run this cell to upload them')

## 1. Load and Explore the Dataset

In [None]:
# Load the dataset from TSV files
data = load_data('/content/fake-news-predictor/data', use_predefined_splits=True)
df = data['full_data']

# Display basic information
print(f"Dataset shape: {df.shape}")

# value_counts() renders as a multi-line Series. Print it on its own line below
# the header; embedding it in the header's f-string glued the first row onto
# the header text and broke the column alignment.
print("Original label distribution:")
print(df['label'].value_counts())
print("Original label distribution (percentage):")
print(df['label'].value_counts(normalize=True) * 100)

print("Binary label distribution:")
print(df['binary_label'].value_counts())
print("Binary label distribution (percentage):")
print(df['binary_label'].value_counts(normalize=True) * 100)

# Display split sizes
print(f"Train set size: {len(data['train']['df'])}")
print(f"Test set size: {len(data['test']['df'])}")
print(f"Validation set size: {len(data['valid']['df'])}")

In [None]:
# Preview the first five rows of the full dataset (rendered as the cell output)
df.head(n=5)

## 2. Exploratory Data Analysis

In [None]:
# Text length distribution
# .str.len() yields NaN for a missing statement instead of raising TypeError the
# way apply(len) does; missing values are only checked for in a later cell.
df['statement_length'] = df['statement'].str.len()

# Use readable class names as the hue so seaborn builds the legend itself.
# Overriding the legend with plt.legend([...]) assigns labels by artist order,
# which is not guaranteed to match the class order and can swap the labels.
# Assumes binary_label holds 0 (real) / 1 (fake), as the original legend text did.
class_names = {0: 'Real News (0)', 1: 'Fake News (1)'}
length_plot_df = df.assign(news_class=df['binary_label'].map(class_names))

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    data=length_plot_df,
    x='statement_length',
    hue='news_class',
    hue_order=list(class_names.values()),
    bins=50,
    kde=True,
    ax=ax,
)
ax.set_title('Statement Length Distribution by Class')
ax.set_xlabel('Statement Length')
ax.set_ylabel('Count')

# Replace the raw column name seaborn uses as the legend title
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Class')
plt.show()

In [None]:
# Columns that identify a usable, unique sample
key_columns = ['statement', 'binary_label']

# 1. Missing or null values per column
print("Checking for missing values:")
missing_values = df.isnull().sum()
print(missing_values)

# 2. Label distribution (original multi-class labels, then the binary labels)
original_labels = df['label']
print("Original label distribution:")
print(original_labels.value_counts())
print("Unique labels:", original_labels.unique())

print("Binary label distribution:")
print(df['binary_label'].value_counts())

# 3. Rows repeating an earlier (statement, binary_label) pair
duplicate_count = df.duplicated(key_columns).sum()
print(f"Number of duplicate entries (same statement and label): {duplicate_count}")

# Clean the data: drop rows with a missing key column, then drop repeated pairs
df = df.dropna(subset=key_columns).drop_duplicates(subset=key_columns)

# Show cleaned data stats
print("After cleaning:")
print(f"Total samples: {len(df)}")
print("Binary label distribution after cleaning:")
print(df['binary_label'].value_counts())

# Update the data dictionary with cleaned data
# NOTE(review): only data['full_data'] is replaced here; the 'train'/'test'/'valid'
# entries used for training and evaluation are not modified by this cell.
data['full_data'] = df

## 3. Train and Evaluate Logistic Regression Model

In [None]:
# Build the Logistic Regression model and fit it on the training split
logistic_model = LogisticModel(max_features=10000, preprocess=True)
train_split = data['train']
logistic_model.train(train_split['texts'], train_split['labels'])

# Score the fitted model on the held-out test split
test_split = data['test']
logistic_metrics = logistic_model.evaluate(test_split['texts'], test_split['labels'])

# Print every scalar metric; the confusion matrix is plotted separately below
print("Logistic Regression Model Metrics on Test Set:")
for name in logistic_metrics:
    if name == 'confusion_matrix':
        continue
    print(f"{name.capitalize()}: {logistic_metrics[name]:.4f}")

# Confusion matrix heatmap (rows: actual class, columns: predicted class)
class_ticks = ['Real', 'Fake']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    logistic_metrics['confusion_matrix'],
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_title('Confusion Matrix - Logistic Regression (Test Set)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Score the same model on the validation split
valid_split = data['valid']
logistic_val_metrics = logistic_model.evaluate(valid_split['texts'], valid_split['labels'])

# Print the scalar validation metrics
print("Logistic Regression Model Metrics on Validation Set:")
for name in logistic_val_metrics:
    if name == 'confusion_matrix':
        continue
    print(f"{name.capitalize()}: {logistic_val_metrics[name]:.4f}")

# Persist the trained model (creating the output directory first)
os.makedirs('/content/models', exist_ok=True)
logistic_model.save('/content/models/logistic_model.pkl')

## 4. Train and Evaluate DistilBERT Model

In [None]:
# Seed PyTorch's random number generators so repeated runs of this cell start
# from the same random state.
# NOTE(review): DistilBertModel may do its own seeding internally - confirm.
DISTILBERT_SEED = 42
torch.manual_seed(DISTILBERT_SEED)

# Initialize and train the DistilBERT model
# Note: This may take some time to run
distilbert_model = DistilBertModel(max_length=128, batch_size=16, epochs=2)
distilbert_model.train(data['train']['texts'], data['train']['labels'])

# Evaluate the model on test set
distilbert_metrics = distilbert_model.evaluate(data['test']['texts'], data['test']['labels'])

# Display metrics (the confusion matrix is plotted separately below)
print("DistilBERT Model Metrics on Test Set:")
for metric, value in distilbert_metrics.items():
    if metric != 'confusion_matrix':
        print(f"{metric.capitalize()}: {value:.4f}")

# Plot confusion matrix (rows: actual class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(
    distilbert_metrics['confusion_matrix'],
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Real', 'Fake'],
    yticklabels=['Real', 'Fake']
)
plt.title('Confusion Matrix - DistilBERT (Test Set)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Evaluate on validation set
distilbert_val_metrics = distilbert_model.evaluate(data['valid']['texts'], data['valid']['labels'])

# Display validation metrics
print("DistilBERT Model Metrics on Validation Set:")
for metric, value in distilbert_val_metrics.items():
    if metric != 'confusion_matrix':
        print(f"{metric.capitalize()}: {value:.4f}")

# Save the model
# Create the output directory here as well, so this cell does not depend on the
# Logistic Regression cell having been run first.
os.makedirs('/content/models', exist_ok=True)
distilbert_model.save('/content/models/distilbert_model')

## 5. Compare Model Performance

In [None]:
# Metrics to compare and the models being compared (in column order)
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
models = ['Logistic Regression', 'DistilBERT']

# One list of test-set scores per model, ordered like `metrics`
test_performance = {
    model_name: [model_scores[metric] for metric in metrics]
    for model_name, model_scores in zip(models, (logistic_metrics, distilbert_metrics))
}

# Table with one row per metric (capitalized names) and one column per model
metric_labels = [metric.capitalize() for metric in metrics]
test_performance_df = pd.DataFrame(test_performance, index=metric_labels)

# Show the table as the cell output
print("Model Performance Comparison on Test Set:")
test_performance_df

In [None]:
# Validation-set scores per model, ordered like `metrics`
logistic_val_scores = [logistic_val_metrics[metric] for metric in metrics]
distilbert_val_scores = [distilbert_val_metrics[metric] for metric in metrics]
val_performance = {
    'Logistic Regression': logistic_val_scores,
    'DistilBERT': distilbert_val_scores,
}

# Table with one row per metric; row labels are the capitalized metric names
val_performance_df = pd.DataFrame(val_performance, index=metrics).rename(index=str.capitalize)

# Show the table as the cell output
print("Model Performance Comparison on Validation Set:")
val_performance_df

In [None]:
def _draw_score_bars(frame, ax, title):
    """Draw `frame` as a grouped bar chart on `ax` with the shared score styling."""
    frame.plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Metric')
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)
    ax.legend(title='Model')
    ax.grid(axis='y', linestyle='--', alpha=0.7)


# Standalone figures: test set first, then validation set
standalone_charts = (
    (test_performance_df, 'Model Performance Comparison - Test Set'),
    (val_performance_df, 'Model Performance Comparison - Validation Set'),
)
for frame, chart_title in standalone_charts:
    _, single_ax = plt.subplots(figsize=(12, 6))
    _draw_score_bars(frame, single_ax, chart_title)
    plt.show()

# Combined figure: test (left) and validation (right) side by side
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
_draw_score_bars(test_performance_df, axes[0], 'Test Set Performance')
_draw_score_bars(val_performance_df, axes[1], 'Validation Set Performance')

plt.tight_layout()
plt.suptitle('Model Performance Comparison', fontsize=16, y=1.05)
plt.show()

## 6. Example Predictions

In [None]:
# Get some examples from the test set
examples = data['test']['texts'].iloc[:5].tolist()
true_labels = data['test']['labels'].iloc[:5].tolist()

# Get additional metadata for context
# (the same leading rows of the test DataFrame; assumes it is row-aligned with
# data['test']['texts'] - TODO confirm against load_data)
test_df = data['test']['df'].iloc[:5]
speakers = test_df['speaker'].tolist()
contexts = test_df['context'].tolist()
original_labels = test_df['label'].tolist()

# Make predictions with both models
logistic_preds = logistic_model.predict(examples)
distilbert_preds = distilbert_model.predict(examples)

# Map binary labels to text for better readability.
# Each value is cast to a plain int before the lookup so the mapping also works
# when predict() returns array or tensor scalars rather than Python ints.
label_text_map = {0: 'Real', 1: 'Fake'}
true_labels_text = [label_text_map[int(label)] for label in true_labels]
logistic_preds_text = [label_text_map[int(pred)] for pred in logistic_preds]
distilbert_preds_text = [label_text_map[int(pred)] for pred in distilbert_preds]

# Truncate long statements for display; the ellipsis is added only when text
# was actually cut off, so short statements are shown unchanged.
max_statement_chars = 100
statement_previews = [
    text if len(text) <= max_statement_chars else text[:max_statement_chars] + '...'
    for text in examples
]

# Display the results
results = pd.DataFrame({
    'Statement': statement_previews,
    'Speaker': speakers,
    'Context': contexts,
    'Original Label': original_labels,
    'Binary Label': true_labels_text,
    'Logistic Prediction': logistic_preds_text,
    'DistilBERT Prediction': distilbert_preds_text
})

results

## 7. Download Trained Models

In [None]:
# Download the trained models
from google.colab import files

# Compress the models directory
!zip -r /content/models.zip /content/models

# Download the compressed file
files.download('/content/models.zip')