# Fake News Analysis and Classification

This notebook performs exploratory data analysis (EDA) on the fake news dataset, then trains, evaluates, and compares two classifiers: a logistic regression model and a DistilBERT model.

In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the parent directory to the path to import the models module
sys.path.append('..')
from models import load_data, LogisticModel, DistilBertModel

## 1. Load and Explore the Dataset

In [None]:
# Read the dataset through the project loader; the returned mapping is indexed
# by 'full_data' here and by 'train' / 'test' in the modelling cells.
data = load_data('../data/fake_news_dataset.csv')
df = data['full_data']

# Report the overall size first
print(f"Dataset shape: {df.shape}")

# Then the class balance, as absolute counts and as percentages
label_counts = df['label'].value_counts()
print(f"\nClass distribution:\n{label_counts}")

label_share = df['label'].value_counts(normalize=True) * 100
print(f"\nClass distribution (percentage):\n{label_share}")

In [None]:
# Preview the first five rows of the full dataset
df.head(n=5)

## 2. Exploratory Data Analysis

In [None]:
# Text length distribution
# `.str.len()` yields NaN for missing text instead of raising TypeError the way
# `apply(len)` does; this cell runs before the null check, so missing values
# may still be present. `assign` returns a new frame, so the frame stored in
# `data['full_data']` is not modified by this cell.
df = df.assign(text_length=df['text'].str.len())

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='text_length', hue='label', bins=50, kde=True, ax=ax)
ax.set_title('Text Length Distribution by Class')
ax.set_xlabel('Text Length')
ax.set_ylabel('Count')
# The legend generated by seaborn from `hue` is kept as-is. Passing a bare list
# of labels to `plt.legend` pairs them with artists purely by position, which
# can attach 'Real News' / 'Fake News' to the wrong class.
plt.show()

In [None]:
# --- Inspect -----------------------------------------------------------------

# 1. Check for missing or null values
print("Checking for missing values:")
missing_values = df.isnull().sum()
print(missing_values)

# 2. Check label distribution and ensure binary classes
print("\nLabel distribution before cleaning:")
print(df['label'].value_counts())
print("\nUnique labels:", df['label'].unique())

# 3. Check for duplicates
duplicate_count = df.duplicated(['text', 'label']).sum()
print(f"\nNumber of duplicate entries (same text and label): {duplicate_count}")

# --- Clean -------------------------------------------------------------------
# NOTE(review): the modelling cells train on data['train'] / data['test'], not
# on this cleaned `df` - confirm whether `load_data` applies equivalent cleaning.

# Remove rows with a missing value in any column
df = df.dropna()

# Ensure binary labels (0 for Real, 1 for Fake)
if df['label'].dtype == object:
    label_map = {'Real': 0, 'Fake': 1}
    # `.map` turns any value missing from the mapping into NaN without warning,
    # so fail loudly if the column holds something other than 'Real' / 'Fake'.
    unknown_labels = sorted(set(df['label'].unique()) - set(label_map), key=str)
    if unknown_labels:
        raise ValueError(f"Unexpected label values: {unknown_labels}")
    # `assign` builds a new frame rather than writing into the result of
    # `dropna()`, which avoids pandas' SettingWithCopyWarning.
    df = df.assign(label=df['label'].map(label_map))

# Remove duplicates
df = df.drop_duplicates(subset=['text', 'label'])

# Show cleaned data stats
print("\nAfter cleaning:")
print(f"Total samples: {len(df)}")
print("\nLabel distribution after cleaning:")
print(df['label'].value_counts())



## 3. Train and Evaluate Logistic Regression Model

In [None]:
# Convert string labels to numeric BEFORE training. Previously the model was
# also fitted once ahead of this conversion and then rebuilt and refitted
# afterwards; that first fit saw the unconverted labels and was discarded, so
# it has been removed.
if data['train']['labels'].dtype == object:  # Check if labels are strings
    # Map 'Real' to 0 and 'Fake' to 1 on both splits
    label_map = {'Real': 0, 'Fake': 1}
    for split in ('train', 'test'):
        data[split]['labels'] = data[split]['labels'].map(label_map)

# Initialize and train the Logistic Regression model (single fit)
logistic_model = LogisticModel(max_features=10000, preprocess=True)
logistic_model.train(data['train']['texts'], data['train']['labels'])

# Evaluate the model on the held-out split
logistic_metrics = logistic_model.evaluate(data['test']['texts'], data['test']['labels'])

# Display every metric except the confusion matrix, which is plotted below
print("Logistic Regression Model Metrics:")
for metric, value in logistic_metrics.items():
    if metric != 'confusion_matrix':
        print(f"{metric.capitalize()}: {value:.4f}")

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    logistic_metrics['confusion_matrix'],
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Real', 'Fake'],
    yticklabels=['Real', 'Fake'],
    ax=ax,
)
ax.set_title('Confusion Matrix - Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Save the model
logistic_model.save('../models/logistic_model.pkl')

## 4. Train and Evaluate DistilBERT Model

In [None]:
# Build the DistilBERT model and fit it on the training split.
# Note: This may take some time to run
distilbert_model = DistilBertModel(max_length=128, batch_size=16, epochs=2)
distilbert_model.train(data['train']['texts'], data['train']['labels'])

# Score the fitted model on the test split
distilbert_metrics = distilbert_model.evaluate(data['test']['texts'], data['test']['labels'])

# Print every reported metric except the confusion matrix (plotted below)
print("DistilBERT Model Metrics:")
reported = {
    name: score
    for name, score in distilbert_metrics.items()
    if name != 'confusion_matrix'
}
for name, score in reported.items():
    print(f"{name.capitalize()}: {score:.4f}")

# Confusion matrix heatmap, drawn on an explicit axes object
class_names = ['Real', 'Fake']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    distilbert_metrics['confusion_matrix'],
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - DistilBERT')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Persist the trained model
distilbert_model.save('../models/distilbert_model')

## 5. Compare Model Performance

In [None]:
# Metrics to compare and the display name of each model
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
models = ['Logistic Regression', 'DistilBERT']

# Pair each display name with the metrics dict from its evaluation cell,
# then pull out the selected metrics in a fixed order.
results_by_model = dict(zip(models, (logistic_metrics, distilbert_metrics)))
performance = {
    model_name: [model_metrics[key] for key in metrics]
    for model_name, model_metrics in results_by_model.items()
}

# One row per metric (capitalized for display), one column per model
performance_df = pd.DataFrame(
    performance,
    index=[key.capitalize() for key in metrics],
)

# Display the comparison
performance_df

In [None]:
# Grouped bar chart of the comparison table: one group per metric, one bar per model
ax = performance_df.plot.bar(figsize=(12, 6))
ax.set(
    title='Model Performance Comparison',
    xlabel='Metric',
    ylabel='Score',
    ylim=(0, 1),
)
ax.legend(title='Model')
# Dashed horizontal guide lines make the scores easier to read off
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

## 6. Example Predictions

In [None]:
# How many test examples to show and how much of each text to preview
n_examples = 5
preview_chars = 100

# Get some examples from the test set
examples = data['test']['texts'].iloc[:n_examples].tolist()
true_labels = data['test']['labels'].iloc[:n_examples].tolist()

# Make predictions with both models
logistic_preds = logistic_model.predict(examples)
distilbert_preds = distilbert_model.predict(examples)

# Display the results side by side. The ellipsis is appended only when the
# text was actually cut; previously short texts were also shown with a
# trailing '...', suggesting content had been dropped.
results = pd.DataFrame({
    'Text': [
        text if len(text) <= preview_chars else text[:preview_chars] + '...'
        for text in examples
    ],
    'True Label': true_labels,
    'Logistic Prediction': logistic_preds,
    'DistilBERT Prediction': distilbert_preds
})

results