# Fake News Detection - Model Development

This notebook covers:
1. Data loading and exploration
2. Text preprocessing
3. Model training
4. Evaluation
5. Model saving

In [23]:
# Install setuptools first
!pip install setuptools wheel

# Install required packages
!pip install pandas numpy scikit-learn nltk matplotlib seaborn

# Download required NLTK data
import nltk

# Download all necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Put the parent directory on the import path before loading the local `app` package
sys.path.append('..')

from app.text_processor import TextPreprocessor
from app.model_trainer import ModelTrainer

## 1. Load and Explore Data

In [25]:
# Load dataset, falling back through progressively more permissive encodings.
# cp1252 is tried before latin-1 on purpose: latin-1 maps every possible byte to a
# character and therefore never raises UnicodeDecodeError, so any fallback listed
# after it would be unreachable. latin-1 stays last as the catch-all.
RAW_DATA_PATH = '../data/raw/news_dataset.csv'
for encoding in ('utf-8', 'cp1252', 'latin-1'):
    try:
        df = pd.read_csv(RAW_DATA_PATH, encoding=encoding)
        break
    except UnicodeDecodeError:
        continue

# Convert Label to binary (TRUE -> 1, FALSE -> 0).
# Note: any value other than the exact string 'TRUE' (including missing values)
# becomes 0.
df['Label'] = (df['Label'] == 'TRUE').astype(int)

# Display dataset information
print("\nDataset columns:")
print(df.columns.tolist())
print("\nLabel distribution:")
print(df['Label'].value_counts())
print(f"Dataset shape: {df.shape}")
df.head()


Dataset columns:
['id', 'Statement', 'Image', 'Web', 'Category', 'Date', 'Label']

Label distribution:
Label
1    37800
0    18914
Name: count, dtype: int64
Dataset shape: (56714, 7)


Unnamed: 0,id,Statement,Image,Web,Category,Date,Label
0,2,"WHO praises India's Aarogya Setu app, says it ...",https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,COVID-19,Oct-20,1
1,3,"In Delhi, Deputy US Secretary of State Stephen...",https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,VIOLENCE,Oct-20,1
2,4,LAC tensions: China's strategy behind delibera...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,TERROR,Oct-20,1
3,5,India has signed 250 documents on Space cooper...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,COVID-19,Oct-20,1
4,6,Tamil Nadu chief minister's mother passes away...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,ELECTION,Oct-20,1


## 2. Preprocess Text Data

In [42]:
# Initialize preprocessor
preprocessor = TextPreprocessor()

# Run the preprocessor over the 'Statement' column. Later cells read
# df['cleaned_text'], so process_data is expected to add that column.
df = preprocessor.process_data(df, text_column='Statement')

# Save processed data. DataFrame.to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('../data/processed', exist_ok=True)
df.to_csv('../data/processed/cleaned_data.csv', index=False)
print("Preprocessing complete and data saved.")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/codespace/nltk_data'
    - '/home/codespace/.python/current/nltk_data'
    - '/home/codespace/.python/current/share/nltk_data'
    - '/home/codespace/.python/current/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## 3. Train Model

In [None]:
# Features are the cleaned statements; the target is the binary Label column
X, y = df['cleaned_text'], df['Label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion only, passed as a frame with the
# 'cleaned_text' / 'label' columns
trainer = ModelTrainer()
train_frame = pd.DataFrame({'cleaned_text': X_train, 'label': y_train})
trainer.train(train_frame)

# Write the trained artifacts to the models directory
trainer.save_model('../models/classifier.joblib', '../models/vectorizer.joblib')

## 4. Evaluate Model

In [None]:
# Predict once per test statement and keep both the label and the confidence,
# so the example section below can reuse them instead of predicting again.
predictions = [trainer.predict(text) for text in X_test]
y_pred = [prediction[0] for prediction in predictions]

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix. Labels are fixed to [0, 1] so the matrix is always 2x2
# and the tick names line up with the Label encoding (0 = FALSE, 1 = TRUE).
class_names = ['FALSE', 'TRUE']
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Print some example predictions (at most five; fewer if the test set is smaller,
# which avoids an IndexError on tiny splits)
print("\nExample Predictions:")
for i in range(min(5, len(X_test))):
    text = X_test.iloc[i]
    true_label = y_test.iloc[i]
    pred_label, conf = predictions[i]
    print(f"\nStatement: {text[:100]}...")
    print(f"True Label: {'TRUE' if true_label == 1 else 'FALSE'}")
    print(f"Predicted: {'TRUE' if pred_label == 1 else 'FALSE'} (Confidence: {conf:.2f})")