# BERT/DistilBERT Model Training
## Fake Job Posting Detection

This notebook fine-tunes a pre-trained transformer (DistilBERT by default, optionally BERT) to detect fake job postings:
- Uses pre-trained DistilBERT (faster than BERT)
- Fine-tunes it on our dataset
- Evaluates it and compares it with the baseline model (TF-IDF + LR)


In [1]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Set project root.
# The original checkout location stays the default so existing setups keep
# working; set the FAKE_JOB_PROJECT_ROOT environment variable to run the
# notebook from another location without editing this cell.
DEFAULT_PROJECT_ROOT = r'D:\Data 641 NLP\Final Project'
project_root = Path(os.environ.get('FAKE_JOB_PROJECT_ROOT', DEFAULT_PROJECT_ROOT)).resolve()
if not project_root.is_dir():
    # Same exception type os.chdir would raise, but with an actionable message.
    raise FileNotFoundError(
        f"Project root not found: {project_root}. "
        "Set the FAKE_JOB_PROJECT_ROOT environment variable to the project directory."
    )
# All relative paths used later in the notebook (data/...) resolve from here.
os.chdir(project_root)

# Make the project's own modules (bert_model, ...) importable. Guarded so that
# re-running this cell does not append the same entry to sys.path again.
src_dir = str(project_root / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Install/verify required packages
import subprocess
import importlib

try:
    import accelerate
    print(f"accelerate version: {accelerate.__version__}")
except ImportError:
    # Install into the interpreter that runs this kernel (sys.executable),
    # not whatever `pip` happens to be first on PATH.
    print("Installing accelerate...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "accelerate>=0.26.0", "-q"])
    import accelerate
    print(f"accelerate installed: {accelerate.__version__}")

from bert_model import BERTModel
from sklearn.metrics import confusion_matrix, classification_report

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and metric warnings; narrow the filter if they matter.
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

# Seed the NumPy and PyTorch global RNGs before any model weights are created,
# so randomly initialised layers are the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")


accelerate version: 1.12.0
Using device: cpu


## 1. Load Processed Data


In [2]:
# Read the three pre-computed splits; paths are relative to the project root.
train_df = pd.read_csv('data/processed/train.csv')
val_df = pd.read_csv('data/processed/val.csv')
test_df = pd.read_csv('data/processed/test.csv')

# Report the size and class balance of each split (fraudulent: 0 = real, 1 = fake).
print("Data splits loaded:")
splits = {'Train': train_df, 'Validation': val_df, 'Test': test_df}
for position, (split_name, split_df) in enumerate(splits.items()):
    # Every split after the first is separated from the previous one by a blank line.
    separator = "\n" if position else ""
    n_real = int((split_df['fraudulent'] == 0).sum())
    n_fake = int((split_df['fraudulent'] == 1).sum())
    print(f"{separator}{split_name}: {len(split_df)} samples")
    print(f"  - Real: {n_real}, Fake: {n_fake}")


Data splits loaded:
Train: 12516 samples
  - Real: 11910, Fake: 606

Validation: 2682 samples
  - Real: 2552, Fake: 130

Test: 2682 samples
  - Real: 2552, Fake: 130


## 2. Prepare Data for Training


In [3]:
# Extract text and labels.
# pd.read_csv parses empty fields as NaN by default, so a posting whose
# combined text was empty would reach the model as a float instead of a str.
# Normalise the text columns so every sample is a (possibly empty) string.
# NOTE(review): presumably BERTModel tokenizes these values as text - confirm.
X_train = train_df['combined_text'].fillna('').astype(str)
y_train = train_df['fraudulent']

X_val = val_df['combined_text'].fillna('').astype(str)
y_val = val_df['fraudulent']

X_test = test_df['combined_text'].fillna('').astype(str)
y_test = test_df['fraudulent']

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")


Training samples: 12516
Validation samples: 2682
Test samples: 2682


## 3. Initialize BERT Model


In [4]:
# Build the classifier on top of the pre-trained DistilBERT checkpoint
# (faster than BERT with good performance). Swap the checkpoint name for
# 'bert-base-uncased' to trade slower training for better accuracy.
# num_labels=2: the two classes reported elsewhere as Real / Fake.
model = BERTModel(
    model_name='distilbert-base-uncased',
    num_labels=2,
)

for status_line in (
    "Model initialized: DistilBERT",
    "This will download the pre-trained model on first run.",
):
    print(status_line)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized: DistilBERT
This will download the pre-trained model on first run.


In [5]:
# Install/verify required packages
import subprocess
import sys
import importlib

# Deliberately no `--upgrade`: pip then leaves installs that already satisfy
# the requirements untouched and only adds what is missing (or replaces an
# accelerate older than 0.26.0). Upgrading on every run would change library
# versions underneath a kernel that has already imported them and make runs
# non-reproducible.
print("Installing/upgrading transformers[torch] and accelerate...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers[torch]", "accelerate>=0.26.0", "-q"])

# Let this kernel's import system see packages pip may have just installed.
importlib.invalidate_caches()

# Plain imports: these report the versions this kernel actually has loaded.
# Modules imported earlier in the session are NOT re-imported here; restart
# the kernel if pip replaced a package that was already loaded.
import accelerate
import transformers

print(f"accelerate version: {accelerate.__version__}")
print(f"transformers version: {transformers.__version__}")
print(f"Python executable: {sys.executable}")

# Reload bert_model to pick up the new packages.
# NOTE: objects created before this reload (such as a `model` built in an
# earlier cell) keep the class they were created from; re-create them if the
# reloaded module is meant to take effect.
if 'bert_model' in sys.modules:
    importlib.reload(sys.modules['bert_model'])
from bert_model import BERTModel


Installing/upgrading transformers[torch] and accelerate...
accelerate version: 1.12.0
transformers version: 4.57.3
Python executable: d:\Data 641 NLP\Final Project\venv\Scripts\python.exe


## 4. Train BERT Model


## Alternative: Use Google Colab with Free GPU

If training is too slow on your machine, you can:
1. Upload this notebook to Google Colab
2. Enable free GPU: Runtime → Change runtime type → GPU
3. Training will be 10-20x faster (15-30 minutes instead of hours)

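To use Colab:
- Upload the notebook, the data files, and the project's `src/` modules (the notebook imports `bert_model` from there)
- Point the project root in the first code cell at the Colab working directory
- Install the required packages in the first cell
- Run training on the GPU runtime, which is much faster than on CPU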


In [None]:
# Fine-tune the model on the training split, passing the validation split along.
# With a GPU this takes roughly 15-30 minutes; expect it to be much slower on CPU.
training_options = dict(
    output_dir='data/models/bert',  # location reported as the saved-model path in the summary
    num_epochs=3,
    batch_size=16,
    learning_rate=2e-5,
)

model.train(
    train_texts=X_train,
    train_labels=y_train,
    val_texts=X_val,
    val_labels=y_val,
    **training_options,
)


INFO:bert_model:Preparing training dataset...
INFO:bert_model:Preparing validation dataset...
INFO:bert_model:accelerate version: 1.12.0
INFO:bert_model:Starting training...


Epoch,Training Loss,Validation Loss


## 5. Evaluate on Validation Set


In [None]:
# Metrics on the validation split, as computed by the model wrapper
print("Validation Set Evaluation:")
val_metrics = model.evaluate(X_val, y_val)

# Predicted labels, needed separately to build the confusion matrix
y_val_pred = model.predict(X_val)
cm = confusion_matrix(y_val, y_val_pred)

# Heatmap of the confusion matrix: rows are true labels, columns are predictions
class_names = ['Real', 'Fake']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - Validation Set (BERT)')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()


## 6. Evaluate on Test Set


In [None]:
# Metrics on the held-out test split, as computed by the model wrapper
print("Test Set Evaluation:")
test_metrics = model.evaluate(X_test, y_test)

# Predicted labels, reused below for the confusion matrix and the report
y_test_pred = model.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred)

# Heatmap of the confusion matrix: rows are true labels, columns are predictions
class_names = ['Real', 'Fake']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_test,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - Test Set (BERT)')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

# Per-class precision / recall / F1 as a text table
print("\nClassification Report:")
report_text = classification_report(y_test, y_test_pred, target_names=class_names)
print(report_text)


## 7. Compare with Baseline Model


In [None]:
# Restore the previously trained baseline and score it on the same test split
from baseline_model import BaselineModel

baseline = BaselineModel()
baseline.load('data/models')
baseline_test_metrics = baseline.evaluate(X_test, y_test)

# One column per model, one row per metric; both metric dicts use the same keys
metric_keys = ['accuracy', 'precision', 'recall', 'f1_score']
metrics_by_model = {
    'Baseline (TF-IDF + LR)': baseline_test_metrics,
    'BERT/DistilBERT': test_metrics,
}
comparison = pd.DataFrame(
    {
        model_label: [scores[key] for key in metric_keys]
        for model_label, scores in metrics_by_model.items()
    },
    index=['Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

print("Model Comparison:")
print(comparison.round(4))

# Grouped bar chart: metrics on the x-axis, one bar per model
ax = comparison.plot(kind='bar', figsize=(10, 6))
ax.set_title('Model Comparison: Baseline vs BERT')
ax.set_ylabel('Score')
ax.set_xlabel('Metric')
plt.xticks(rotation=0)
ax.legend(title='Model')
plt.tight_layout()
plt.show()


## 8. Summary


In [None]:
# Final recap: headline test metrics, where the model was saved, and follow-up work.
# The lines are collected first and emitted with a single print.
summary_lines = [
    "BERT Model Training Complete!",
    f"\nTest Set Performance:",
    f"  Accuracy: {test_metrics['accuracy']:.4f}",
    f"  Precision: {test_metrics['precision']:.4f}",
    f"  Recall: {test_metrics['recall']:.4f}",
    f"  F1-Score: {test_metrics['f1_score']:.4f}",
    f"\nModel saved to: data/models/bert/",
    f"\nNext Steps:",
    "  1. Analyze errors and misclassifications",
    "  2. Implement impossible jobs detection feature",
    "  3. Build Streamlit dashboard",
    "  4. Final evaluation and report",
]
print("\n".join(summary_lines))
