# Data Preprocessing and Feature Engineering
## Fake Job Posting Detection

This notebook handles:
- Text cleaning
- Missing value handling
- Feature engineering
- Class imbalance handling
- Train/validation/test splits


In [23]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path

# Resolve the project root directory.
# The original absolute path stays as the default, so existing setups behave
# exactly as before. Set the FAKE_JOB_PROJECT_ROOT environment variable to run
# this notebook from a different checkout without editing the cell.
DEFAULT_PROJECT_ROOT = r'D:\Data 641 NLP\Final Project'
project_root = Path(os.environ.get('FAKE_JOB_PROJECT_ROOT', DEFAULT_PROJECT_ROOT)).resolve()

# Later cells rely on the working directory being the project root
# (os.chdir raises FileNotFoundError if the directory does not exist).
os.chdir(project_root)

print(f"Project root: {project_root}")
print(f"Current working directory: {os.getcwd()}")

# Make the project's src directory importable. The membership check keeps
# sys.path from growing by one duplicate entry every time this cell is re-run.
src_dir = str(project_root / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
from utils import clean_text, combine_text_features, calculate_text_statistics
from data_preprocessing import DataPreprocessor

# NOTE: this suppresses every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')


Project root: D:\Data 641 NLP\Final Project
Current working directory: D:\Data 641 NLP\Final Project


## 1. Load and Explore Raw Data


In [24]:
# Create the project's preprocessor object; it is reused by the later cells.
preprocessor = DataPreprocessor()

# Location of the raw CSV, built from the project root.
data_path = Path(project_root).joinpath('data', 'raw', 'fake_job_postings.csv')
print(f"Loading data from: {data_path}")
print(f"File exists: {data_path.exists()}")

if not data_path.exists():
    # Primary location is missing: report on a second, hard-coded location
    # and switch to it only when it is actually present.
    alt_path = Path('D:/Data 641 NLP/Final Project/data/raw/fake_job_postings.csv')
    print(f"\nTrying alternative path: {alt_path}")
    print(f"Alternative exists: {alt_path.exists()}")
    data_path = alt_path if alt_path.exists() else data_path

# load_raw_data receives the path as a plain string.
df = preprocessor.load_raw_data(str(data_path))

# Absolute and relative frequency of each label, ordered by label value.
class_counts = df['fraudulent'].value_counts().sort_index()
class_percentages = df['fraudulent'].value_counts(normalize=True).sort_index() * 100

print(f"\nDataset shape: {df.shape}")
print("\nClass distribution:")
print(class_counts)
print("\nClass percentages:")
print(class_percentages)


Loading data from: D:\Data 641 NLP\Final Project\data\raw\fake_job_postings.csv
File exists: True


INFO:utils:Loaded 17880 rows from D:\Data 641 NLP\Final Project\data\raw\fake_job_postings.csv
INFO:data_preprocessing:Loaded 17880 rows



Dataset shape: (17880, 18)

Class distribution:
fraudulent
0    17014
1      866
Name: count, dtype: int64

Class percentages:
fraudulent
0    95.1566
1     4.8434
Name: proportion, dtype: float64


## 2. Preprocess Data


In [25]:
# Execute the preprocessor's pipeline.
# NOTE(review): preprocess() takes no arguments; presumably it operates on the
# data previously loaded via load_raw_data - confirm in data_preprocessing.py.
processed_df = preprocessor.preprocess()

# Columns that exist after preprocessing but were absent from the raw frame.
new_columns = set(processed_df.columns) - set(df.columns)

print(f"Processed dataset shape: {processed_df.shape}")
print(f"\nNew columns created: {new_columns}")


INFO:data_preprocessing:Starting preprocessing...
INFO:data_preprocessing:Cleaned title
INFO:data_preprocessing:Cleaned description
INFO:data_preprocessing:Cleaned company_profile
INFO:data_preprocessing:Cleaned requirements
INFO:data_preprocessing:Cleaned benefits
INFO:data_preprocessing:Preprocessing complete!


Processed dataset shape: (17880, 27)

New columns created: {'text_avg_sentence_length', 'text_word_count', 'has_requirements', 'text_char_count', 'has_benefits', 'text_avg_word_length', 'text_sentence_count', 'combined_text', 'has_company_profile'}


## 3. Handle Class Imbalance


In [26]:
# Report how many rows carry each label after preprocessing.
print("Class distribution before handling imbalance:")
label_counts = processed_df['fraudulent'].value_counts().sort_index()
print(label_counts)

# Imbalance strategy for this project:
#   1. Class weights in the models (original note: already in baseline_model.py)
#   2. Stratified splits, which keep the class ratio the same in every split
#   3. Optional: oversampling of the training set

# Imported here because the optional oversampling cell further down calls it.
from sklearn.utils import resample

# Partition the frame by label: 0 is treated as real, 1 as fake.
df_real = processed_df.loc[processed_df['fraudulent'] == 0]
df_fake = processed_df.loc[processed_df['fraudulent'] == 1]

n_real, n_fake = len(df_real), len(df_fake)
print(f"\nReal samples: {n_real}")
print(f"Fake samples: {n_fake}")
print(f"Imbalance ratio: {n_real/n_fake:.2f}:1")


Class distribution before handling imbalance:
fraudulent
0    17014
1      866
Name: count, dtype: int64

Real samples: 17014
Fake samples: 866
Imbalance ratio: 19.65:1


## 4. Split Data into Train/Validation/Test Sets


In [27]:
# Produce the train/validation/test frames. Per the original note the split is
# stratified so the class distribution is maintained; the seed is fixed at 42.
split_options = dict(test_size=0.15, val_size=0.15, random_state=42)
train_df, val_df, test_df = preprocessor.split_data(**split_options)

# Print the size and the real/fake breakdown of each split. Every block after
# the first is preceded by a blank line.
print("Split Summary:")
named_splits = [("Train", train_df), ("Validation", val_df), ("Test", test_df)]
for position, (split_name, split_df) in enumerate(named_splits):
    separator = "" if position == 0 else "\n"
    real_count = len(split_df[split_df['fraudulent'] == 0])
    fake_count = len(split_df[split_df['fraudulent'] == 1])
    print(f"{separator}{split_name}: {len(split_df)} samples")
    print(f"  - Real: {real_count}, Fake: {fake_count}")


INFO:data_preprocessing:Train set: 12516 rows
INFO:data_preprocessing:Validation set: 2682 rows
INFO:data_preprocessing:Test set: 2682 rows


Split Summary:
Train: 12516 samples
  - Real: 11910, Fake: 606

Validation: 2682 samples
  - Real: 2552, Fake: 130

Test: 2682 samples
  - Real: 2552, Fake: 130


## 5. Optional: Oversample Minority Class for Training


In [28]:
# Optional rebalancing of the training split only (val_df and test_df are not
# touched here). When enabled, fake rows are drawn with replacement until they
# match the number of real rows (606 fake vs 11910 real in the recorded run).
# Class weights are also used in the models, so this step is off by default.

oversample = False  # Set to True to enable oversampling

if not oversample:
    print("Oversampling disabled. Will use class weights in models instead.")
else:
    train_real = train_df[train_df['fraudulent'] == 0]
    train_fake = train_df[train_df['fraudulent'] == 1]

    # Draw from the minority class with replacement up to the majority size.
    train_fake_upsampled = resample(
        train_fake,
        replace=True,
        n_samples=len(train_real),
        random_state=42,
    )

    # Stack both classes, shuffle the rows reproducibly, and renumber the index.
    train_df_balanced = (
        pd.concat([train_real, train_fake_upsampled])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    balanced_labels = train_df_balanced['fraudulent']
    print("After oversampling:")
    print(f"Train: {len(train_df_balanced)} samples")
    print(f"  - Real: {len(train_df_balanced[balanced_labels == 0])}, Fake: {len(train_df_balanced[balanced_labels == 1])}")

    # From here on, the balanced frame is the training set.
    train_df = train_df_balanced


Oversampling disabled. Will use class weights in models instead.


## 6. Save Processed Data


In [29]:
# Hand the three splits to the preprocessor's save helper.
preprocessor.save_processed_data(train_df, val_df, test_df)

# Confirmation message listing the expected output files.
print("Processed data saved to data/processed/")
print("Files created:")
for csv_name in ("train.csv", "val.csv", "test.csv"):
    print(f"  - {csv_name}")


INFO:utils:Saved 12516 rows to data/processed/train.csv
INFO:utils:Saved 2682 rows to data/processed/val.csv
INFO:utils:Saved 2682 rows to data/processed/test.csv
INFO:data_preprocessing:Processed data saved successfully!


Processed data saved to data/processed/
Files created:
  - train.csv
  - val.csv
  - test.csv


## 7. Summary


In [30]:
# Final recap: dataset sizes before and after preprocessing, the size of each
# split, and the planned follow-up work. The lines are assembled first and
# emitted with a single print call.
summary_lines = [
    "Preprocessing Complete!",
    f"\nOriginal dataset: {len(df):,} samples",
    f"Processed dataset: {len(processed_df):,} samples",
    "\nData splits:",
    f"  Train: {len(train_df):,} samples",
    f"  Validation: {len(val_df):,} samples",
    f"  Test: {len(test_df):,} samples",
    "\nNext steps:",
    "  1. Train baseline model (TF-IDF + Logistic Regression)",
    "  2. Fine-tune BERT model",
    "  3. Evaluate and compare models",
]
print("\n".join(summary_lines))


Preprocessing Complete!

Original dataset: 17,880 samples
Processed dataset: 17,880 samples

Data splits:
  Train: 12,516 samples
  Validation: 2,682 samples
  Test: 2,682 samples

Next steps:
  1. Train baseline model (TF-IDF + Logistic Regression)
  2. Fine-tune BERT model
  3. Evaluate and compare models
