In [1]:
# 1. Import Libraries
import pandas as pd
import os

In [2]:
# 2. Read the raw job-postings CSV into a DataFrame
raw_data_path = '../data/fake_job_postings.csv'  # adjust if needed
df = pd.read_csv(raw_data_path)

# 3. Quick sanity check: frame dimensions and per-column null counts
print(f"Shape: {df.shape}")
print(f"Missing values:\n {df.isna().sum()}")

Shape: (17880, 18)
Missing values:
 job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64


In [3]:

# 4. Drop low-value or irrelevant columns.
# Reassign rather than drop in place, and tolerate columns that are already
# gone, so re-running this cell does not fail with a KeyError.
drop_cols = ['telecommuting', 'has_company_logo', 'has_questions', 'salary_range']
df = df.drop(columns=drop_cols, errors='ignore')

# 5. Fill missing text fields with empty strings so that later string
# operations on these columns do not see NaN.
text_cols = ['company_profile', 'description', 'requirements', 'benefits']
df[text_cols] = df[text_cols].fillna('')

In [4]:
# 6. Combine text columns into one 'text' column.
# str.cat with na_rep='' treats a missing value in any of the columns
# (including 'title', which is not filled beforehand) as an empty string.
# Plain `+` concatenation would turn the whole row's text into NaN instead.
other_text_cols = ['company_profile', 'description', 'requirements', 'benefits']
df['text'] = (
    df['title']
    .str.cat(df[other_text_cols], sep=' ', na_rep='')
    .str.strip()  # remove leading/trailing whitespace left by empty fields
)

# 7. Drop rows with empty 'text'.
# Every value is a real string at this point, so an explicit comparison is
# enough (a NaN would have slipped through a truthiness check).
df = df[df['text'] != '']

# 8. Keep only 'text' and 'fraudulent' columns.
# .copy() makes df_cleaned an independent frame rather than a slice of df.
df_cleaned = df[['text', 'fraudulent']].copy()

In [5]:
# 9. Save cleaned data
cleaned_data_path = '../data/cleaned_data_jobs.csv'

# Only create the parent directory when the path actually has one:
# os.makedirs('') raises FileNotFoundError if the path is a bare filename.
output_dir = os.path.dirname(cleaned_data_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the CSV.
df_cleaned.to_csv(cleaned_data_path, index=False)

print(f"✅ Cleaned data saved to: {cleaned_data_path}")
print("Shape of cleaned data:", df_cleaned.shape)

# 10. Optional: Preview cleaned data.
# A fixed random_state makes the preview reproducible across runs, and the
# min() guard avoids a ValueError when fewer than 5 rows remain.
df_cleaned.sample(min(5, len(df_cleaned)), random_state=42)

✅ Cleaned data saved to: ../data/cleaned_data_jobs.csv
Shape of cleaned data: (17880, 2)


Unnamed: 0,text,fraudulent
16583,Outside sales representative With over 200 emp...,0
3799,Oracle DBA PROSULTING IS PEOPLE The ProSulting...,0
10872,Hardware Systems Design Engineer (US) PowerbyP...,0
8039,Senior C# Developer Industrial Color Software ...,0
3465,Functional Writer Located in the heart of down...,0


In [6]:
# 1. Imports
import pandas as pd
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from joblib import dump

# 2. Fetch the NLTK resources (punkt, stopwords, wordnet).
# The download call runs on every execution of this cell; packages that are
# already present locally are simply reported as up-to-date.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# 3. Read the cleaned dataset back in
df = pd.read_csv('../data/cleaned_data_jobs.csv')  # <- Adjust path if needed
print(f"✅ Loaded data. Shape: {df.shape}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\palre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\palre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Loaded data. Shape: (17880, 2)
