In [1]:
# 1. Imports
import os
import re
from pathlib import Path

import nltk
import pandas as pd
from joblib import dump
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# 2. Download NLTK resources (runs on every execution, but is a no-op once
#    the packages are installed and up to date).
#    'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on
#    NLTK >= 3.9; 'punkt' is kept so older NLTK releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# 3. Load cleaned data
df = pd.read_csv('../data/cleaned_data_jobs.csv')  # <- Adjust path if needed
print("✅ Loaded data. Shape:", df.shape)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\palre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\palre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Loaded data. Shape: (17880, 2)


In [2]:
# 5. Shared resources for the NLP cleaning function
# One lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set, since they are only used for membership tests.
stop_words = {word for word in stopwords.words('english')}

In [3]:
def clean_text(text):
    """Normalize a raw text value into a string of lemmatized tokens.

    Steps: lower-case, replace every non-letter character with a space,
    tokenize with NLTK, drop stop words and tokens of two characters or
    fewer, then lemmatize what remains.

    Args:
        text: Raw text. Non-string values (e.g. the float NaN pandas uses
            for empty CSV cells, or None) are treated as empty text.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces;
        '' when nothing survives or the input is not a string.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); treat them
    # as empty text instead of raising AttributeError mid-pipeline.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove punctuation and digits
    tokens = nltk.word_tokenize(text)
    # Stop words are filtered on the raw token, before lemmatization.
    tokens = [lemmatizer.lemmatize(word) for word in tokens
              if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

In [4]:
# 6. Apply NLP preprocessing
# fillna('') keeps a missing text cell from crashing the string cleaning.
df['clean_text'] = df['text'].fillna('').apply(clean_text)
y = df['fraudulent']

# 7. Train/Test split on the cleaned text, *before* anything is fitted, so the
#    vectorizer never sees test documents. The row partition depends only on
#    y, the seed and the sample count, so it matches a split of the matrix.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, stratify=y, random_state=42
)

# 8. Vectorize text using TF-IDF: the vocabulary and IDF weights are learned
#    from the training rows only, then applied unchanged to the test rows.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix in the train-fitted feature space, kept under its
# original name for any later cell that still reads `X`.
X = tfidf.transform(df['clean_text'])

print("✅ TF-IDF complete. Train shape:", X_train.shape)

✅ TF-IDF complete. Train shape: (14304, 5000)


In [6]:
# Project root used for every output path below. `Path` is imported with the
# other imports in the first cell.
# NOTE(review): assumes the kernel's working directory sits one level below
# the project root (e.g. a notebooks/ folder) — confirm before running elsewhere.
base_dir = Path.cwd().parent

In [7]:
# 9. Persist the fitted vectorizer and the train/test split
model_dir = base_dir / 'models'
processed_dir = base_dir / 'data' / 'processed'

# Create both output folders (and any missing parents) before writing.
for out_dir in (model_dir, processed_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

vectorizer_path = model_dir / 'tfidf_vectorizer.joblib'
splits_path = processed_dir / 'train_test_split.joblib'

dump(tfidf, vectorizer_path)
# The four split pieces are stored together as a single tuple.
dump((X_train, X_test, y_train, y_test), splits_path)

print("✅ TF-IDF vectorizer and train/test splits saved.")

✅ TF-IDF vectorizer and train/test splits saved.
