In [8]:
import pandas as pd

# Load datasets
df_fake = pd.read_csv("datasets/fake_jobs.csv")  # Replace with actual paths
df_real = pd.read_csv("datasets/mixed_jobs.csv")  # Replace with actual paths


# Fill missing telecommuting values with an empty string for fake job postings
df_fake['telecommuting'] = df_fake['telecommuting'].astype(str).fillna("")
df_real['telecommuting'] = df_real['telecommuting'].replace({1: "remote", 0: "on-site"})

df_real['text'] = df_real['title'].fillna("").astype(str) + " " + df_real['description'].fillna("").astype(str) + " " + df_real['requirements'].fillna("").astype(str) + " " + df_real['company_profile'].fillna("").astype(str) + " " + df_real['telecommuting'].fillna("").astype(str) + df_real['location'].fillna("").astype(str) + df_real['salary_range'].fillna("").astype(str) + df_real['employment_type'].fillna("").astype(str) + df_real['industry'].fillna("").astype(str) + df_real['benefits'].fillna("").astype(str)
df_fake['text'] = df_fake['title'].fillna("").astype(str) + " " + df_fake['description'].fillna("").astype(str) + " " + df_fake['requirements'].fillna("").astype(str) + " " + df_fake['company_profile'].fillna("").astype(str) + " " + df_fake['telecommuting'].fillna("").astype(str) + df_fake['location'].fillna("").astype(str) + df_fake['salary_range'].fillna("").astype(str) + df_fake['employment_type'].fillna("").astype(str) + df_fake['industry'].fillna("").astype(str) + df_fake['benefits'].fillna("").astype(str)
    
df_combined = pd.concat([df_real, df_fake], ignore_index=True)
df_combined['fraudulent'] = df_combined['fraudulent'].astype(int)

# Combine fake and real datasets
df_combined = pd.concat([df_real, df_fake], ignore_index=True)
df_combined['fraudulent'] = df_combined['fraudulent'].astype(int)

# Preview the data
df_combined[['text', 'fraudulent']].head()

Unnamed: 0,text,fraudulent
0,"Marketing Intern Food52, a fast-growing, James...",0
1,Customer Service - Cloud Video Production Orga...,0
2,Commissioning Machinery Assistant (CMA) Our cl...,0
3,Account Executive - Washington DC THE COMPANY:...,0
4,Bill Review Manager JOB TITLE: Itemization Rev...,0


In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [10]:
# Encode labels
X = df_combined['text'].values
y = df_combined['fraudulent'].values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Tokenize text data
MAX_NUM_WORDS = 10000  # Vocabulary size
MAX_SEQUENCE_LENGTH = 200  # Max words per job posting

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")


In [44]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,)),
    tf.keras.layers.Embedding(input_dim=MAX_NUM_WORDS, output_dim=128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification (0 or 1)
])
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

In [48]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Make predictions
predictions = model.predict(X_test_pad)

Epoch 1/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 57ms/step - accuracy: 0.9603 - loss: 0.2098 - val_accuracy: 0.9837 - val_loss: 0.0726
Epoch 2/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 57ms/step - accuracy: 0.9879 - loss: 0.0534 - val_accuracy: 0.9853 - val_loss: 0.0580
Epoch 3/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 59ms/step - accuracy: 0.9944 - loss: 0.0256 - val_accuracy: 0.9874 - val_loss: 0.0655
Epoch 4/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 61ms/step - accuracy: 0.9972 - loss: 0.0138 - val_accuracy: 0.9873 - val_loss: 0.0655
Epoch 5/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 65ms/step - accuracy: 0.9989 - loss: 0.0085 - val_accuracy: 0.9864 - val_loss: 0.0638
Epoch 6/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 68ms/step - accuracy: 0.9994 - loss: 0.0052 - val_accuracy: 0.9860 - val_loss: 0.0727
Epoch 7/10
[1m6

In [54]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Evaluate the model using the test data
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Make predictions on the test data
predictions = model.predict(X_test_pad)

# Convert predictions to binary (0 or 1) as we're performing binary classification
predictions = (predictions > 0.5).astype(int)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

# Print the results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

# Alternatively, print a classification report which includes all metrics
print("\nClassification Report:")
print(classification_report(y_test, predictions))


[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.9854 - loss: 0.0817
Test Accuracy: 0.9871
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step
Precision: 0.9905
Recall: 0.9757
F1-Score: 0.9831

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3432
           1       0.99      0.98      0.98      2144

    accuracy                           0.99      5576
   macro avg       0.99      0.98      0.99      5576
weighted avg       0.99      0.99      0.99      5576



In [58]:
model.save('job_posting_model.keras')