In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import h5py

# --- Preprocessing: raw job-postings CSV -> padded integer sequences + labels ---

# Tunable preprocessing settings, named once instead of scattered literals.
VOCAB_SIZE = 5000      # passed to Tokenizer(num_words=...): caps the vocabulary by word frequency
MAX_SEQ_LEN = 200      # every description is padded/truncated to this many tokens
TEST_FRACTION = 0.2    # share of rows held out for testing
SPLIT_SEED = 42        # fixes the train/test shuffle

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/fake_job_postings.csv')

# Drop rows with missing values in the 'description' column
df = df.dropna(subset=['description'])

# Make sure the target column is integer typed (0 for real, 1 for fake)
df['fraudulent'] = df['fraudulent'].astype(int)

# Tokenize the text data and convert each description to a fixed-length
# sequence of word indices.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['description'])
X = tokenizer.texts_to_sequences(df['description'])
X = pad_sequences(X, maxlen=MAX_SEQ_LEN)  # Pad sequences to a fixed length

# Target vector
y = df['fraudulent'].values

# Split the dataset into training and testing sets.
# stratify=y keeps the real/fake ratio identical in both splits; without it a
# rare positive class can end up unevenly represented in the test set.
# NOTE(review): assumes both classes have at least two rows - confirm for new data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,
)

# Save the preprocessed dataset in H5 format
with h5py.File('preprocessed_dataset.h5', 'w') as hf:
    hf.create_dataset('X_train', data=X_train)
    hf.create_dataset('X_test', data=X_test)
    hf.create_dataset('y_train', data=y_train)
    hf.create_dataset('y_test', data=y_test)

print("Dataset preprocessed and saved as 'preprocessed_dataset.h5'.")

Dataset preprocessed and saved as 'preprocessed_dataset.h5'.


In [None]:
import h5py

# Load the preprocessed dataset.
# The file is opened with the same relative path the preprocessing cell writes
# to ('preprocessed_dataset.h5'), so the round trip works regardless of what
# the current working directory is.
with h5py.File('preprocessed_dataset.h5', 'r') as hf:
    # [:] reads each HDF5 dataset fully into memory as a NumPy array,
    # so the arrays stay usable after the file is closed.
    X_train = hf['X_train'][:]
    X_test = hf['X_test'][:]
    y_train = hf['y_train'][:]
    y_test = hf['y_test'][:]

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Dropout

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Define the model: embedding -> max-pool over the sequence -> small dense head
model = Sequential([
    # input_dim / input_length mirror the tokenizer's num_words (5000) and the
    # padding length (200) used in the preprocessing cell.
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single output in [0, 1]
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# Validation metrics are computed on a slice of the training data
# (validation_split) rather than on the test set, so the test set stays unseen
# until the final evaluation below and the reported score is not tuned against.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
)

# Evaluate the model on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 1/10




[1m447/447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.9393 - loss: 0.2702 - val_accuracy: 0.9491 - val_loss: 0.1644
Epoch 2/10
[1m447/447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9506 - loss: 0.1522 - val_accuracy: 0.9656 - val_loss: 0.1165
Epoch 3/10
[1m447/447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9673 - loss: 0.1005 - val_accuracy: 0.9712 - val_loss: 0.1034
Epoch 4/10
[1m447/447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9804 - loss: 0.0588 - val_accuracy: 0.9746 - val_loss: 0.0966
Epoch 5/10
[1m447/447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9883 - loss: 0.0408 - val_accuracy: 0.9765 - val_loss: 0.1060
Epoch 6/10
[1m447/447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9932 - loss: 0.0254 - val_accuracy: 0.9785 - val_loss: 0.1053
Epoch 7/10
[1m447/447[0m [32m━━━━━━━

In [None]:
# Write the trained text classifier to disk so later cells can reload it.
model.save(filepath='fake_job_detection_model.h5')



In [None]:
from tensorflow.keras.models import load_model

# Load the model
model = load_model('fake_job_detection_model.h5')

# Predict on new data.
# NOTE: `tokenizer` and `pad_sequences` are not defined in this cell; they come
# from the preprocessing cell, which must have been run in the same kernel.
new_data = ["This is a fake job posting with unrealistic salary claims."]
new_data_seq = tokenizer.texts_to_sequences(new_data)
new_data_padded = pad_sequences(new_data_seq, maxlen=200)  # same length as training

prediction = model.predict(new_data_padded)

# Flatten the model output to one score per input text and label each one.
# Comparing the whole 2-D array in a conditional only works for a single
# sample; iterating keeps the cell correct when more texts are added.
for score in prediction.ravel():
    print("Fake" if score > 0.5 else "Real")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step
Real


In [None]:
# Install required libraries
!pip install gradio
!pip install tensorflow
!pip install pytesseract
!apt install tesseract-ocr
!pip install pillow
!pip install fuzzywuzzy

# Import libraries
import gradio as gr
import tensorflow as tf
from PIL import Image
import pytesseract
import numpy as np
import cv2
from fuzzywuzzy import fuzz

# Restore the trained image model from disk (replace with your model path)
MODEL_PATH = '/content/drive/MyDrive/fake_job_slayer_model.h5'
model = tf.keras.models.load_model(MODEL_PATH)

# Target image size (in pixels) used when resizing images for the model
img_height = img_width = 128

# Phrases whose presence suggests the OCR text is a job posting
job_keywords = [
    "job",
    "vacancy",
    "hiring",
    "join our team",
    "requirements",
    "salary",
    "location",
    "company",
    "department",
    "employment",
    "experience",
    "education",
    "marketing",
    "staff",
]

# Phrases treated as red flags for a fake posting
# (e.g., unrealistic salaries, vague descriptions)
fake_job_indicators = [
    "earn $",
    "work from home",
    "no experience",
    "urgent hiring",
    "pay a fee",
    "registration fee",
    "unlimited income",
    "get rich quick",
    "immediate start",
    "no interview",
]

# Preprocess image
def preprocess_image(image: Image.Image) -> np.ndarray:
    """Convert a PIL image into a normalized RGB array for the model.

    Args:
        image: PIL image in any mode; non-RGB images are converted to RGB.

    Returns:
        Float array of shape (img_height, img_width, 3) with values in [0, 1].
    """
    if image.mode != "RGB":
        image = image.convert("RGB")
    # PIL's resize() takes (width, height). Passing (height, width) only worked
    # because both constants are currently 128; this order stays correct if
    # they ever differ.
    image = image.resize((img_width, img_height))
    return np.array(image) / 255.0  # Normalize to [0, 1]

# Extract text from image using pytesseract
def extract_text(image: Image.Image) -> str:
    """Run OCR on a PIL image and return the recognized text in lowercase.

    Args:
        image: PIL image in any mode (RGB, RGBA, grayscale, palette, ...).

    Returns:
        The text recognized by Tesseract, lowercased for easier matching.
    """
    # COLOR_RGB2GRAY needs a 3-channel array, so normalize the mode first;
    # otherwise RGBA, grayscale or palette images make cvtColor raise.
    rgb_pixels = np.array(image.convert("RGB"))
    # Convert to grayscale for better OCR accuracy
    gray_image = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
    # Apply adaptive thresholding to enhance text
    # (Gaussian-weighted neighborhood, block size 11, constant 2)
    thresh_image = cv2.adaptiveThreshold(
        gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    # Use pytesseract to extract text
    custom_config = r'--oem 3 --psm 6'  # OCR Engine Mode and Page Segmentation Mode
    extracted_text = pytesseract.image_to_string(thresh_image, config=custom_config)
    return extracted_text.lower()  # Convert to lowercase for easier matching

# Check if text contains job-related keywords
def is_job_related(text: str) -> bool:
    """Return True when any job keyword fuzzily matches part of ``text``.

    A keyword counts as present when its ``fuzz.partial_ratio`` score against
    the text is strictly greater than 80.
    """
    return any(fuzz.partial_ratio(term, text) > 80 for term in job_keywords)

# Check if text contains fake job indicators
def is_fake_job(text: str, indicators=None) -> bool:
    """Check for fake job indicators in the extracted text.

    Matching ignores case and collapses every run of whitespace (including
    line breaks) to a single space, so a phrase such as "work from home" is
    still found when OCR splits it across two lines.

    Args:
        text: Text to scan, typically the OCR output for a job post.
        indicators: Optional iterable of phrases to look for. When omitted,
            the module-level ``fake_job_indicators`` list is used.

    Returns:
        True if at least one indicator phrase occurs in the text, else False.
    """
    phrases = fake_job_indicators if indicators is None else indicators
    # str.split() with no arguments splits on any whitespace run, so joining
    # with single spaces flattens newlines, tabs and repeated spaces.
    haystack = " ".join(text.lower().split())
    return any(" ".join(phrase.lower().split()) in haystack for phrase in phrases)

# Predict job authenticity
def predict_job(image: Image.Image):
    """Classify an uploaded job-post image and return a result string.

    Steps, in order:
      1. OCR the image and reject it if no job-related keyword is found.
      2. Return "Fake Job" immediately if a fake-job indicator phrase is present.
      3. Otherwise run the image through ``model`` and threshold its score at 0.5.

    Args:
        image: PIL image from the Gradio input, or None when nothing was uploaded.

    Returns:
        "Real Job", "Fake Job", or a message starting with "Invalid Image".
    """
    # Gradio calls the function with None when the user submits without an
    # image; answer with a message instead of failing inside OpenCV.
    if image is None:
        return "Invalid Image - No image provided."

    # Extract text from the image
    extracted_text = extract_text(image)
    print("Extracted Text:", extracted_text)  # Debugging: Print extracted text

    # Check if the text contains job-related keywords
    if not is_job_related(extracted_text):
        return "Invalid Image - No job-related text detected."

    # Check for fake job indicators
    if is_fake_job(extracted_text):
        return "Fake Job"

    # Preprocess the image for the model and add the batch dimension
    batch = np.expand_dims(preprocess_image(image), axis=0)

    # Predict, then pull out the single score as a plain float so the
    # comparison does not depend on truth-testing a one-element array.
    # NOTE(review): assumes the model emits one score per image where
    # values >= 0.5 mean "fake" - confirm against how the model was trained.
    prediction = model.predict(batch)
    fake_score = float(np.ravel(prediction)[0])
    return "Real Job" if fake_score < 0.5 else "Fake Job"

# Gradio Interface
# Web UI: one image input, one text output, driven by predict_job.
# The CSS defines the `gradientShift` keyframes that the `animation`
# declarations below refer to; without that definition the animated
# gradients stay static.
interface = gr.Interface(
    fn=predict_job,
    inputs=gr.Image(type="pil", label="Upload Job Post Image"),
    outputs="text",
    title="Fake Job Slayer",
    description="""
    <div style="position: relative; z-index: 1; text-align: center; padding-top: 20px;">
        <h1 style="font-size: 3em; background: linear-gradient(60deg, #ff004d, #ffa200, #2a9d8f, #e76f51); -webkit-background-clip: text; color: transparent;">
            Fake Job Slayer
        </h1>
        <p>Upload a job post image (JPEG, PNG, etc.) to verify its authenticity.</p>
    </div>
    """,
    css="""
    @keyframes gradientShift {
        0% { background-position: 0% 50%; }
        50% { background-position: 100% 50%; }
        100% { background-position: 0% 50%; }
    }
    body, .gradio-container {
        background: linear-gradient(150deg, #2a9d8f, #ff004d, #ffa200, #2a9d8f);
        background-size: 300% 300%;
        animation: gradientShift 10s ease infinite;
        color: #FFF;
        font-family: 'Roboto', sans-serif;
        font-size: 16px;
    }
    input[type="file"], .gr-button {
        background: linear-gradient(90deg, #ff004d, #ffa200, #2a9d8f);
        background-size: 200% 200%;
        animation: gradientShift 4s ease infinite;
        border: 3px solid #ffd700;
        border-radius: 50px;
        padding: 15px 30px;
        color: #FFF;
        transition: all 0.4s ease;
        font-size: 18px;
        font-weight: bold;
        text-transform: uppercase;
        position: relative;
        overflow: hidden;
        z-index: 1;
    }
    input[type="file"]:hover, .gr-button:hover {
        color: #ffd700;
        box-shadow: 0 0 30px rgba(255, 215, 0, 0.7);
        transform: scale(1.05);
    }
    h1 {
        font-size: 3em;
        background: linear-gradient(60deg, #ff004d, #ffa200, #2a9d8f, #e76f51);
        -webkit-background-clip: text;
        color: transparent;
    }
    p {
        font-size: 1.3em;
        padding-top: 10px;
    }
    .gradio-container {
        border-radius: 25px;
        padding: 40px;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.7);
    }
    """,
)

# Launch the interface
interface.launch()

Collecting gradio
  Downloading gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ae206fde65fdb6cd4b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


