In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
data = pd.read_csv(r"C:/Users/shyamrao/Desktop/fake job/fake_job_postings.csv")

In [4]:
data.shape

(17880, 18)

In [5]:
data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [6]:
data.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [7]:
data.interpolate(inplace=True)

In [8]:
data.shape

(17880, 18)

In [9]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    return text

In [10]:
text_fields = [
    "title", "location", "department", "company_profile", "description",
    "requirements", "benefits", "required_experience", "required_education"
]

In [11]:
data["combined_text"] = data[text_fields].apply(lambda x: " ".join(x.fillna("")), axis=1)

In [12]:

data["combined_text"] = data["combined_text"].apply(clean_text)

In [13]:
label_encoder = LabelEncoder()
data["employment_type"] = label_encoder.fit_transform(data["employment_type"])
data["industry"] = label_encoder.fit_transform(data["industry"])
data["function"] = label_encoder.fit_transform(data["function"])

In [14]:
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_text = tfidf.fit_transform(data["combined_text"])

In [15]:
X_numerical = data[["employment_type", "industry", "function"]]
X = np.hstack((X_text.toarray(), X_numerical))

In [16]:
y = label_encoder.fit_transform(data["fraudulent"])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [35]:
models = {
    "Logistic Regression": LogisticRegression(solver="saga", max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1),
    "SVM": SVC(kernel="linear", probability=True)
}

In [117]:
model_accuracies = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    model_accuracies[name] = accuracy
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")



Logistic Regression Accuracy: 0.95
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      3403
           1       0.00      0.00      0.00       173

    accuracy                           0.95      3576
   macro avg       0.48      0.50      0.49      3576
weighted avg       0.91      0.95      0.93      3576



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Accuracy: 0.97
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3403
           1       1.00      0.29      0.45       173

    accuracy                           0.97      3576
   macro avg       0.98      0.64      0.72      3576
weighted avg       0.97      0.97      0.96      3576

SVM Accuracy: 0.98
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       0.98      0.68      0.80       173

    accuracy                           0.98      3576
   macro avg       0.98      0.84      0.90      3576
weighted avg       0.98      0.98      0.98      3576



In [None]:
best_model_name = max(model_accuracies, key=model_accuracies.get)
best_model = models[best_model_name]
print(f"Best model selected: {best_model_name} with accuracy {model_accuracies[best_model_name]:.2f}")

In [82]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
def predict_job(job_data):
    
    combined_text = " ".join(
        [
            clean_text(job_data.get("title", "")),
            clean_text(job_data.get("company_profile", "")),
            clean_text(job_data.get("description", "")),
            clean_text(job_data.get("requirements", "")),
            clean_text(job_data.get("benefits", "")),
            clean_text(job_data.get("required_education", "")),
            clean_text(job_data.get("required_experience", "")),
            clean_text(job_data.get("location", "")),
            clean_text(job_data.get("department", "")),
        ]
    )
    
    # Transform text using TF-IDF
    text_features = tfidf.transform([combined_text]).toarray()

    # Encode numerical features
    try:
        # Encode numerical features
        numerical_features = np.array([
            label_encoder.transform([job_data.get("employment_type", "0")])[0] if np.isin(job_data.get("employment_type", "0"), label_encoder.classes_) else -1,
            label_encoder.transform([job_data.get("industry", "0")])[0] if np.isin(job_data.get("industry", "0"), label_encoder.classes_) else -1,
            label_encoder.transform([job_data.get("function", "0")])[0] if np.isin(job_data.get("function", "0"), label_encoder.classes_) else -1,
        ]).reshape(1, -1)
    except ValueError as e:
        print(f"Error: {e}")
        return None
    
    # Combine features
    features = np.hstack((text_features, numerical_features))

    # Predict using each model
    predictions = {name: model.predict(features)[0] for name, model in models.items()}
    
    return predictions

In [111]:
def is_fake_job(job_data):
    """
    Checks if a job posting meets conditions to be classified as fake.
    Returns True for fake jobs, False otherwise.
    """
    # 2. Check for no experience required in senior-level positions
    title = job_data.get("title", "").lower()
    required_experience = job_data.get("required_experience", "").lower()
    if "senior" in title and "no experience" in required_experience:
        return True, "Senior role with no experience required"

    # 3. Mismatched job title and requirements
#     requirements = job_data.get("requirements", "").lower()
#     industry = job_data.get("industry", "").lower()
#     if industry and not any(keyword in requirements for keyword in industry.split()):
#         return True, "Requirements do not match industry"

    # 4. Company profile missing or vague
    company_profile = job_data.get("company_profile", "").strip()
    if not company_profile or len(company_profile) < 20:
        return True, "Vague or missing company profile"

    # 5. Suspicious or overly generic job description
    description = job_data.get("description", "").strip().lower()
    if len(description) < 50 or "easy income" in description or "get rich quick" in description:
        return True, "Suspicious or overly generic description"

    # 6. Location issues
    location = job_data.get("location", "").strip().lower()
    if not location or location == "worldwide":
        return True, "Unspecified or vague location"


    # 8. Upfront payments requested
    if "fee" in description or "payment required" in description:
        return True, "Upfront payments requested"

    # 9. Overuse of buzzwords
    buzzwords = ["easy money", "unlimited bonuses", "no targets"] 
    if any(buzzword in description for buzzword in buzzwords):
        return True, "Overuse of buzzwords"

    # 10. Mismatched experience and salary
    salary = job_data.get("salary", 0)
    high_salary_threshold = 200000
    if int(salary) > high_salary_threshold and "no experience" in required_experience:
        return True, "High salary and no experience required"

    # If no conditions are met, the job is likely real
    return False, "Real job"

In [104]:
new_job = {
    "title": "Software Engineer",
    "company_profile": "We are a leading tech company.",
    "description": "Develop and maintain software solutions.",
    "requirements": "Python, Machine Learning, APIs",
    "benefits": "Health insurance, paid time off",
    "employment_type": "Full-time",
    "industry": "Information Technology",
    "function": "Engineering",
}


In [106]:
is_fake, reason = is_fake_job(new_job)

# Print the result
if is_fake:
    print("Fake",reason)
else:
    predictions = predict_job(new_job)
    print("Predictions for the new job:")
    for model_name, prediction in predictions.items():
        print(f"{model_name}: {'Fake' if prediction == 1 else 'Real'}")


Fake Suspicious or overly generic description


In [107]:
new_job = {
    "title": "Software Engineer",
    "company_profile": "We are a leading tech company.",
    "description": "Develop and maintain software solutions.",
    "requirements": "sockets,electric devices",
    "benefits": "Health insurance, paid time off",
    "employment_type": "Full-time",
    "industry": "Electronic Engineering",
    "function": "Engineering",
    "location": "India",
}

In [108]:
is_fake, reason = is_fake_job(new_job)

# Print the result
if is_fake:
    print("Fake",reason)
else:
    predictions = predict_job(new_job)
    print("Predictions for the new job:")
    for model_name, prediction in predictions.items():
        print(f"{model_name}: {'Fake' if prediction == 1 else 'Real'}")

Fake Suspicious or overly generic description


In [88]:
predictions = predict_job(new_job)
print("Predictions for the new job:")
for model_name, prediction in predictions.items():
    print(f"{model_name}: {'Fake' if prediction == 1 else 'Real'}")

Predictions for the new job:
Logistic Regression: Real
Random Forest: Real
SVM: Real


  mask |= (ar1 == a)


In [112]:
new_job = {
    "title": "Data Entry Specialist - Immediate Hiring",
    "company_profile": "We are a rapidly expanding global company offering easy remote work opportunities.",
    "description": "Join us today and start earning immediately with simple data entry tasks from the comfort of your home. No experience necessary!",
    "requirements": "Basic computer knowledge, must have a laptop or smartphone, and be available for 24/7 work.",
    "benefits": "Earn up to $3,000/week, work from home, flexible hours, no interviews.",
    "employment_type": "Part-time, Contract",
    "industry": "Online Services",
    "function": "Data Entry",
    "location": "Remote",
}


In [113]:
is_fake, reason = is_fake_job(new_job)

# Print the result
if is_fake:
    print("Fake",reason)
else:
    predictions = predict_job(new_job)
    print("Predictions for the new job:")
    for model_name, prediction in predictions.items():
        print(f"{model_name}: {'Fake' if prediction == 1 else 'Real'}")

Predictions for the new job:
Logistic Regression: Real
Random Forest: Real
SVM: Fake


  mask |= (ar1 == a)


In [92]:
new_job = {
    "title": "Software Engineer - Work from Home",
    "company_profile": "We are a growing global company offering remote work opportunities in tech.",
    "description": "We are looking for a skilled software engineer to join our team. You will work on cutting-edge software projects from home.",
    "requirements": "Experience with farming equipment, knowledge of irrigation systems, and agricultural machinery.",
    "benefits": "Flexible working hours, opportunity to work from home, and a competitive salary.",
    "employment_type": "Full-time",
    "industry": "Software Development",
    "function": "Engineering",
    "location": "Remote",
}


In [93]:
predictions = predict_job(new_job)
print("Predictions for the new job:")
for model_name, prediction in predictions.items():
    print(f"{model_name}: {'Fake' if prediction == 1 else 'Real'}")

Predictions for the new job:
Logistic Regression: Real
Random Forest: Real
SVM: Real


  mask |= (ar1 == a)
