In [14]:
import pandas as pd


df = pd.read_csv('fake_job_postings.csv')
print(df.shape)
df.head()


(17880, 18)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [15]:
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64


In [16]:
df['title'] = df['title'].fillna('')
df['company_profile'] = df['company_profile'].fillna('')
df['description'] = df['description'].fillna('')
df['requirements'] = df['requirements'].fillna('')
df['benefits'] = df['benefits'].fillna('')

In [17]:
df['text'] = df['title'] + ' ' + df['company_profile'] + ' ' + df['description'] + ' ' + df['requirements'] + ' ' + df['benefits']


In [18]:
df = df[df['text'].str.strip() != '']


In [19]:

df = df[['text', 'fraudulent']]


In [20]:
print("Label Distribution:\n", df['fraudulent'].value_counts())


Label Distribution:
 fraudulent
0    17014
1      866
Name: count, dtype: int64


In [21]:
from sklearn.model_selection import train_test_split

X = df['text']
y = df['fraudulent']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit on training and transform both sets
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [23]:
from sklearn.linear_model import LogisticRegression


model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)


In [24]:
y_pred = model.predict(X_test_vec)


In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.973434004474273

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      3403
           1       1.00      0.45      0.62       173

    accuracy                           0.97      3576
   macro avg       0.99      0.73      0.80      3576
weighted avg       0.97      0.97      0.97      3576


Confusion Matrix:
 [[3403    0]
 [  95   78]]


In [27]:
suspicious_words = ['no work', 'click here', 'urgent money', 'easy income', '5000k', 'instant job', 'earn easily']

while True:
    print("\nPaste a job description to check if it's fake (or type 'exit'):")
    user_input = input("> ").strip()

    if user_input.lower() == "exit":
        break

    if len(user_input.split()) < 5:
        print("Please provide a complete job description (at least 5 words).")
        continue

    if any(word in user_input.lower() for word in suspicious_words):
        print("Warning: Contains suspicious keywords often used in fake postings.")

    input_vector = vectorizer.transform([user_input])
    prediction = model.predict(input_vector)[0]

    if prediction == 1:
        print("This looks like a FAKE job posting.")
    else:
        print("This seems to be a LEGIT job posting.")



Paste a job description to check if it's fake (or type 'exit'):


>  4k per month. no work.


🔍 Result:  This seems to be a LEGIT job posting.

Paste a job description to check if it's fake (or type 'exit'):


>  exit
