In [2]:
# Day 2: Understanding and Loading the Dataset
 
import pandas as pd
 
# Load dataset (after downloading from Kaggle)

df = pd.read_csv('Fake_Jobs_Postings.csv')
 
# Display first few rows

print("Sample Data:")

print(df.head())
 
# Display basic info

print("\nDataset Info:")

print(df.info())
 
# Check for missing values

print("\nMissing Values per Column:")

print(df.isnull().sum())
 
# Check distribution of target variable

print("\nTarget (fraudulent) Distribution:")

print(df['fraudulent'].value_counts())
 
# Basic statistics

print("\nDataset Summary:")

print(df.describe(include='all'))

 

Sample Data:
   job_id                                      title            location  \
0       1                           Marketing Intern    US, NY, New York   
1       2  Customer Service - Cloud Video Production      NZ, , Auckland   
2       3    Commissioning Machinery Assistant (CMA)       US, IA, Wever   
3       4          Account Executive - Washington DC  US, DC, Washington   
4       5                        Bill Review Manager  US, FL, Fort Worth   

  department salary_range                                    company_profile  \
0  Marketing          NaN  We're Food52, and we've created a groundbreaki...   
1    Success          NaN  90 Seconds, the worlds Cloud Video Production ...   
2        NaN          NaN  Valor Services provides Workforce Solutions th...   
3      Sales          NaN  Our passion for improving quality of life thro...   
4        NaN          NaN  SpotSource Solutions LLC is a Global Human Cap...   

                                         descript

In [3]:
#Total number of records
print("Total number of records:", len(df))

Total number of records: 17880


In [4]:
#Number of missing values per column
print("Missing values per column:")
print(df.isnull().sum())


Missing values per column:
job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64


In [5]:
#Count of real vs fake jobs
print("Count of Real vs Fake Jobs:")
print(df["fraudulent"].value_counts())
print("\nLabel meaning:")
print("0 = Real Job")
print("1 = Fake Job")

Count of Real vs Fake Jobs:
fraudulent
0    17014
1      866
Name: count, dtype: int64

Label meaning:
0 = Real Job
1 = Fake Job


In [6]:
#Display 3 examples of fake job descriptions
print("Examples of Fake Job Descriptions:")
fake_jobs = df[df["fraudulent"] == 1]["description"].head(3)

# Print 3 sample fake descriptions
for i, desc in enumerate(fake_jobs, 1):
    print(f"Example {i}:\n{desc}\n{'-'*80}")

Examples of Fake Job Descriptions:
Example 1:
IC&amp;E Technician | Bakersfield, CA Mt. PosoPrincipal Duties and Responsibilities: Calibrates, tests, maintains, troubleshoots, and installs all power plant instrumentation, control systems and electrical equipment.Performs maintenance on motor control centers, motor operated valves, generators, excitation equipment and motors.Performs preventive, predictive and corrective maintenance on equipment, coordinating work with various team members.Designs and installs new equipment and/or system modifications.Troubleshoots and performs maintenance on DC backup power equipment, process controls, programmable logic controls (PLC), and emission monitoring equipment.Uses maintenance reporting system to record time and material use, problem identified and corrected, and further action required; provides complete history of maintenance on equipment.Schedule, coordinate, work with and monitor contractors on specific tasks, as required.Follows safe wor

Which feature has the most missing values?
The column with the most missing values is salary_range

Why job descriptions are more useful than job titles for fake detection?
Job descriptions contain detailed content, including responsibilities, requirements, and scam indicators, whereas titles are short and generic. Therefore, descriptions provide richer information for detecting fake job postings.

In [7]:
# Day 3: Text Cleaning and Preprocessing
 
import pandas as pd

import re

import string

import nltk

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
 
# Download resources (run once)

nltk.download('stopwords')

nltk.download('wordnet')
 
# Load dataset (same as Day 2)

df = pd.read_csv('Fake_Jobs_Postings.csv')
 
# Define text cleaning function

def clean_text(text):

    if pd.isnull(text):

        return ""

    # 1. Lowercase

    text = text.lower()

    # 2. Remove HTML tags

    text = re.sub(r'<.*?>', ' ', text)

    # 3. Remove URLs

    text = re.sub(r'http\S+|www\S+', '', text)

    # 4. Remove punctuation and numbers

    text = re.sub(r'[%s\d]' % re.escape(string.punctuation), ' ', text)

    # 5. Remove extra spaces

    text = re.sub(r'\s+', ' ', text).strip()

    # 6. Remove stopwords and lemmatize

    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()

    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    return " ".join(words)
 
# Apply cleaning to key text columns

df['clean_description'] = df['description'].apply(clean_text)
 
# Show before and after

print("Original Text:\n", df['description'].iloc[1][:300])

print("\nCleaned Text:\n", df['clean_description'].iloc[1][:300])
 
# Check for any remaining issues

print("\nExample of Cleaned Data:")

print(df[['description', 'clean_description']].head(3))

 
'''Task 1:

Write your own cleaning function that:
Removes HTML, numbers, and punctuation
Converts text to lowercase
Removes stopwords
Returns a cleaned version of the company_profile column
Task 2:
Count the average number of words before and after cleaning for job descriptions.
Discuss: Did the cleaning remove too much or just enough?
 '''

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text:
 Organised - Focused - Vibrant - Awesome!Do you have a passion for customer service? Slick typing skills? Maybe Account Management? ...And think administration is cooler than a polar bear on a jetski? Then we need to hear you! We are the Cloud Video Production Service and opperating on a glodal level

Cleaned Text:
 organised focused vibrant awesome passion customer service slick typing skill maybe account management think administration cooler polar bear jetski need hear cloud video production service opperating glodal level yeah pretty cool serious delivering world class product excellent customer service rap

Example of Cleaned Data:
                                         description  \
0  Food52, a fast-growing, James Beard Award-winn...   
1  Organised - Focused - Vibrant - Awesome!Do you...   
2  Our client, located in Houston, is actively se...   

                                   clean_description  
0  food fast growing james beard award winning on...  
1  or

'Task 1:\n\nWrite your own cleaning function that:\nRemoves HTML, numbers, and punctuation\nConverts text to lowercase\nRemoves stopwords\nReturns a cleaned version of the company_profile column\nTask 2:\nCount the average number of words before and after cleaning for job descriptions.\nDiscuss: Did the cleaning remove too much or just enough?\n '

In [8]:
"""Task 1:

Write your own cleaning function that:
Removes HTML, numbers, and punctuation
Converts text to lowercase
Removes stopwords
Returns a cleaned version of the company_profile column"""
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords

# Download once if not already
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('Fake_Jobs_Postings.csv')

# Define custom cleaning function for company_profile
def clean_company_profile(text):
    if pd.isnull(text):
        return ""

    # 1. Lowercase
    text = text.lower()

    # 2. Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Remove numbers and punctuation
    text = re.sub(r'[%s\d]' % re.escape(string.punctuation), ' ', text)

    # 4. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 5. Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]

    return " ".join(words)

# Apply function
df['clean_company_profile'] = df['company_profile'].apply(clean_company_profile)

# Display before and after
print("Original Company Profile:\n", df['company_profile'].iloc[1][:300])
print("\nCleaned Company Profile:\n", df['clean_company_profile'].iloc[1][:300])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original Company Profile:
 90 Seconds, the worlds Cloud Video Production Service.90 Seconds is the worlds Cloud Video Production Service enabling brands and agencies to get high quality online video content shot and produced anywhere in the world. 90 Seconds makes video production fast, affordable, and all managed seamlessly 

Cleaned Company Profile:
 seconds worlds cloud video production service seconds worlds cloud video production service enabling brands agencies get high quality online video content shot produced anywhere world seconds makes video production fast affordable managed seamlessly cloud purchase publish http url fbe afac cd c f b 


In [9]:
"""Task 2:
Count the average number of words before and after cleaning for job descriptions.
Discuss: Did the cleaning remove too much or just enough?"""
# Define helper function to count words
def word_count(text):
    if pd.isnull(text):
        return 0
    return len(text.split())

# Count before and after cleaning
df['original_word_count'] = df['description'].apply(word_count)
df['cleaned_word_count'] = df['clean_description'].apply(word_count)

# Calculate average word counts
avg_before = df['original_word_count'].mean()
avg_after = df['cleaned_word_count'].mean()

print(f"\nAverage number of words before cleaning: {avg_before:.2f}")
print(f"Average number of words after cleaning: {avg_after:.2f}")


KeyError: 'clean_description'

In [None]:
# Day 4: Feature Extraction using BoW and TF-IDF
 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
# Load preprocessed dataset (from Day 3)

df = pd.read_csv('Fake_Jobs_Postings.csv')
 
# Assume we already created a 'clean_description' column

texts = df['clean_description'].fillna('').tolist()
 
# 1️⃣ Bag-of-Words

bow_vectorizer = CountVectorizer(max_features=2000)  # limit to top 2000 words

X_bow = bow_vectorizer.fit_transform(texts)
 
print("BoW shape:", X_bow.shape)

print("Sample feature names (BoW):", bow_vectorizer.get_feature_names_out()[:10])
 
# 2️⃣ TF-IDF

tfidf_vectorizer = TfidfVectorizer(max_features=2000)

X_tfidf = tfidf_vectorizer.fit_transform(texts)
 
print("\nTF-IDF shape:", X_tfidf.shape)

print("Sample feature names (TF-IDF):", tfidf_vectorizer.get_feature_names_out()[:10])
 
# 3️⃣ Compare sparsity and values

print("\nExample BoW vector (first row):")

print(X_bow[0].toarray())
 
print("\nExample TF-IDF vector (first row):")

print(X_tfidf[0].toarray())

 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 1. Drop missing values from company_profile
df['company_profile'] = df['company_profile'].fillna('')

# 2. Initialize Vectorizers
bow_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# 3. Fit and transform the text
bow_features = bow_vectorizer.fit_transform(df['company_profile'])
tfidf_features = tfidf_vectorizer.fit_transform(df['company_profile'])

# 4. Compare shapes
print("BoW shape:", bow_features.shape)
print("TF-IDF shape:", tfidf_features.shape)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# 1. Fill missing values
df['description'] = df['description'].fillna('')

# 2. Create Bag of Words
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
bow_matrix = vectorizer.fit_transform(df['description'])

# 3. Sum up word counts
word_counts = bow_matrix.toarray().sum(axis=0)

# 4. Create a DataFrame of words and their counts
bow_df = pd.DataFrame({
    'Word': vectorizer.get_feature_names_out(),
    'Count': word_counts
})

# 5. Sort and display top 20
top20_words = bow_df.sort_values(by='Count', ascending=False).head(20)
print(top20_words)


In [1]:
# Day 5: Logistic Regression Model for Fake Job Detection
 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 
# Load dataset (preprocessed with clean_description)
df = pd.read_csv('Fake_Jobs_Postings.csv')
df = df.dropna(subset=['description'])

 
# 1 Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['description'])
y = df['fraudulent']
 
# 2 Split data into train & test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
 
# 3 Train Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
 
# 4 Make predictions
y_pred = model.predict(X_test)
 
# 5 Evaluate performance
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
 
# 6 Check example predictions
test_samples = [
    "Work from home! Limited vacancies. Apply now.",
    "We are hiring a data scientist for our Bangalore office."
]
sample_features = vectorizer.transform(test_samples)
print("\nSample Predictions:", model.predict(sample_features))


Accuracy: 0.9658836689038032

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3403
           1       0.98      0.30      0.46       173

    accuracy                           0.97      3576
   macro avg       0.97      0.65      0.72      3576
weighted avg       0.97      0.97      0.96      3576


Confusion Matrix:
 [[3402    1]
 [ 121   52]]

Sample Predictions: [0 0]
