In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from scipy.sparse import hstack
import pickle
import warnings
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"\+?\d[\d -]{8,}\d", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

warnings.filterwarnings('ignore')

print("="*60)
print("FAKE JOB POSTING DETECTION - MODEL TRAINING")
print("="*60)

FAKE JOB POSTING DETECTION - MODEL TRAINING


In [23]:
# Load the dataset
df = pd.read_csv('fake_job_postings-checkpoint.csv')
text_columns = [
    "title",
    "location",
    "department",
    "company_profile",
    "description",
    "requirements",
    "benefits",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function"
]

for col in text_columns:
    df[col] = df[col].fillna("").apply(clean_text)


print("\nðŸ“Š Dataset Information:")
print(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"\nClass distribution:")
print(df['fraudulent'].value_counts())
print(f"\nFraudulent percentage: {(df['fraudulent'].sum() / len(df) * 100):.2f}%")

# Display first few rows
df.head()


ðŸ“Š Dataset Information:
Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')
Dataset shape: (17880, 18)

Class distribution:
fraudulent
0    17014
1      866
Name: count, dtype: int64

Fraudulent percentage: 4.84%


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,marketing intern,us ny new york,marketing,,we re food and we ve created a groundbreaking ...,food a fast growing james beard award winning ...,experience with content management systems a m...,,0,1,0,other,internship,,,marketing,0
1,2,customer service cloud video production,nz auckland,success,,seconds the worlds cloud video production serv...,organised focused vibrant awesome do you have ...,what we expect from you your key responsibilit...,what you will get from usthrough being part of...,0,1,0,full time,not applicable,,marketing and advertising,customer service,0
2,3,commissioning machinery assistant cma,us ia wever,,,valor services provides workforce solutions th...,our client located in houston is actively seek...,implement pre commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,account executive washington dc,us dc washington,sales,,our passion for improving quality of life thro...,the company esri environmental systems researc...,education bachelor s or master s in gis busine...,our culture is anything but corporate we have ...,0,1,0,full time,mid senior level,bachelor s degree,computer software,sales,0
4,5,bill review manager,us fl fort worth,,,spotsource solutions llc is a global human cap...,job title itemization review managerlocation f...,qualifications rn license in the state of texa...,full benefits offered,0,1,1,full time,mid senior level,bachelor s degree,hospital health care,health care provider,0


In [24]:
print("\nðŸ”„ Preprocessing data...")

# Fill missing values
text_columns = ['company_profile', 'description', 'requirements', 'benefits', 
                'title', 'location', 'department', 'salary_range', 'employment_type',
                'required_experience', 'required_education', 'industry', 'function']

for col in text_columns:
    df[col] = df[col].fillna('')

# Combine textual fields
df['combined_text'] = (
    df['title'] + ' ' + 
    df['location'] + ' ' + 
    df['department'] + ' ' + 
    df['company_profile'] + ' ' + 
    df['description'] + ' ' + 
    df['requirements'] + ' ' + 
    df['benefits'] + ' ' +
    df['employment_type'] + ' ' +
    df['required_experience'] + ' ' +
    df['required_education'] + ' ' +
    df['industry'] + ' ' +
    df['function']
)

# Create additional features
df['text_length'] = df['combined_text'].apply(len)
df['word_count'] = df['combined_text'].apply(lambda x: len(x.split()))
df['has_company_logo'] = df['has_company_logo'].fillna(0).astype(int)
df['has_questions'] = df['has_questions'].fillna(0).astype(int)
df['telecommuting'] = df['telecommuting'].fillna(0).astype(int)
df['has_salary'] = df['salary_range'].apply(lambda x: 0 if x == '' else 1)

print("âœ… Preprocessing completed!")


ðŸ”„ Preprocessing data...
âœ… Preprocessing completed!


In [26]:
# Prepare features and target
X_text = df['combined_text']
X_numeric = df[['text_length', 'word_count', 'has_company_logo', 
                'has_questions', 'telecommuting', 'has_salary']]
y = df['fraudulent']

# Split data
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nðŸ“ˆ Data Split:")
print(f"Training set size: {len(X_train_text)}")
print(f"Test set size: {len(X_test_text)}")



ðŸ“ˆ Data Split:
Training set size: 14304
Test set size: 3576


In [27]:
print("\nðŸ”¤ Applying TF-IDF vectorization...")
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Combine features
X_train_combined = hstack([X_train_tfidf, X_train_num.values])
X_test_combined = hstack([X_test_tfidf, X_test_num.values])
from imblearn.over_sampling import SMOTE

sm = SMOTE()
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_combined, y_train)

print("Before SMOTE:", X_train_combined.shape)
print("After SMOTE:", X_train_resampled.shape)



ðŸ”¤ Applying TF-IDF vectorization...
Before SMOTE: (14304, 5006)
After SMOTE: (27222, 5006)


In [11]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced', n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)
}

results = {}

print("\n" + "="*60)
print("TRAINING AND EVALUATING MODELS")
print("="*60)

for name, model in models.items():
    print(f"\n Training {name}...")
    model.fit(X_train_combined, y_train)
    
    # Predictions
    y_pred = model.predict(X_test_combined)
    y_pred_proba = model.predict_proba(X_test_combined)[:, 1]
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'predictions': y_pred
    }
    
    print(f"\nðŸ“Š {name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraudulent']))
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(f"True Negatives: {cm[0][0]}, False Positives: {cm[0][1]}")
    print(f"False Negatives: {cm[1][0]}, True Positives: {cm[1][1]}")


TRAINING AND EVALUATING MODELS

 Training Logistic Regression...

ðŸ“Š Logistic Regression Results:
Accuracy: 0.9374
ROC-AUC Score: 0.9845

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      0.94      0.97      3403
  Fraudulent       0.43      0.92      0.59       173

    accuracy                           0.94      3576
   macro avg       0.71      0.93      0.78      3576
weighted avg       0.97      0.94      0.95      3576


Confusion Matrix:
[[3193  210]
 [  14  159]]
True Negatives: 3193, False Positives: 210
False Negatives: 14, True Positives: 159

 Training Random Forest...

ðŸ“Š Random Forest Results:
Accuracy: 0.9793
ROC-AUC Score: 0.9928

Classification Report:
              precision    recall  f1-score   support

  Legitimate       0.98      1.00      0.99      3403
  Fraudulent       1.00      0.57      0.73       173

    accuracy                           0.98      3576
   macro avg       0.99      0.79      0.

In [12]:
import os

# Create model directory if it doesn't exist
model_dir = 'model'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f" Created '{model_dir}' directory")

# Select the best model
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
best_model = results[best_model_name]['model']

print("\n" + "="*60)
print(f" BEST MODEL: {best_model_name}")
print("="*60)
print(f"Accuracy: {results[best_model_name]['accuracy']:.4f}")
print(f"ROC-AUC Score: {results[best_model_name]['roc_auc']:.4f}")

# Save the best model in the model folder
print("\n Saving model and vectorizer...")

with open(os.path.join(model_dir, 'best_model.pkl'), 'wb') as f:
    pickle.dump(best_model, f)

with open(os.path.join(model_dir, 'tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

# Save feature info
feature_info = {
    'numeric_features': ['text_length', 'word_count', 'has_company_logo', 
                         'has_questions', 'telecommuting', 'has_salary'],
    'model_name': best_model_name
}

with open(os.path.join(model_dir, 'model_info.pkl'), 'wb') as f:
    pickle.dump(feature_info, f)

print("\n Model training completed successfully!")
print(f"\n Files saved in '{model_dir}' folder:")
print(f"   - {model_dir}/best_model.pkl")
print(f"   - {model_dir}/tfidf_vectorizer.pkl")
print(f"   - {model_dir}/model_info.pkl")
print("\n Ready to run the Streamlit app!")

 Created 'model' directory

 BEST MODEL: Random Forest
Accuracy: 0.9793
ROC-AUC Score: 0.9928

 Saving model and vectorizer...

 Model training completed successfully!

 Files saved in 'model' folder:
   - model/best_model.pkl
   - model/tfidf_vectorizer.pkl
   - model/model_info.pkl

 Ready to run the Streamlit app!
