In [57]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import re
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# Load the training dataset
df = pd.read_csv('Train_data_Job.csv') 

# Drop irrelevant column
df = df.drop(columns=['job_id'])
# Remove duplicates
print("Number of duplicates:", df.duplicated().sum())
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)
# Display basic info to verify loading
print("Dataset Info:")
print(df.info())
print("\nFirst 5 rows:")
print(df.head())

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Number of duplicates: 181
Shape after removing duplicates: (14123, 17)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 14123 entries, 0 to 14302
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                14123 non-null  object
 1   location             13845 non-null  object
 2   department           4988 non-null   object
 3   salary_range         2258 non-null   object
 4   company_profile      11465 non-null  object
 5   description          14122 non-null  object
 6   requirements         12012 non-null  object
 7   benefits             8380 non-null   object
 8   telecommuting        14123 non-null  int64 
 9   has_company_logo     14123 non-null  int64 
 10  has_questions        14123 non-null  int64 
 11  employment_type      11392 non-null  object
 12  required_experience  8499 non-null   object
 13  required_education   7677 non-null   object
 14  industry             1

In [59]:
# Check class distribution
print("Class Distribution (Fraudulent vs Genuine):")
print(df['fraudulent'].value_counts(normalize=True))

Class Distribution (Fraudulent vs Genuine):
fraudulent
0    0.951285
1    0.048715
Name: proportion, dtype: float64


In [63]:
# Fill missing values for text columns
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
for col in text_columns:
    df[col] = df[col].fillna("Not Provided")

# Fill missing values for categorical columns
categorical_columns = ['location', 'department', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']
for col in categorical_columns:
    df[col] = df[col].fillna("Unknown")
# Verify missing values are handled
print("Missing Values After Handling:")
print(df.isnull().sum())

Missing Values After Handling:
title                      0
location                   0
department                 0
salary_range           11865
company_profile            0
description                0
requirements               0
benefits                   0
telecommuting              0
has_company_logo           0
has_questions              0
employment_type            0
required_experience        0
required_education         0
industry                   0
function                   0
fraudulent                 0
dtype: int64


In [79]:
# Enhanced text cleaning
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def advanced_preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply to text columns separately
df['description_clean'] = df['description'].apply(advanced_preprocess_text)
df['company_profile_clean'] = df['company_profile'].apply(advanced_preprocess_text)
df['title_clean'] = df['title'].apply(advanced_preprocess_text)

# TF-IDF for each text column
tfidf_desc = TfidfVectorizer(max_features=3000)
tfidf_comp = TfidfVectorizer(max_features=1000)
tfidf_title = TfidfVectorizer(max_features=500)
X_desc = tfidf_desc.fit_transform(df['description_clean'])
X_comp = tfidf_comp.fit_transform(df['company_profile_clean'])
X_title = tfidf_title.fit_transform(df['title_clean'])

# Process salary_range
df['has_salary_range'] = df['salary_range'].notnull().astype(int)
def parse_salary(salary):
    if pd.isna(salary):
        return 0  # Impute with 0 instead of median
    try:
        salary = salary.replace('$', '').replace(',', '')
        low, high = salary.split('-')
        return (float(low) + float(high)) / 2
    except:
        return 0

df['salary_avg'] = df['salary_range'].apply(parse_salary)

# Target encoding for categorical features
cat_features = ['employment_type', 'required_experience', 'industry']
for col in cat_features:
    fraud_rate = df.groupby(col)['fraudulent'].mean()
    df[f'{col}_target_enc'] = df[col].map(fraud_rate)

# Add fraud-specific features
df['desc_length'] = df['description'].apply(len)
#df['urgent_flag'] = df['description'].str.contains('urgent|immediate', case=False, na=False).astype(int)
df['urgent_flag'] = df['description'].str.contains('urgent|immediate|asap|now|pressing|hurry|limited time', case=False, na=False).astype(int)
# Combine all features
binary_features = ['telecommuting', 'has_company_logo', 'has_questions', 'has_salary_range']
X_binary = df[binary_features].values
X_extra = df[['salary_avg', 'desc_length', 'urgent_flag', 'employment_type_target_enc', 'required_experience_target_enc', 'industry_target_enc']].values
X = np.hstack((X_desc.toarray(), X_comp.toarray(), X_title.toarray(), X_binary, X_extra))
y = df['fraudulent'].values

# Apply SMOTE to training set
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_temp, y_temp)

# Train XGBoost with scale_pos_weight
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Evaluate with fixed threshold of 0.2
y_pred_proba_xgb = xgb_model.predict_proba(X_val)[:, 1]
y_pred_xgb = (y_pred_proba_xgb >= 0.2).astype(int)
print("XGBoost Classification Report (Threshold = 0.2):")
print(classification_report(y_val, y_pred_xgb, target_names=['Genuine', 'Fraudulent']))

# Save model and preprocessors
joblib.dump(xgb_model, 'job_fraud_xgb_revised_threshold_0_2.pkl')
joblib.dump(tfidf_desc, 'tfidf_desc_revised.pkl')
joblib.dump(tfidf_comp, 'tfidf_comp_revised.pkl')
joblib.dump(tfidf_title, 'tfidf_title_revised.pkl')


XGBoost Classification Report (Threshold = 0.2):
              precision    recall  f1-score   support

     Genuine       0.99      0.99      0.99      2687
  Fraudulent       0.80      0.85      0.82       138

    accuracy                           0.98      2825
   macro avg       0.90      0.92      0.91      2825
weighted avg       0.98      0.98      0.98      2825



['tfidf_title_revised.pkl']

In [81]:
# Load the test dataset
df_test = pd.read_csv('Test_data_Job.csv')

# Drop irrelevant column
df_test = df_test.drop(columns=['job_id'])

# Remove duplicates
df_test = df_test.drop_duplicates()

# Fill missing values for text columns
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
for col in text_columns:
    df_test[col] = df_test[col].fillna("Not Provided")

# Fill missing values for categorical columns
categorical_columns = ['location', 'department', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']
for col in categorical_columns:
    df_test[col] = df_test[col].fillna("Unknown")

# Apply the same text preprocessing
df_test['description_clean'] = df_test['description'].apply(advanced_preprocess_text)
df_test['company_profile_clean'] = df_test['company_profile'].apply(advanced_preprocess_text)
df_test['title_clean'] = df_test['title'].apply(advanced_preprocess_text)

# Transform using the same TF-IDF vectorizers
X_desc_test = tfidf_desc.transform(df_test['description_clean'])
X_comp_test = tfidf_comp.transform(df_test['company_profile_clean'])
X_title_test = tfidf_title.transform(df_test['title_clean'])

# Process salary_range
df_test['has_salary_range'] = df_test['salary_range'].notnull().astype(int)
df_test['salary_avg'] = df_test['salary_range'].apply(parse_salary)

# Target encoding for categorical features (using training set mappings)
for col in cat_features:
    df_test[f'{col}_target_enc'] = df_test[col].map(fraud_rate)  # fraud_rate from training set
    df_test[f'{col}_target_enc'] = df_test[f'{col}_target_enc'].fillna(fraud_rate.mean())  # Handle unseen categories

# Add fraud-specific features
df_test['desc_length'] = df_test['description'].apply(len)
df_test['urgent_flag'] = df_test['description'].str.contains('urgent|immediate|asap|now|pressing|hurry|limited time', case=False, na=False).astype(int)

# Combine all features
X_binary_test = df_test[binary_features].values
X_extra_test = df_test[['salary_avg', 'desc_length', 'urgent_flag', 'employment_type_target_enc', 'required_experience_target_enc', 'industry_target_enc']].values
X_test = np.hstack((X_desc_test.toarray(), X_comp_test.toarray(), X_title_test.toarray(), X_binary_test, X_extra_test))


# Predict on test set using XGBoost
y_pred_test = (xgb_model.predict_proba(X_test)[:, 1] >= 0.2).astype(int)



# Save test predictions
df_test['Prediction'] = y_pred_test
df_test['Fraud_Probability'] = xgb_model.predict_proba(X_test)[:, 1]
df_test.to_csv('test_predictions_xgb_final.csv', index=False)
print("Test predictions saved to 'test_predictions_xgb_final.csv'.")

Test predictions saved to 'test_predictions_xgb_final.csv'.


In [77]:
print("Total test samples:", len(df_test))
print("Predicted fraudulent postings:", df_test['Prediction'].sum())
print("Proportion predicted as fraudulent:", df_test['Prediction'].mean())

Total test samples: 3564
Predicted fraudulent postings: 207
Proportion predicted as fraudulent: 0.05808080808080808


In [83]:
fraud_rate = {}
cat_features = ['employment_type', 'required_experience', 'industry']
for col in cat_features:
    fraud_rate[col] = df.groupby(col)['fraudulent'].mean()
joblib.dump(fraud_rate, 'fraud_rate.pkl')

['fraud_rate.pkl']