In [None]:
# 1. Import libraries
from google.colab import drive
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import joblib
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVC

In [None]:
# 2. Mount Google Drive
# Makes files stored in the user's Drive visible under /content/drive in Colab.
mount_point = '/content/drive'
drive.mount(mount_point)

# 3. Load Dataset
# Read the job-posting CSV from Drive into a DataFrame.
file_path = "/content/drive/MyDrive/master_job_scam_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# 4. Fill Missing Values
# Free-text / categorical columns whose gaps are replaced by a placeholder string.
text_cols = [
    'title', 'company_profile', 'description', 'requirements', 'benefits',
    'location', 'employment_type', 'industry',
]

# salary_range receives the same placeholder; it is handled in the same pass
# but kept out of text_cols so that list keeps its original contents.
for col in [*text_cols, 'salary_range']:
    df[col] = df[col].fillna("Not Provided")

In [None]:
# 5. Combine Text
# Join the descriptive columns into one space-separated string per row.
# Accumulating left to right keeps the same evaluation order as a chained `+`.
combined_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']

text_data = df[combined_cols[0]]
for part in combined_cols[1:]:
    text_data = text_data + ' ' + df[part]

df['text_data'] = text_data

In [None]:
# 6. Clean Text
def clean_text(text):
    """Normalize a raw posting string into lowercase, letters-only text.

    Steps, in order: lowercase, drop any token starting with ``http``,
    remove every character that is not an ASCII letter or whitespace,
    then collapse whitespace runs to single spaces and trim the ends.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (missing values) are treated as
        empty text; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing survives the cleaning.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only
    # float that is not equal to itself, so no pandas/numpy call is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    # Text is already lowercase, so a-z covers every letter that can remain.
    # Characters are deleted (not replaced by a space), so "full-time"
    # becomes "fulltime".
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner over every combined posting text, one row at a time.
df['clean_text'] = df['text_data'].map(clean_text)

In [None]:
# 7. Prepare X & y
# X: cleaned posting text; y: the 'fraudulent' label column.
X, y = df['clean_text'], df['fraudulent']

In [None]:
# 8. Train-Test Split
# Hold out 20% of the rows for testing; stratify on y and fix the seed
# so the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# 9. TF-IDF Vectorizer
# Unigrams and bigrams, English stop words removed, at most 10000 features.
tfidf_params = {
    'stop_words': 'english',
    'max_features': 10000,
    'ngram_range': (1, 2),
}
tfidf = TfidfVectorizer(**tfidf_params)

# Fit on the training split only, then reuse the fitted vectorizer for the
# test split so no test text influences the learned vocabulary/weights.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# 10. Balance the Data (Oversampling)
# Only the training matrix is resampled; the test split is left untouched.
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X_train_tfidf, y_train)

# Report the class counts before and after resampling.
for caption, labels in (("Before Balancing:", y_train), ("After Balancing:", y_bal)):
    print(caption, Counter(labels))

In [None]:
# 11. Train SVM Model
# Seed the classifier so repeated runs produce the same model, in line with the
# random_state=42 already used for the train/test split and the oversampler.
svm_model = LinearSVC(random_state=42)
print("\nTraining SVM...")
# Fit on the balanced training matrix; the fitted estimator is the cell's output.
svm_model.fit(X_bal, y_bal)

In [None]:
# 12. Evaluate SVM
# Predict on the held-out TF-IDF test matrix (never resampled).
y_pred = svm_model.predict(X_test_tfidf)

# Headings previously contained mis-decoded bytes; the intended emoji are restored.
# Confusion matrix: rows are true labels, columns are predicted labels.
print("\n🔍 SVM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Per-class precision / recall / F1 plus overall accuracy.
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))

In [1]:
# 📌 13. Save the Best Model
# Persist the trained classifier and the fitted vectorizer together: predictions
# need both, and new text must go through the same clean_text step before the
# vectorizer, since the vectorizer was fitted on cleaned text.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(svm_model, "fake_job_detector_svm.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("\n✅ SVM Model and TF-IDF saved successfully!")


Mounted at /content/drive
Before Balancing: Counter({0: 13369, 1: 8683})
After Balancing: Counter({0: 13369, 1: 13369})

Training SVM...

🔍 SVM Confusion Matrix:
[[3331   12]
 [  43 2128]]

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3343
           1       0.99      0.98      0.99      2171

    accuracy                           0.99      5514
   macro avg       0.99      0.99      0.99      5514
weighted avg       0.99      0.99      0.99      5514


✅ SVM Model and TF-IDF saved successfully!
