In [1]:
%pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.0/12.8 MB 281.8 kB/s eta 0:00:46
      -------------------------------------- 0.2/12.8 MB 952.6 kB/s eta 0:00:14
     - -------------------------------------- 0.6/12.8 MB 2.7 MB/s eta 0:00:05
     --- ------------------------------------ 1.0/12.8 MB 3.7 MB/s eta 0:00:04
     ---- ----------------------------------- 1.5/12.8 MB 4.6 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.8 MB 5.1 MB/s eta 0:00:03
     ------- -------------------------------- 2.4/12.8 MB 5.9 MB/s eta 0:00:02
     -------- ------------------------------

In [1]:
import re

import pandas as pd
import spacy
# Load the small English spaCy pipeline once; the shared `nlp` object is what
# clean_spacy() calls to tokenize and lemmatize each posting.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read the raw postings, discard rows lacking a title or a description, and
# build a single "text" column holding "<title> <description>".
df = (
    pd.read_csv("tech_jobs_raw.csv")
    .dropna(subset=["title", "description"])
    .assign(text=lambda frame: frame["title"] + " " + frame["description"])
)

In [4]:
# Cleaning function
def _join_lemmas(doc):
    """Return the space-joined lemmas of the alphabetic, non-stop-word tokens of ``doc``."""
    return " ".join(
        token.lemma_ for token in doc
        if not token.is_stop and token.is_alpha
    )


def clean_spacy(text):
    """Normalize one posting for keyword/feature work.

    The text is lower-cased, run through the shared ``nlp`` pipeline, and reduced
    to the lemmas of tokens that are purely alphabetic and not stop words.

    Parameters
    ----------
    text : str
        Raw posting text.

    Returns
    -------
    str
        Lemmas separated by single spaces (empty string if nothing survives).
    """
    return _join_lemmas(nlp(text.lower()))


# Apply cleaning
# nlp.pipe processes the texts in batches instead of invoking the full pipeline
# once per row, which is the recommended way to run spaCy over a whole column.
# The parser and NER components are switched off for this call only: cleaning
# reads nothing but lemma_, is_stop and is_alpha, so their output is never used.
df["clean_text"] = [
    _join_lemmas(doc)
    for doc in nlp.pipe(df["text"].str.lower(), disable=["parser", "ner"])
]

In [6]:
# Persist the frame (including the new clean_text column) without the index column
df.to_csv(path_or_buf="tech_jobs_cleaned.csv", index=False)
print("Done! Cleaned data saved to tech_jobs_cleaned.csv")

Done! Cleaned data saved to tech_jobs_cleaned.csv


In [8]:
# We labeled the cleaned data manually and saved it to a file called tech_jobs_labeled.csv.
# NOTE(review): the cells below read the "text", "clean_text", "compensation" and
# "is_scam" columns from this file — confirm the labeled CSV actually carries them.
df_labeled = pd.read_csv("tech_jobs_labeled.csv")

In [12]:
# scam keyword lists based on the description
hard_flags = [
    "quick money", "get paid", "click here", "zelle", "venmo",
    "no experience", "send resume", "easy job", "start today", "ssn",
    "cash job", "flexible schedule", "personal assistant", "urgent hire",
    "no background check", "hiring now", "apply fast", "bonus pay", "work today",
    "daily payout", "text me", "hmu", "cash app", "bitcoin"
]

soft_flags = [
    "fun work", "work from home", "startup", "bonus", "remote",
    "side hustle", "freedom", "earn daily", "quick cash", "instant pay",
    "just need", "willing to pay", "dm me", "pay out", "commission only"
]


def _count_whole_phrase(text, phrase):
    """Count non-overlapping occurrences of ``phrase`` in ``text`` that are not part of a longer word."""
    # The look-arounds require a non-word character (or the string edge) on both
    # sides, so "ssn" does not fire inside "carelessness" nor "zelle" in "gazelle".
    pattern = r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"
    return len(re.findall(pattern, text))


# suspicious score calculation
def calc_suspicious_score(text):
    """Score a posting by how many scam-associated phrases it contains.

    The input is converted with ``str()`` and lower-cased. Every whole-phrase
    occurrence of an entry in ``hard_flags`` adds 2 points and every occurrence
    of an entry in ``soft_flags`` adds 1 point. The two lists are scored
    independently, so text containing "bonus pay" earns the hard points for
    "bonus pay" and the soft point for "bonus".

    Parameters
    ----------
    text : object
        Posting text; non-string values (e.g. NaN) are stringified first.

    Returns
    -------
    int
        The total score, 0 when no phrase matches.
    """
    text = str(text).lower()
    hard_hits = sum(_count_whole_phrase(text, kw) for kw in hard_flags)
    soft_hits = sum(_count_whole_phrase(text, kw) for kw in soft_flags)
    return 2 * hard_hits + soft_hits

# vague flag calculation
def is_vague(text):
    """Flag short postings that sound vague or carry no concrete job detail.

    The input is stringified and lower-cased. A posting of 25 or more
    whitespace-separated words is never flagged. A shorter posting is flagged
    (1) when it contains any of the vague phrases, or when it contains none of
    the detail terms; otherwise the result is 0. Matching is by substring.

    Returns
    -------
    int
        1 if the posting is considered vague, else 0.
    """
    lowered = str(text).lower()

    # Long postings are exempt regardless of wording.
    if len(lowered.split()) >= 25:
        return 0

    vague_phrases = (
        "need help", "looking for someone", "personal assistant",
        "quick cash", "just need", "easy job", "should be reliable",
        "make money",
    )
    for phrase in vague_phrases:
        if phrase in lowered:
            return 1

    detail_terms = (
        "technician", "software", "engineer", "laboratory",
        "install", "support", "analysis", "requirements",
        "experience", "responsibilities",
    )
    for term in detail_terms:
        if term in lowered:
            # Short but concrete: at least one detail term is present.
            return 0

    # Short and without any detail term.
    return 1

# Apply the functions to your dataframe
# Both features are computed from the raw "text" column. Many flag phrases
# contain stop words or inflected forms ("no experience", "get paid",
# "work from home", "hiring now"); clean_spacy() removes stop words and
# lemmatizes, so scoring its output could never match those phrases.
# NOTE(review): assumes clean_text in the labeled file came from clean_spacy — confirm.
df_labeled["suspicious_score"] = df_labeled["text"].apply(calc_suspicious_score)
df_labeled["is_vague"] = df_labeled["text"].apply(is_vague)

In [14]:
# Select only the features you want
# .copy() makes df_reduced an independent frame, so later edits to it neither
# touch df_labeled nor raise SettingWithCopyWarning.
df_reduced = df_labeled[[
    "clean_text",
    "suspicious_score",
    "is_vague",
    "compensation",
    "is_scam"
]].copy()

# Export to CSV
# The message is built from the path actually written; it previously reported
# "tech_jobs_reduced.csv" while the file on disk was tech_jobs.csv.
reduced_csv_path = "tech_jobs.csv"
df_reduced.to_csv(reduced_csv_path, index=False)
print(f"File saved as {reduced_csv_path}")

# Preview the final structure (must be the cell's last expression to be displayed)
df_reduced.head()

File saved as tech_jobs_reduced.csv
