In [None]:
!pip install pandas numpy autogluon geopy underthesea requests beautifulsoup4 regex tqdm sentencepiece transformers torch
# On Colab or a local machine, sentencepiece, transformers and torch are also needed; the command above already installs them.


In [None]:
# fadaml_jobs.py
import re, os, json, time
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# NLP / NER helpers
try:
    from underthesea import ner, word_tokenize
except Exception:
    ner = None
    def word_tokenize(s): return s.split()

# Geo helper (optional)
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

# AutoGluon
from autogluon.tabular import TabularDataset, TabularPredictor

# ---------- 1) Crawl or load data ----------
# Option A: load a CSV you already have.
# The rest of the script looks for a text column named 'description',
# 'raw_text', 'text' or 'job_description', and optionally a 'label' column.
DATA_PATH = "job_posts.csv"  # <-- change this to your own job-postings file
df = pd.read_csv(DATA_PATH)   # expecting columns: 'raw_text' or 'description', maybe 'label' if you have

# Option B: (template) simple scraper for a job-listing page (example).
# The template is kept inside a string literal so it is never executed;
# copy it out and adapt the CSS selectors to the target site if you need to crawl.
"""
import requests
from bs4 import BeautifulSoup
def crawl_job_page(url):
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Example selectors: adapt per site
    title = soup.select_one('h1') and soup.select_one('h1').get_text(strip=True)
    desc = soup.select_one('.job-description') and soup.select_one('.job-description').get_text(' ', strip=True)
    company = soup.select_one('.company') and soup.select_one('.company').get_text(strip=True)
    location = soup.select_one('.location') and soup.select_one('.location').get_text(strip=True)
    return {'title': title, 'description': desc, 'company': company, 'location': location}
"""

# ---------- 2) Preprocessing ----------
def clean_text(s):
    """Normalize a raw description value into lowercase plain text.

    Missing values (as judged by ``pd.isna``) become ``""``. Anything else is
    stringified, every ``<...>`` span is replaced by a space, whitespace runs
    are collapsed to single spaces, the ends are trimmed and the result is
    lowercased.
    """
    if pd.isna(s):
        return ""
    without_tags = re.sub(r'<.*?>', ' ', str(s))
    collapsed = re.sub(r'\s+', ' ', without_tags).strip()
    return collapsed.lower()

# Use the first known text column present in the frame; fail loudly if none is.
text_col = next(
    (
        name
        for name in ('description', 'raw_text', 'text', 'job_description')
        if name in df.columns
    ),
    None,
)
if text_col is None:
    raise RuntimeError("Không tìm cột text — đổi tên cột chứa job description thành 'description' or 'raw_text'")

# Cleaned copy of the text; every later extraction step reads this column.
df['description_clean'] = df[text_col].apply(clean_text)

# ---------- 3) NER + Tabular feature extraction ----------
# We'll extract: salary, location, company, job_title, employment_type, experience, remote_flag, skills_count
# Matches "<number> <unit>", e.g. "15 triệu", "1.5 tỷ", "15.000.000 vnđ", "500k", "1000$".
# Group 1 is still the whole "<number> <unit>" match; the named groups expose the parts.
# - The lookbehinds keep a match from starting in the middle of a longer number
#   (otherwise "15.000.000 vnđ" would match as "000.000 vnđ").
# - The trailing (?!\w) keeps the one-letter units from matching the first letter
#   of an ordinary word ("3 months", "2 kỹ sư").
salary_regex = re.compile(
    r'(?<!\d)(?<!\d[.,])'
    r'((?P<num>\d+(?:[.,]\d+)*)\s*(?P<unit>tỷ|triệu|vnđ|vnd|k|m|usd|\$))(?!\w)',
    flags=re.I,
)
salary_num_regex = re.compile(r'(\d[\d\.,]*)')


def _parse_salary_number(token):
    """Convert a matched digit string to a float, or NaN if it cannot be read safely.

    Separators followed by exactly three digits are treated as thousands
    separators ("15.000.000", "1,500"); a single separator followed by any other
    number of digits is treated as a decimal mark ("1.5", "12,5").
    """
    if re.fullmatch(r'\d+', token):
        return float(token)
    if re.fullmatch(r'[1-9]\d{0,2}(?:[.,]\d{3})+', token):
        return float(re.sub(r'[.,]', '', token))
    if re.fullmatch(r'\d+[.,]\d+', token):
        return float(token.replace(',', '.'))
    # Mixed or irregular separators (e.g. "1,234.56.7"): refuse to guess.
    return np.nan


def extract_salary(s):
    """Extract the first salary mention in *s*, expressed in million VND.

    Args:
        s: Text to scan. Non-string input yields NaN.

    Returns:
        A float in million VND for the units tỷ / triệu / m / k / vnđ / vnd.
        For usd / $ the raw number is returned unconverted (convert externally).
        NaN when no salary is found or the number cannot be parsed.
    """
    if not isinstance(s, str):
        return np.nan
    match = salary_regex.search(s)
    if match is None:
        return np.nan

    value = _parse_salary_number(match.group('num'))
    if np.isnan(value):
        return np.nan

    # The regex is case-insensitive, so normalize before comparing units.
    unit = match.group('unit').lower()
    if unit == 'tỷ':
        return value * 1000.0       # billion VND -> million VND
    if unit in ('triệu', 'm'):
        return value                # already in million VND
    if unit == 'k':
        return value / 1000.0       # thousand VND -> million VND
    if unit in ('vnđ', 'vnd'):
        return value / 1e6          # plain VND -> million VND
    # USD -> you may convert externally
    return value

def extract_remote(s):
    """Return 1 if *s* mentions remote work (case-insensitive keyword match), else 0."""
    hit = re.search(r'remote|làm việc từ xa|work from home|wfh', s, flags=re.I)
    return 1 if hit is not None else 0

def extract_job_title(s, fallback=None):
    """Return *fallback* when it is truthy, otherwise the empty string.

    The text *s* is currently not inspected; this is a placeholder for a real
    title heuristic.
    """
    return fallback or ''

# use underthesea NER if available (Vietnamese)
def _join_tagged_words(tagged, suffix):
    """Join the words whose NER tag ends with *suffix* ('ORG' or 'LOC'); None if there are none.

    Only the first and last element of each item are read, so both
    (word, tag) pairs and longer (word, ..., tag) tuples are accepted, and
    IOB-prefixed tags such as 'B-ORG' / 'I-ORG' count as 'ORG'.
    """
    words = [item[0] for item in tagged if str(item[-1]).endswith(suffix)]
    return ' '.join(words) if words else None


def _experience_level(s):
    """Map seniority keywords in *s* to 'senior', 'junior' or 'mid'; None if nothing matches.

    Short keywords are anchored so they do not fire inside unrelated words
    ("sr" in "israel", "mid" in "humidity", "fresh" in "refresh") and the
    year patterns do not fire inside longer numbers ("13 năm").
    """
    if re.search(r'senior|\bsr\b|trên 5 năm|(?<!\d)5\+ năm|thâm niên', s, flags=re.I):
        return 'senior'
    if re.search(r'junior|\bfresh|mới tốt nghiệp|(?<!\d)0-1 năm', s, flags=re.I):
        return 'junior'
    if re.search(r'\bmid(?:dle)?\b|(?<!\d)2-4 năm|trên 2 năm|(?<!\d)3 năm', s, flags=re.I):
        return 'mid'
    return None


def extract_company_location_experience(s):
    """Extract company, location and experience level from a cleaned description.

    Args:
        s: Description text.

    Returns:
        dict with keys 'company', 'location' and 'experience_level'. Company
        and location come from the module-level ``ner`` callable and are None
        when it is unavailable, fails, or finds nothing. 'experience_level' is
        'senior' / 'junior' / 'mid' / None from keyword heuristics.
    """
    ent = {'company': None, 'location': None}
    if ner:
        try:
            # underthesea's ner yields one tuple per word with the entity tag
            # last (IOB style, e.g. 'B-ORG'); unpacking it as a 2-tuple raised
            # and left both fields permanently None.
            tagged = ner(s)
            ent['company'] = _join_tagged_words(tagged, 'ORG')
            ent['location'] = _join_tagged_words(tagged, 'LOC')
        except Exception:
            # Best effort: a tagging failure on one text must not abort the whole run.
            ent['company'] = None
            ent['location'] = None
    ent['experience_level'] = _experience_level(s)
    return ent

# Run every extractor once per description, collecting one record per row,
# then write the records back as new columns (row order is preserved).
descriptions = df['description_clean'].fillna('').tolist()
records = []
for text in tqdm(descriptions, desc='extract features', total=len(df)):
    entities = extract_company_location_experience(text)
    # skills_count: number of distinct skill keywords that appear in the text
    found_skills = re.findall(r'\b(python|java|sql|excel|aws|docker|react|node|ml|ai|tensorflow|pytorch)\b', text, flags=re.I)
    records.append({
        'salary_million': extract_salary(text),
        'is_remote': extract_remote(text),
        'company_extracted': entities.get('company'),
        'location_extracted': entities.get('location'),
        'experience_level': entities.get('experience_level'),
        'skills_count': len(set(found_skills)),
    })

for column in (
    'salary_million',
    'is_remote',
    'company_extracted',
    'location_extracted',
    'experience_level',
    'skills_count',
):
    df[column] = [record[column] for record in records]

# ---------- 4) Feature enrichment (optional geospatial) ----------
# If you have location strings, use GeoPy to get lat/lon, then nearest big-city features
geolocator = Nominatim(user_agent="fadaml_jobs_geocoder")


def geocode_safe(loc):
    """Geocode a location string inside Vietnam, never raising.

    Args:
        loc: Location text. Missing, non-string or blank values are not sent
            to the geocoder.

    Returns:
        (latitude, longitude) as floats, or (nan, nan) when the input is
        unusable, nothing is found, or the lookup fails.
    """
    # A blank query would become ", Vietnam" and come back as a country-level
    # hit, i.e. a made-up coordinate for a row that has no location at all.
    if not isinstance(loc, str) or not loc.strip():
        return (np.nan, np.nan)
    try:
        res = geolocator.geocode(loc + ", Vietnam", timeout=10)
    except Exception:
        # Deliberate best effort: network errors, timeouts and service errors
        # all degrade to "no coordinates" instead of aborting the pipeline.
        return (np.nan, np.nan)
    if res is None:
        return (np.nan, np.nan)
    return (res.latitude, res.longitude)

# Geocode each distinct location once (cached), then broadcast the result to rows.
# The public Nominatim service expects roughly one request per second at most,
# so pause between real lookups instead of sending them back to back.
GEOCODE_MIN_DELAY_SECONDS = 1.0
if 'location_extracted' in df.columns:
    geocache = {}
    latitudes, longitudes = [], []
    for loc in tqdm(df['location_extracted'].fillna('').unique(), desc='geocoding unique locs'):
        if not str(loc).strip():
            # Nothing to look up; do not spend a request on a blank query.
            geocache[loc] = (np.nan, np.nan)
            continue
        geocache[loc] = geocode_safe(loc)
        time.sleep(GEOCODE_MIN_DELAY_SECONDS)
    for loc in df['location_extracted'].fillna(''):
        lat, lon = geocache.get(loc, (np.nan, np.nan))
        latitudes.append(lat)
        longitudes.append(lon)
    df['lat'] = latitudes
    df['lon'] = longitudes

# ---------- 5) Data cleaning, outlier removal, dedup ----------
# Keep the first row for each distinct cleaned description and renumber the index.
# No filtering of rows that lack core information is performed here.
df = df.drop_duplicates(subset=['description_clean']).reset_index(drop=True)

# ---------- 6) Prepare dataset for AutoGluon ----------
# Map to features list (similar role as Table 4 in paper)
features = [
    'description_clean',  # text
    'salary_million',      # numeric
    'company_extracted',   # categorical
    'location_extracted',
    'experience_level',
    'is_remote',
    'skills_count',
    'lat', 'lon',          # optional geospatial
]
# If the data already has a label column it is kept; otherwise one is derived below.
label_col = 'label'  # expected 0/1 where 1 = fake (adapt)
if label_col not in df.columns:
    # Heuristic label: salary far from the typical salary of the same location,
    # or no company and no location at all.
    # NOTE(review): this label is a deterministic function of columns that are
    # also in `features`, so downstream scores measure how well the model
    # recovers this rule, not how well it detects real fake postings.
    df['salary_million_filled'] = df['salary_million'].fillna(df['salary_million'].median())
    overall_median = df['salary_million_filled'].median()
    median_by_loc = (
        df.groupby('location_extracted')['salary_million_filled']
        .median()
        .fillna(overall_median)
    )
    # groupby drops rows whose location is missing, so those rows map to NaN;
    # fall back to the overall median so the salary rule still applies to them.
    df['median_loc'] = df['location_extracted'].map(median_by_loc).fillna(overall_median)

    salary = df['salary_million_filled']
    too_high = salary > 4 * df['median_loc']
    too_low = salary < 0.25 * df['median_loc']
    no_identity = df['company_extracted'].isna() & df['location_extracted'].isna()
    df[label_col] = (too_high | too_low | no_identity).astype(int)

train_df = df[features + ['label']].copy()
train_df = train_df.sample(frac=1.0, random_state=42).reset_index(drop=True)  # shuffle

# Hold out 20% of the rows so the reported metrics are not computed on the
# same data the model was fitted on. `train_df` stays the full shuffled frame.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

class_counts = train_df['label'].value_counts()
# Stratify only when every class can appear on both sides of the split.
can_stratify = (
    len(class_counts) > 1
    and class_counts.min() >= 2
    and len(train_df) * 0.2 >= len(class_counts)
)
fit_df, holdout_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'] if can_stratify else None,
)

# convert to TabularDataset
train_tab = TabularDataset(fit_df.reset_index(drop=True))
test_tab = TabularDataset(holdout_df.reset_index(drop=True))

# ---------- 7) Train AutoGluon ----------
# The preset's default text handling is used; the n-gram / max_features
# settings of the reference paper are not configured explicitly here.
save_path = "Autogluon_FADAML_jobs"  # where models are stored
predictor = TabularPredictor(label='label', path=save_path, eval_metric='f1').fit(
    train_data=train_tab,
    presets=['best_quality'],  # strong config; use 'medium_quality_faster_train' for speed
    time_limit=3600  # allow up to 1 hour; change as you need
)

# ---------- 8) Evaluate on the holdout ----------
holdout_eval = predictor.evaluate(test_tab)
train_eval = holdout_eval  # original name kept so later cells keep working
print(holdout_eval)
y_pred = predictor.predict(test_tab)
print(classification_report(test_tab['label'], y_pred))

# ---------- 9) Feature importance (permutation) and ablation ----------
# Feature importance of the fitted predictor, computed on `train_tab`;
# show the first 20 rows of the result.
fi = predictor.feature_importance(train_tab)
print("Feature importance (top 20):", fi.head(20), sep="\n")

# Ablation study example: remove geospatial features, retrain quickly (short time_limit)
# Ablation: refit without the 'lat' / 'lon' columns under a shorter time budget,
# storing the models next to the main run, then score on the same ablation data.
ablation_train = train_tab.drop(['lat', 'lon'], axis=1)
predictor_ab = TabularPredictor(
    label='label',
    path=save_path + "_no_geo",
    eval_metric='f1',
).fit(
    train_data=ablation_train,
    presets=['medium_quality'],
    time_limit=600,
)
ablation_scores = predictor_ab.evaluate(ablation_train)
print("Ablation (no geo) eval:")
print(ablation_scores)

# Save processed dataset for inspection
# Write the model-ready frame (features + label) to disk, without the index column.
train_df.to_csv(
    "processed_jobs_multimodal.csv",
    index=False,
)
print("Saved processed dataset -> processed_jobs_multimodal.csv")
