In [1]:
"""
Resume Classification (English, LLM + smart schema)
- Loads CSV (flexible columns: tries Category/category/... and Resume/Cleaned_Resume/summary/...)
- EDA (counts/plots/wordcloud) — plot failures are reported and skipped; matplotlib and wordcloud must still be installed
- Fine-tunes DistilBERT
- Saves model/tokenizer/labels.json to llm_model_en/

Deps:
  pip install pandas scikit-learn matplotlib wordcloud transformers datasets accelerate torch
"""

import os, re, sys, json, warnings
from collections import Counter
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# plotting (marked optional, but imported unconditionally)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)

# =================== CONFIG ===================
# CSV to load. The RESUME_DATASET_PATH environment variable, when set,
# overrides the hard-coded default below.
DATASET_PATH = os.environ.get(
    "RESUME_DATASET_PATH",
    "/Users/reemabalharith/Desktop/Capston Project/english_cv_dataset_full_6800_enriched.csv"
)
# Seed shared by the sample preview, the train/test split and the Trainer.
RANDOM_STATE = 42
# Fraction of rows held out by train_test_split.
TEST_SIZE = 0.2
# Output directory for the EDA images (bar chart, pie chart, word cloud).
PLOTS_DIR = "plots_en"
# Output directory for the saved model, tokenizer and labels.json; the
# Trainer's own output_dir is the "outputs" sub-directory of it.
MODEL_DIR  = "llm_model_en"

# Checkpoint name given to both the tokenizer and the classification model.
MODEL_NAME = "distilbert-base-uncased"
# Tokenizer max_length: texts are truncated and padded to this many tokens.
MAX_LEN = 256
# Passed to TrainingArguments as num_train_epochs.
EPOCHS = 10
# Per-device batch sizes for training and evaluation respectively.
BATCH_TRAIN = 16
BATCH_EVAL = 32
# Passed to TrainingArguments as learning_rate.
LR = 2e-5

# Both output directories are created as soon as this code runs.
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# =================== Utils ===================
# Candidate column names, tried in order; the first one present in the
# DataFrame wins (see resolve_columns).
POSSIBLE_LABEL_COLS = ["Category", "category", "Label", "label", "target"]
POSSIBLE_TEXT_COLS  = ["Cleaned_Resume", "Resume", "summary", "Summary", "text", "body", "description"]

def load_dataset(path: str) -> pd.DataFrame:
    """Read the CSV at ``path`` into a DataFrame.

    If nothing exists at ``path`` an error line is printed and the process
    exits with status 1 (``SystemExit``).
    """
    if os.path.exists(path):
        return pd.read_csv(path)

    print(f"[ERROR] Dataset not found at: {path}")
    sys.exit(1)

def resolve_columns(df: pd.DataFrame):
    """Pick the label column and the text column to train on.

    The label column is the first name in POSSIBLE_LABEL_COLS present in
    ``df``; the text column is the first name in POSSIBLE_TEXT_COLS present
    in ``df``. When no text column matches, the available structured fields
    (skills, tools, projects, ...) are joined with single spaces into a new
    ``_COMBINED_TEXT_`` column, which is added to ``df`` in place.

    Args:
        df: The loaded dataset. Mutated only in the fallback case.

    Returns:
        A ``(label_col, text_col)`` tuple of column names.

    Raises:
        SystemExit: If no label column is found, or if neither a text
            column nor any of the fallback fields is present.
    """
    label_col = next((c for c in POSSIBLE_LABEL_COLS if c in df.columns), None)
    if label_col is None:
        raise SystemExit(f"[ERROR] Couldn't find label column. Tried: {POSSIBLE_LABEL_COLS}. Found: {df.columns.tolist()}")

    text_col = next((c for c in POSSIBLE_TEXT_COLS if c in df.columns), None)
    if text_col is not None:
        return label_col, text_col

    fallback_fields = (
        "summary", "skills", "tools", "projects", "education_degree", "education_university",
        "education_year", "years_experience", "last_company", "job_title", "city", "keywords", "name",
    )
    parts = [c for c in fallback_fields if c in df.columns]
    if not parts:
        raise SystemExit(f"[ERROR] Couldn't find any text columns. Tried: {POSSIBLE_TEXT_COLS} or parts list.")

    # Blank out cells that were missing in the source. Calling fillna("")
    # after astype(str) is too late: the missing values have typically
    # already been rendered as the literal text "nan" and would end up in
    # the combined resume text.
    fields = df[parts]
    as_text = fields.astype(str).where(fields.notna(), "")
    df["_COMBINED_TEXT_"] = as_text.agg(" ".join, axis=1)
    return label_col, "_COMBINED_TEXT_"

def simple_clean(text: str) -> str:
    """Normalise one resume string for tokenisation.

    Non-string input becomes "" when it is missing (``pd.isna``) and
    ``str(text)`` otherwise. HTML-like tags are replaced by a space, the
    mis-encoded bullet sequence is restored to "•", escaped and real
    CR/LF characters become spaces, and runs of whitespace collapse to a
    single space before the result is stripped.
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    cleaned = re.sub(r"<[^>]+>", " ", text).replace("â€¢", "•")
    # Escaped sequences first (literal backslash + letter), then the real
    # control characters.
    for newline_token in ("\\r", "\\n", "\r", "\n"):
        cleaned = cleaned.replace(newline_token, " ")
    return re.sub(r"\s+", " ", cleaned).strip()

def top_frequent_words(texts, topn=20):
    """Count lower-cased, whitespace-split words, skipping English stop words.

    Returns a ``(most_common, counter)`` pair: the ``topn`` most frequent
    ``(word, count)`` tuples and the full ``Counter``.
    """
    lowered = (word.lower() for word in " ".join(texts).split())
    freq = Counter(word for word in lowered if word not in ENGLISH_STOP_WORDS)
    return freq.most_common(topn), freq

def plot_category_bar(df: pd.DataFrame, label_col: str, out_path: str):
    """Save a bar chart of row counts per value of ``label_col`` to ``out_path``."""
    fig, ax = plt.subplots(figsize=(12, 6))
    per_category = df[label_col].value_counts()
    per_category.plot(kind="bar", ax=ax)
    ax.set_title("Resume Count per Category")
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    # Slant the category names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def plot_category_pie(df: pd.DataFrame, label_col: str, out_path: str, topk=7):
    """Save a pie chart of the ``topk`` most frequent categories.

    All remaining categories are merged into a single "Others" slice. That
    slice is only drawn when it is non-empty, so a dataset with at most
    ``topk`` categories does not get a zero-width "Others 0.0%" wedge.

    Args:
        df: Dataset containing ``label_col``.
        label_col: Name of the category column.
        out_path: Image file to write.
        topk: Number of individual categories to show.
    """
    counts = df[label_col].value_counts()
    top = counts.nlargest(topk)
    others = counts.sum() - top.sum()
    labels = top.index.tolist()
    sizes = top.tolist()
    if others > 0:
        labels.append("Others")
        sizes.append(others)
    plt.figure(figsize=(8, 8))
    plt.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=140)
    plt.title(f"Resume Category Distribution (Top {topk})")
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

def plot_wordcloud(freq_counter, out_path: str):
    """Render a word cloud from a word -> frequency mapping and save it to ``out_path``."""
    cloud = WordCloud(width=1200, height=600, background_color="white")
    cloud.generate_from_frequencies(freq_counter)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title("Most Common Words")
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

# =================== Main ===================
def main():
    """Run the whole pipeline: load, explore, fine-tune, evaluate, save.

    Steps (numbers match the printed progress lines):
      1. Load the CSV at DATASET_PATH and resolve the label/text columns.
      2. Print a preview and the DataFrame summary.
      3. Drop duplicate rows and rows without a label.
      4. Clean the text into a ``Cleaned_Resume`` column.
      5. Print category counts and save bar/pie charts (best effort).
      6. Print the most frequent words and save a word cloud (best effort).
      7. Stratified train/test split, tokenisation, fine-tuning of
         MODEL_NAME, then evaluation on the held-out split.
      8. Save the model, tokenizer and ``labels.json`` under MODEL_DIR.

    NOTE(review): the run recorded with this notebook shows resume texts
    beginning with "Category: <label>." and every evaluation metric equal
    to 1.0. That looks like the label leaking into the input text; confirm
    and strip that prefix before trusting the reported scores.
    """
    print("[1] Loading dataset:", DATASET_PATH)
    df = load_dataset(DATASET_PATH)
    label_col, text_col = resolve_columns(df)

    print("\n[2] Basic info:")
    print(df[[label_col, text_col]].head())
    # DataFrame.info() prints its report itself and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None" line).
    df.info()

    print("\n[3] Drop duplicates…")
    before = len(df)
    # NOTE(review): this compares every column. The recorded df.info()
    # output lists an `id_x` column, so identical resumes with different
    # ids may never be removed; consider subset=[label_col, text_col].
    df = df.drop_duplicates().copy()
    after = len(df)
    print(f"Removed {before - after} duplicates. Remaining: {after}")

    # A missing label cannot be trained on, and a NaN mixed with string
    # labels makes the sorted() call in step 7 raise TypeError.
    n_unlabeled = int(df[label_col].isna().sum())
    if n_unlabeled:
        df = df.dropna(subset=[label_col]).copy()
        print(f"[WARN] Dropped {n_unlabeled} rows with a missing label.")

    print("\n[4] Cleaning text…")
    df["Cleaned_Resume"] = df[text_col].apply(simple_clean)
    print(df[[label_col, "Cleaned_Resume"]].sample(min(3, len(df)), random_state=RANDOM_STATE))

    print("\n[5] Category counts:")
    print(df[label_col].value_counts())
    # Plots are a convenience: a failure is reported but never stops training.
    try:
        plot_category_bar(df, label_col, os.path.join(PLOTS_DIR, "category_bar.png"))
        plot_category_pie(df, label_col, os.path.join(PLOTS_DIR, "category_pie.png"))
    except Exception as e:
        print("[WARN] Plotting failed (ok):", e)

    print("\n[6] Frequent words & wordcloud…")
    top20, freq = top_frequent_words(df["Cleaned_Resume"].tolist(), topn=20)
    print("Top 20 words:", top20)
    try:
        plot_wordcloud(freq, os.path.join(PLOTS_DIR, "wordcloud.png"))
    except Exception as e:
        print("[WARN] Wordcloud failed (ok):", e)

    print("\n[7] Train/Test split + LLM fine-tuning")
    X = df["Cleaned_Resume"]
    y = df[label_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # Sorted label names give a stable name <-> integer id mapping.
    labels = sorted(y.unique().tolist())
    label2id = {lbl: i for i, lbl in enumerate(labels)}
    id2label = {i: lbl for i, lbl in enumerate(labels)}

    train_df = pd.DataFrame({"text": X_train.values, "label": [label2id[v] for v in y_train.values]})
    test_df  = pd.DataFrame({"text": X_test.values,  "label": [label2id[v] for v in y_test.values]})
    train_ds = Dataset.from_pandas(train_df, preserve_index=False)
    test_ds  = Dataset.from_pandas(test_df,  preserve_index=False)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    def tok_fn(batch):
        # Every example is truncated/padded to exactly MAX_LEN tokens.
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

    train_tok = train_ds.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
    test_tok  = test_ds.map(tok_fn,  batched=True).remove_columns(["text"]).with_format("torch")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(labels), id2label=id2label, label2id=label2id
    )

    # ====== Legacy-friendly TrainingArguments (no evaluation_strategy/save_strategy) ======
    args = TrainingArguments(
        output_dir=os.path.join(MODEL_DIR, "outputs"),
        learning_rate=LR,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=EPOCHS,
        logging_steps=50,
        seed=RANDOM_STATE,
    )

    def compute_metrics(eval_pred):
        # Weighted averages, so each class contributes according to its support.
        logits, y_true_ids = eval_pred
        y_pred_ids = np.argmax(logits, axis=-1)
        acc = accuracy_score(y_true_ids, y_pred_ids)
        p, r, f1, _ = precision_recall_fscore_support(y_true_ids, y_pred_ids, average="weighted", zero_division=0)
        return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=test_tok,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Manual evaluation after training.
    eval_res = trainer.evaluate()
    print("\n[Eval Metrics]:", eval_res)

    # Second pass over the test split to get per-example predictions for
    # the per-class report.
    preds_logits = trainer.predict(test_tok).predictions
    preds_idx = np.argmax(preds_logits, axis=-1)
    y_true = [id2label[i] for i in test_df["label"]]
    y_pred = [id2label[i] for i in preds_idx]
    print(f"\nAccuracy: {accuracy_score(y_true, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

    print("\n[8] Save artifacts (HF model + tokenizer + labels.json)…")
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    with open(os.path.join(MODEL_DIR, "labels.json"), "w", encoding="utf-8") as f:
        json.dump({"labels": labels}, f, ensure_ascii=False, indent=2)
    print(f"Saved model -> {MODEL_DIR}")

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


[1] Loading dataset: /Users/reemabalharith/Desktop/Capston Project/english_cv_dataset_full_6800_enriched.csv

[2] Basic info:
               Category                                             Resume
0     AI / Data Science  Category: AI / Data Science. Experience: 1 yea...
1  Software Engineering  Category: Software Engineering. Experience: 7 ...
2         Cybersecurity  Category: Cybersecurity. Experience: 1 years. ...
3            Networking  Category: Networking. Experience: 8 years. Ski...
4                DevOps  Category: DevOps. Experience: 9 years. Skills:...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6800 entries, 0 to 6799
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id_x             6800 non-null   int64  
 1   Category         6800 non-null   object 
 2   Resume           6800 non-null   object 
 3   Skills           5000 non-null   object 
 4   YearsExperience  5000 non-null   flo

Map: 100%|████████████████████████| 5440/5440 [00:00<00:00, 22139.39 examples/s]
Map: 100%|████████████████████████| 1360/1360 [00:00<00:00, 23968.16 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,5.0671
100,4.9175
150,4.6103
200,4.2453
250,3.8918
300,3.5625
350,3.262
400,2.8954
450,2.6159
500,2.3411



[Eval Metrics]: {'eval_loss': 0.012685646302998066, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 27.9893, 'eval_samples_per_second': 48.59, 'eval_steps_per_second': 1.536, 'epoch': 10.0}

Accuracy: 1.0000

Classification Report:
                                 precision    recall  f1-score   support

                         3D Art       1.00      1.00      1.00         6
              AI / Data Science       1.00      1.00      1.00        18
                    AI Research       1.00      1.00      1.00         7
              AR/VR Development       1.00      1.00      1.00         7
                     Accounting       1.00      1.00      1.00         6
              Actuarial Science       1.00      1.00      1.00         6
          Aerospace Engineering       1.00      1.00      1.00         6
                       AgriTech       1.00      1.00      1.00         7
                       Agronomy       1.00      1.00      1.

In [2]:
# === Top-7 plots by loading CSV only (no training) ===
import os, re, numpy as np, pandas as pd
import matplotlib.pyplot as plt

# CSV whose category distribution is plotted by this cell.
DATASET_PATH = "/Users/reemabalharith/Desktop/Capston Project/english_cv_dataset_full_6800_enriched.csv"  # or the Arabic one
# Output directory for this cell's images; created immediately.
PLOTS_DIR = "plots_only"
os.makedirs(PLOTS_DIR, exist_ok=True)

# Candidate label column names, tried in order by find_label_col.
POSSIBLE_LABEL_COLS = ["Category","category","Label","label","target"]

def load_df(path):
    """Read the CSV at ``path`` and lightly clean its ``Cleaned_Resume`` column.

    When a ``Cleaned_Resume`` column exists, HTML-like tags and both escaped
    ("\\r", "\\n") and real CR/LF characters are replaced by a space. Missing
    values become "" rather than the literal text "nan". Other columns, and
    frames without that column, are returned as read.

    Args:
        path: CSV file to load.

    Returns:
        The loaded (and possibly cleaned) DataFrame.
    """
    df = pd.read_csv(path)
    # Simple cleaning.
    if "Cleaned_Resume" in df.columns:
        col = "Cleaned_Resume"
        raw = df[col]
        # Blank out missing cells explicitly; astype(str) alone would
        # typically render them as the string "nan".
        text = raw.astype(str).where(raw.notna(), "")
        text = text.str.replace(r"<[^>]+>", " ", regex=True)
        # regex=False is explicit because the default of `regex` differs
        # between pandas versions, and as a pattern "\\r" would mean a real
        # carriage return rather than the two characters backslash + r.
        for token in ("\\r", "\\n", "\r", "\n"):
            text = text.str.replace(token, " ", regex=False)
        df[col] = text
    return df

def find_label_col(df):
    """Return the first name in POSSIBLE_LABEL_COLS that is a column of ``df``.

    Raises:
        ValueError: If none of the candidate names is present.
    """
    match = next((name for name in POSSIBLE_LABEL_COLS if name in df.columns), None)
    if match is None:
        raise ValueError(f"No label column found. Available: {df.columns.tolist()}")
    return match

def _top_counts(df, label_col, topk=7):
    """Return the ``topk`` largest value counts of ``df[label_col]`` as a Series."""
    frequencies = df[label_col].value_counts()
    return frequencies.nlargest(topk)

def plot_bar_top7(df, label_col, out_path, topk=7):
    """Save a vertical bar chart of the ``topk`` most frequent categories.

    Each bar is annotated with its count just above the bar.
    """
    top = _top_counts(df, label_col, topk)
    positions = np.arange(len(top))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(positions, top.values)
    for x_pos, count in zip(positions, top.values):
        ax.text(x_pos, count, str(count), ha='center', va='bottom', fontsize=10)
    ax.set_title(f"Top {topk} Categories (Bar)")
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    ax.set_xticks(positions)
    ax.set_xticklabels(top.index.tolist(), rotation=45, ha="right")
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

def plot_hbar_top7(df, label_col, out_path, topk=7):
    """Save a horizontal bar chart of the ``topk`` most frequent categories.

    The counts are reversed before plotting so the largest category ends
    up at the top of the chart; each bar is annotated with its count.
    """
    top = _top_counts(df, label_col, topk).iloc[::-1]
    positions = np.arange(len(top))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(positions, top.values)
    for y_pos, count in zip(positions, top.values):
        ax.text(count, y_pos, str(count), va='center', ha='left', fontsize=10)
    ax.set_title(f"Top {topk} Categories (Horizontal Bar)")
    ax.set_xlabel("Count")
    ax.set_ylabel("Category")
    ax.set_yticks(positions)
    ax.set_yticklabels(top.index.tolist())
    ax.grid(axis='x', linestyle='--', alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

def plot_pie_top7(df, label_col, out_path, topk=7):
    """Save a pie chart of the ``topk`` most frequent categories.

    Everything outside the top ``topk`` is merged into an "Others" slice,
    which is added only when it is non-empty.
    """
    frequencies = df[label_col].value_counts()
    head = frequencies.nlargest(topk)
    remainder = frequencies.sum() - head.sum()

    slice_names = head.index.tolist()
    slice_sizes = head.tolist()
    if remainder > 0:
        slice_names.append("Others")
        slice_sizes.append(remainder)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(slice_sizes, labels=slice_names, autopct="%1.1f%%", pctdistance=0.8, startangle=140)
    ax.set_title(f"Category Distribution (Top {topk})")
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

# Load the dataset once and locate its label column.
df = load_df(DATASET_PATH)
label_col = find_label_col(df)

# Write the three top-category charts into PLOTS_DIR.
plot_bar_top7(df, label_col, os.path.join(PLOTS_DIR, "category_bar_top7.png"))
plot_hbar_top7(df, label_col, os.path.join(PLOTS_DIR, "category_hbar_top7.png"))
plot_pie_top7(df, label_col, os.path.join(PLOTS_DIR, "category_pie_top7.png"))
