In [1]:
# -*- coding: utf-8 -*-
"""
Requirements (once):
  pip install arabic-reshaper python-bidi wordcloud matplotlib pandas numpy scikit-learn transformers datasets accelerate torch
"""

import os, re, sys, json, warnings, io
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# ===== Arabic shaping & plotting =====
import matplotlib
import matplotlib.pyplot as plt
import arabic_reshaper
from bidi.algorithm import get_display
from matplotlib import font_manager as fm

def ar_text(s: str) -> str:
    """Apply Arabic letter reshaping and the bidi display algorithm to *s*.

    Non-string input is converted with ``str`` first. The value is passed
    through ``arabic_reshaper.reshape`` and the result through
    ``bidi.algorithm.get_display``; that final string is returned.
    """
    text = s if isinstance(s, str) else str(s)
    reshaped = arabic_reshaper.reshape(text)
    return get_display(reshaped)

# Optional Arabic font file: not required, but Arabic text renders better with it.
FONT_PATH = os.path.abspath("/Users/reemabalharith/Desktop/Capston Project/Amiri-Regular.ttf")
print("Arabic FONT_PATH ->", FONT_PATH, "exists:", os.path.exists(FONT_PATH))

_arabic_font_loaded = False
if os.path.isfile(FONT_PATH):
    try:
        fm.fontManager.addfont(FONT_PATH)
        # Read the family name from the file itself so that pointing FONT_PATH
        # at a different font keeps working.
        matplotlib.rcParams['font.family'] = fm.FontProperties(fname=FONT_PATH).get_name()
        _arabic_font_loaded = True
    except Exception as _font_err:
        # Best effort: plotting still works with the fallback fonts below,
        # but report the problem instead of hiding it.
        print("[warning] could not register font", FONT_PATH, "->", _font_err)

# Fallback fonts. rcParams always contains 'font.family' and 'font.sans-serif',
# so dict.setdefault() would never change them; assign explicitly instead.
if not _arabic_font_loaded:
    matplotlib.rcParams['font.family'] = 'sans-serif'
_preferred_sans = ['Cairo', 'Amiri', 'Noto Naskh Arabic', 'Tahoma', 'Arial', 'DejaVu Sans']
matplotlib.rcParams['font.sans-serif'] = _preferred_sans + [
    _name for _name in matplotlib.rcParams['font.sans-serif'] if _name not in _preferred_sans
]
# Draw the minus sign as an ASCII hyphen rather than the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False

from wordcloud import WordCloud
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)

# =================== Settings ===================
# Path of the CSV dataset; the RESUME_DATASET_PATH_AR environment variable
# overrides the hard-coded default below.
DATASET_PATH = os.environ.get(
    "RESUME_DATASET_PATH_AR",
    "/Users/reemabalharith/Desktop/Capston Project/arabic_cv_dataset_full_6800_enriched.csv"
)
# Seed passed to train_test_split and to TrainingArguments in main().
RANDOM_STATE = 42
# Starting fraction of rows held out for the test split.
TEST_SIZE_DEFAULT = 0.2
# Output directories (created below, relative to the current working directory).
PLOTS_DIR = "plots_ar"
MODEL_DIR  = "llm_model_ar"

# Arabic model that is light on memory
MODEL_NAME  = "asafaya/bert-mini-arabic"
# Tokenizer max_length (inputs are truncated / padded to this many tokens).
MAX_LEN     = 192
# Values forwarded to TrainingArguments in main(): num_train_epochs,
# per_device_train_batch_size, per_device_eval_batch_size, learning_rate
# and gradient_accumulation_steps respectively.
EPOCHS      = 10
BATCH_TRAIN = 2
BATCH_EVAL  = 4
LR          = 2e-5
GRAD_ACCUM  = 4

# Make sure both output directories exist before anything is written to them.
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# ========= Helper functions =========
# Candidate column names, tried in order; the first one present in the
# DataFrame is used as the label column / text column.
POSSIBLE_LABEL_COLS = ["Category", "category", "Label", "label", "target"]
POSSIBLE_TEXT_COLS  = ["Cleaned_Resume", "Resume", "summary", "Summary", "text", "body", "description"]

def حمّل_البيانات(path: str) -> pd.DataFrame:
    """Load the dataset CSV at *path* into a DataFrame.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame, as returned by ``pandas.read_csv``.

    Raises:
        SystemExit: If *path* is not an existing regular file. The message is
            carried by the exception (so it is printed to stderr and the
            process exit status is 1), matching how the other helpers in this
            file report fatal errors.
    """
    # isfile() rather than exists(): a directory at *path* would pass an
    # exists() check and then fail inside read_csv with an unrelated error.
    if not os.path.isfile(path):
        raise SystemExit(f"[خطأ] لم يتم العثور على الملف: {path}")
    return pd.read_csv(path)

def اكتشف_الأعمدة(df: pd.DataFrame):
    """Pick the label column and the text column of *df*.

    The label column is the first name from ``POSSIBLE_LABEL_COLS`` present in
    *df*; the text column is the first name from ``POSSIBLE_TEXT_COLS``
    present in *df*. When no text column exists, the available descriptive
    columns are joined with single spaces into a new ``_COMBINED_TEXT_``
    column, which is added to *df* in place and used as the text column.

    Args:
        df: The loaded dataset. May gain a ``_COMBINED_TEXT_`` column.

    Returns:
        A ``(label_col, text_col)`` tuple of column names.

    Raises:
        SystemExit: If no label column is found, or no text column can be
            found or built.
    """
    label_col = next((c for c in POSSIBLE_LABEL_COLS if c in df.columns), None)
    if label_col is None:
        raise SystemExit(f"[خطأ] لم أجد عمود الفئات. الأعمدة: {df.columns.tolist()}")

    text_col = next((c for c in POSSIBLE_TEXT_COLS if c in df.columns), None)
    if text_col is not None:
        return label_col, text_col

    # No ready-made text column: build one from whatever descriptive fields exist.
    parts = [c for c in [
        "summary","skills","tools","projects","education_degree","education_university",
        "education_year","years_experience","last_company","job_title","city","keywords","name"
    ] if c in df.columns]
    if not parts:
        raise SystemExit("[خطأ] لا يوجد عمود نصي مناسب.")

    fields = df[parts]
    # astype(str) turns a missing cell into the literal text "nan", so a
    # fillna("") applied afterwards has nothing left to fill. Blank the
    # missing cells using the original frame's NaN mask instead.
    as_text = fields.astype(str).where(fields.notna(), "")
    df["_COMBINED_TEXT_"] = as_text.agg(" ".join, axis=1)
    return label_col, "_COMBINED_TEXT_"

def نظّف_النص(text: str) -> str:
    """Normalise one resume text into a single clean line.

    Non-string input becomes ``""`` when it is missing (``pd.isna``) and
    ``str(text)`` otherwise. HTML-like tags are replaced by a space, the
    mojibake bullet sequence is mapped back to "•", both literal ``\\r`` /
    ``\\n`` escape sequences and real CR / LF characters become spaces, and
    runs of whitespace collapse to one space. The result is stripped.
    """
    if isinstance(text, str):
        raw = text
    elif pd.isna(text):
        raw = ""
    else:
        raw = str(text)

    without_tags = re.sub(r"<[^>]+>", " ", raw)
    cleaned = without_tags.replace("â€¢", "•")
    # Escaped sequences first, then the actual control characters.
    for line_break in ("\\r", "\\n", "\r", "\n"):
        cleaned = cleaned.replace(line_break, " ")
    return re.sub(r"\s+", " ", cleaned).strip()

# ======= Plotting helpers (top categories) =======
def _top_counts(df: pd.DataFrame, label_col: str, topk: int = 7) -> pd.Series:
    """Return the *topk* largest value counts of ``df[label_col]``.

    The result is a Series indexed by category value, holding the number of
    rows for each category.
    """
    return df[label_col].value_counts().nlargest(topk)

def ارسم_أعمدة_الفئات(df: pd.DataFrame, label_col: str, out_path: str, topk: int = 7):
    """Save a vertical bar chart of the *topk* most frequent categories.

    Plotting is best-effort: any exception is reported as a warning on stdout
    and swallowed, so a plotting problem does not abort the caller.

    Args:
        df: Dataset containing *label_col*.
        label_col: Name of the category column.
        out_path: Image file the figure is saved to (dpi 200).
        topk: How many of the largest categories to draw.
    """
    fig = None
    try:
        top = _top_counts(df, label_col, topk)
        labels_ar = [ar_text(s) for s in top.index.tolist()]
        positions = np.arange(len(top))
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(positions, top.values)
        ax.set_title(ar_text(f"أعلى {topk} فئات (عمودي)"))
        ax.set_xlabel(ar_text("الفئة"))
        ax.set_ylabel(ar_text("العدد"))
        ax.set_xticks(positions)
        ax.set_xticklabels(labels_ar, rotation=45, ha="right")
        ax.grid(axis='y', linestyle='--', alpha=0.3)
        fig.tight_layout()
        fig.savefig(out_path, dpi=200)
    except Exception as e:
        print("[تحذير] تعذّر الرسم (أعمدة):", e)
    finally:
        # Close even when drawing or saving failed, otherwise the figure
        # stays registered with pyplot and leaks.
        if fig is not None:
            plt.close(fig)

def ارسم_أعمدة_أفقية(df: pd.DataFrame, label_col: str, out_path: str, topk: int = 7):
    """Save a horizontal bar chart of the *topk* most frequent categories.

    The horizontal layout leaves more room for long category names. Plotting
    is best-effort: any exception is reported as a warning on stdout and
    swallowed.

    Args:
        df: Dataset containing *label_col*.
        label_col: Name of the category column.
        out_path: Image file the figure is saved to (dpi 200).
        topk: How many of the largest categories to draw.
    """
    fig = None
    try:
        # Reverse the order so the largest category ends up at the top of
        # the horizontal chart.
        top = _top_counts(df, label_col, topk)[::-1]
        labels_ar = [ar_text(s) for s in top.index.tolist()]
        positions = np.arange(len(top))
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.barh(positions, top.values)
        ax.set_title(ar_text(f"أعلى {topk} فئات (أفقي)"))
        ax.set_xlabel(ar_text("العدد"))
        ax.set_ylabel(ar_text("الفئة"))
        ax.set_yticks(positions)
        ax.set_yticklabels(labels_ar)
        ax.grid(axis='x', linestyle='--', alpha=0.3)
        fig.tight_layout()
        fig.savefig(out_path, dpi=200)
    except Exception as e:
        print("[تحذير] تعذّر الرسم (أفقي):", e)
    finally:
        # Close even when drawing or saving failed, otherwise the figure
        # stays registered with pyplot and leaks.
        if fig is not None:
            plt.close(fig)

def ارسم_دائري_الفئات(df: pd.DataFrame, label_col: str, out_path: str, topk: int = 7):
    """Save a pie chart of the *topk* most frequent categories.

    All remaining categories are summed into one extra slice labelled
    "أخرى" (other), which is added only when that remainder is positive.
    Plotting is best-effort: any exception is reported as a warning on stdout
    and swallowed.

    Args:
        df: Dataset containing *label_col*.
        label_col: Name of the category column.
        out_path: Image file the figure is saved to (dpi 200).
        topk: How many of the largest categories get their own slice.
    """
    fig = None
    try:
        counts = df[label_col].value_counts()
        top = counts.nlargest(topk)
        others = counts.sum() - top.sum()
        labels = [ar_text(s) for s in top.index.tolist()]
        sizes = top.tolist()
        if others > 0:
            labels.append(ar_text("أخرى"))
            sizes.append(others)
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=140, pctdistance=0.8)
        ax.set_title(ar_text(f"توزيع الفئات (أعلى {topk})"))
        fig.tight_layout()
        fig.savefig(out_path, dpi=200)
    except Exception as e:
        print("[تحذير] تعذّر الرسم (دائري):", e)
    finally:
        # Close even when drawing or saving failed, otherwise the figure
        # stays registered with pyplot and leaks.
        if fig is not None:
            plt.close(fig)

def احفظ_جدول_التوب7(df: pd.DataFrame, label_col: str, out_csv_path: str, topk: int = 7):
    """Write the *topk* most frequent categories and their counts to a CSV.

    The file has three columns: ``category`` (the raw label), ``count`` and
    ``category_ar`` (the label passed through ``ar_text``). It is written
    without the index, encoded as UTF-8 with a BOM, and a confirmation line
    is printed afterwards.
    """
    table = _top_counts(df, label_col, topk).reset_index()
    table.columns = ["category", "count"]
    table["category_ar"] = [ar_text(name) for name in table["category"]]
    table.to_csv(out_csv_path, encoding="utf-8-sig", index=False)
    print(f"[Info] حفظ جدول التوب7 -> {out_csv_path}")

def reshape_freq_for_ar(freq_counter):
    """Return a copy of *freq_counter* whose keys went through ``ar_text``.

    Values are kept unchanged. When ``ar_text`` raises for a key, that key is
    replaced by ``str(key)`` instead. Keys that map to the same string
    overwrite one another, the last one winning.
    """
    def _display_key(key):
        # Fall back to the plain string form when shaping fails.
        try:
            return ar_text(key)
        except Exception:
            return str(key)

    return {_display_key(word): count for word, count in freq_counter.items()}


# =================== Main ===================
def _split_train_test(texts: pd.Series, labels: pd.Series):
    """Split *texts* / *labels* into train and test parts.

    The split is stratified by label whenever that is possible, i.e. there is
    more than one class and every class has at least two rows. Otherwise a
    plain shuffled split is used. Returns the four-element list produced by
    ``train_test_split``: ``X_train, X_test, y_train, y_test``.
    """
    class_sizes = labels.value_counts()
    n_classes = len(class_sizes)
    n_samples = len(labels)

    # A stratified split needs at least two members in every class (one for
    # each side); a single-row class would make train_test_split raise.
    use_stratify = n_classes > 1 and int(class_sizes.min()) >= 2
    if n_classes > 1 and not use_stratify:
        print("[INFO] توجد فئة بعينة واحدة فقط؛ سيتم التقسيم بدون stratify")

    test_size = TEST_SIZE_DEFAULT
    if use_stratify:
        # A stratified split also needs room for one row per class in the
        # test part. Since every class has >= 2 rows, the train part always
        # has that room, so only the test part can come up short: grow it to
        # an absolute count of n_classes rows when the default is too small.
        default_test_rows = int(np.ceil(TEST_SIZE_DEFAULT * n_samples))
        if default_test_rows < n_classes:
            test_size = n_classes
            print(f"[INFO] رفع test_size -> {test_size} عينة لضمان test_size ≥ عدد الفئات ({n_classes})")

    return train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=RANDOM_STATE,
        shuffle=True,
        stratify=labels if use_stratify else None,
    )


def main():
    """Run the whole pipeline end to end.

    Steps: load the CSV, drop duplicate rows and rows without a label, clean
    the text, print and plot the category distribution, merge rare categories
    into "Other", split into train/test, fine-tune ``MODEL_NAME`` with the
    Hugging Face ``Trainer``, print evaluation metrics and a classification
    report, and save the model, tokenizer and label list under ``MODEL_DIR``.
    """
    print("[1] تحميل:", DATASET_PATH)
    df = حمّل_البيانات(DATASET_PATH)
    label_col, text_col = اكتشف_الأعمدة(df)

    print("\n[2] إزالة التكرارات…")
    df = df.drop_duplicates().copy()
    # Rows without a label cannot be trained on, and a NaN among string
    # labels would break the sorted() call that builds the label list.
    missing_label = df[label_col].isna()
    if missing_label.any():
        print(f"[INFO] تم حذف {int(missing_label.sum())} صف بدون فئة")
        df = df.loc[~missing_label].copy()

    print("\n[3] تنظيف النصوص…")
    df["Cleaned_Resume"] = df[text_col].apply(نظّف_النص)

    print("\n[4] توزيع الفئات (كامل):")
    print(df[label_col].value_counts())

    # ===== Top-7 plots and table =====
    ارسم_أعمدة_الفئات(df, label_col, os.path.join(PLOTS_DIR, "category_bar_top7.png"), topk=7)
    ارسم_أعمدة_أفقية(df, label_col, os.path.join(PLOTS_DIR, "category_hbar_top7.png"), topk=7)
    ارسم_دائري_الفئات(df, label_col, os.path.join(PLOTS_DIR, "category_pie_top7.png"), topk=7)
    احفظ_جدول_التوب7(df, label_col, os.path.join(PLOTS_DIR, "top7_table.csv"), topk=7)

    # ===== Safe split: fold rare categories into "Other" =====
    X_all = df["Cleaned_Resume"].copy()
    MIN_PER_CLASS = 2
    counts = df[label_col].value_counts()
    rare = set(counts[counts < MIN_PER_CLASS].index)  # set: O(1) membership per row
    y_all = df[label_col].apply(lambda z: "Other" if z in rare else z)

    X_train, X_test, y_train, y_test = _split_train_test(X_all, y_all)

    print("\n[Check] train counts:\n", y_train.value_counts())
    print("\n[Check] test  counts:\n", y_test.value_counts())

    # ===== Hugging Face datasets =====
    # Label ids follow the sorted order of the (merged) label names.
    labels = sorted(pd.Index(y_all.unique()).tolist())
    label2id = {lbl: i for i, lbl in enumerate(labels)}
    id2label = {i: lbl for i, lbl in enumerate(labels)}

    train_df = pd.DataFrame({"text": X_train.values, "label": [label2id[v] for v in y_train.values]})
    test_df  = pd.DataFrame({"text": X_test.values,  "label": [label2id[v] for v in y_test.values]})
    train_ds = Dataset.from_pandas(train_df, preserve_index=False)
    test_ds  = Dataset.from_pandas(test_df,  preserve_index=False)

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tok_fn(batch):
        # Every example is truncated / padded to exactly MAX_LEN tokens.
        return tok(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

    train_tok = train_ds.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
    test_tok  = test_ds.map(tok_fn,  batched=True).remove_columns(["text"]).with_format("torch")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(labels), id2label=id2label, label2id=label2id
    )

    args = TrainingArguments(
        output_dir=os.path.join(MODEL_DIR, "outputs"),
        learning_rate=LR,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=EPOCHS,
        logging_steps=50,
        seed=RANDOM_STATE,
        gradient_accumulation_steps=GRAD_ACCUM,
    )

    def compute_metrics(eval_pred):
        # Accuracy plus support-weighted precision / recall / F1 over label ids.
        logits, y_true_ids = eval_pred
        y_pred_ids = np.argmax(logits, axis=-1)
        acc = accuracy_score(y_true_ids, y_pred_ids)
        p, r, f1, _ = precision_recall_fscore_support(
            y_true_ids, y_pred_ids, average="weighted", zero_division=0
        )
        return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=test_tok,
        compute_metrics=compute_metrics,
    )

    print("\n[7] بدء التدريب…")
    trainer.train()

    print("\n[8] التقييم…")
    eval_res = trainer.evaluate()
    print(eval_res)

    # Per-class report, expressed with the label names instead of the ids.
    preds = np.argmax(trainer.predict(test_tok).predictions, axis=-1)
    y_true = [id2label[i] for i in test_df["label"]]
    y_pred = [id2label[i] for i in preds]
    print(f"\nالدقّة: {accuracy_score(y_true, y_pred):.4f}")
    print("\nتقرير التصنيف:")
    print(classification_report(y_true, y_pred, zero_division=0))

    print("\n[9] حفظ المخرجات")
    model.save_pretrained(MODEL_DIR)
    tok.save_pretrained(MODEL_DIR)
    with open(os.path.join(MODEL_DIR, "labels.json"), "w", encoding="utf-8") as f:
        json.dump({"labels": labels}, f, ensure_ascii=False, indent=2)
    print(f"تم الحفظ -> {MODEL_DIR}")

if __name__ == "__main__":
    main()


Arabic FONT_PATH -> /Users/reemabalharith/Desktop/Capston Project/Amiri-Regular.ttf exists: True
[1] تحميل: /Users/reemabalharith/Desktop/Capston Project/arabic_cv_dataset_full_6800_enriched.csv

[2] إزالة التكرارات…

[3] تنظيف النصوص…

[4] توزيع الفئات (كامل):
Category
أمن سيبراني          122
هندسة برمجيات        122
موارد بشرية          122
تسويق رقمي           122
صيدلة                122
                    ... 
مساعدة طبيب أسنان     32
فني بصريات            32
علاج وظيفي            32
معلم ابتدائي          32
دعم فني                8
Name: count, Length: 167, dtype: int64
[Info] حفظ جدول التوب7 -> plots_ar/top7_table.csv

[Check] train counts:
 Category
موارد بشرية        97
هندسة ميكانيكية    97
أمن سيبراني        97
تسويق رقمي         97
صيدلة              97
                   ..
هندسة صناعية       25
منظومات القدرة     25
نجار               25
لحام               25
دعم فني             6
Name: count, Length: 167, dtype: int64

[Check] test  counts:
 Category
تمريض             

Map: 100%|████████████████████████| 5440/5440 [00:00<00:00, 18967.60 examples/s]
Map: 100%|████████████████████████| 1360/1360 [00:00<00:00, 18903.84 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-mini-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[7] بدء التدريب…


Step,Training Loss
50,5.1425
100,5.098
150,5.0371
200,4.9776
250,4.9367
300,4.8894
350,4.7632
400,4.6958
450,4.6155
500,4.5603



[8] التقييم…


{'eval_loss': 1.5259196758270264, 'eval_accuracy': 0.9830882352941176, 'eval_precision': 0.9716397687280041, 'eval_recall': 0.9830882352941176, 'eval_f1': 0.9762691304719562, 'eval_runtime': 6.4932, 'eval_samples_per_second': 209.45, 'eval_steps_per_second': 52.363, 'epoch': 10.0}

الدقّة: 0.9831

تقرير التصنيف:
                                    precision    recall  f1-score   support

                            DevOps       1.00      1.00      1.00        18
                    Lean Six Sigma       1.00      1.00      1.00         6
                           SEO/SEM       1.00      1.00      1.00         7
                       أحياء دقيقة       1.00      1.00      1.00         7
                              أشعة       1.00      1.00      1.00         7
                    أطعمة ومشروبات       1.00      1.00      1.00         6
                       أمن سيبراني       1.00      1.00      1.00        25
                        أنظمة تحكم       1.00      1.00      1.00         6
 