In [4]:
# -*- coding: utf-8 -*-
"""
Requirements (once):
  pip install arabic-reshaper python-bidi wordcloud matplotlib pandas numpy scikit-learn transformers datasets accelerate torch
"""

import os, re, sys, json, warnings, io
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# ===== Arabic shaping & plotting =====
import matplotlib
import matplotlib.pyplot as plt
import arabic_reshaper
from bidi.algorithm import get_display

def ar_text(s: str) -> str:
    """Return *s* prepared for display by renderers without Arabic support.

    Non-string input is converted with ``str()`` first. The text is then
    passed through ``arabic_reshaper.reshape`` and the result through
    ``bidi.algorithm.get_display``.
    """
    raw = s if isinstance(s, str) else str(s)
    shaped = arabic_reshaper.reshape(raw)
    return get_display(shaped)

# NOTE: set this to the Arabic-capable font file you want to use.
FONT_PATH = os.path.abspath("/Users/reemabalharith/Desktop/Capston Project/Amiri-Regular.ttf")  # <-- change this if you use a different font
print("Arabic FONT_PATH ->", FONT_PATH, "exists:", os.path.exists(FONT_PATH))

# Force matplotlib onto a font that supports Arabic (titles and axes).
_font_families = ['Cairo', 'Amiri', 'Noto Naskh Arabic', 'Tahoma', 'Arial', 'DejaVu Sans']
if os.path.isfile(FONT_PATH):
    # Family names in rcParams only resolve to fonts matplotlib already knows
    # about. Register the file at FONT_PATH explicitly so it can be used even
    # when the font is not installed system-wide.
    from matplotlib import font_manager
    try:
        font_manager.fontManager.addfont(FONT_PATH)
        _font_name = font_manager.FontProperties(fname=FONT_PATH).get_name()
    except (OSError, RuntimeError) as e:
        # Best effort: an unreadable font file must not stop the pipeline.
        print("[تحذير] تعذّر تسجيل الخط:", e)
    else:
        # Keep the existing priority order; only add the family if it is new.
        if _font_name not in _font_families:
            _font_families.insert(0, _font_name)
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = _font_families
# Draw minus signs as an ASCII hyphen instead of U+2212, which the selected
# font may not provide.
matplotlib.rcParams['axes.unicode_minus'] = False

from wordcloud import WordCloud
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)

# =================== Settings ===================
# Input CSV; the RESUME_DATASET_PATH_AR environment variable overrides the default.
DATASET_PATH = os.getenv(
    "RESUME_DATASET_PATH_AR",
    default="/Users/reemabalharith/Desktop/Capston Project/arabic_cv_dataset_300.csv",
)
RANDOM_STATE = 42          # seed for the train/test split and the Trainer
TEST_SIZE_DEFAULT = 0.2    # fraction of rows held out for testing
PLOTS_DIR = "plots_ar"     # where the class-distribution plots are written
MODEL_DIR  = "llm_model_ar"  # where the model, tokenizer and labels.json go

# Memory-light Arabic model.
MODEL_NAME  = "asafaya/bert-mini-arabic"
MAX_LEN     = 192   # tokenizer max_length (inputs are truncated/padded to it)
EPOCHS      = 4     # num_train_epochs
BATCH_TRAIN = 2     # per-device train batch size
BATCH_EVAL  = 4     # per-device eval batch size
LR          = 2e-5  # learning rate
GRAD_ACCUM  = 4     # gradient accumulation steps

# Make sure both output directories exist before anything writes to them.
for _output_dir in (PLOTS_DIR, MODEL_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# ========= Helper functions =========
# Candidate column names, in priority order: the first name that exists in the
# DataFrame is the one that gets used.
POSSIBLE_LABEL_COLS = [
    "Category",
    "category",
    "Label",
    "label",
    "target",
]
POSSIBLE_TEXT_COLS = [
    "Cleaned_Resume",
    "Resume",
    "summary",
    "Summary",
    "text",
    "body",
    "description",
]

def حمّل_البيانات(path: str) -> pd.DataFrame:
    """Read the CSV file at *path* into a DataFrame.

    If *path* does not exist, an error message is printed and the process
    is terminated through ``sys.exit(1)``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    print(f"[خطأ] لم يتم العثور على الملف: {path}")
    sys.exit(1)

def اكتشف_الأعمدة(df: pd.DataFrame, label_candidates=None, text_candidates=None):
    """Pick the label column and the text column of *df*.

    Args:
        df: The loaded dataset. When no ready-made text column is found, a new
            ``_COMBINED_TEXT_`` column is added to it in place.
        label_candidates: Column names to try for the label, in priority
            order. Defaults to ``POSSIBLE_LABEL_COLS``.
        text_candidates: Column names to try for the text, in priority order.
            Defaults to ``POSSIBLE_TEXT_COLS``.

    Returns:
        A ``(label_col, text_col)`` tuple of column names.

    Raises:
        SystemExit: If no label column exists, or if neither a text column nor
            any of the fallback columns exists.
    """
    if label_candidates is None:
        label_candidates = POSSIBLE_LABEL_COLS
    if text_candidates is None:
        text_candidates = POSSIBLE_TEXT_COLS

    label_col = next((c for c in label_candidates if c in df.columns), None)
    if label_col is None:
        raise SystemExit(f"[خطأ] لم أجد عمود الفئات. الأعمدة: {df.columns.tolist()}")

    text_col = next((c for c in text_candidates if c in df.columns), None)
    if text_col is not None:
        return label_col, text_col

    # No single text column: build one from whichever descriptive columns exist.
    parts = [c for c in [
        "summary","skills","tools","projects","education_degree","education_university",
        "education_year","years_experience","last_company","job_title","city","keywords","name"
    ] if c in df.columns]
    if not parts:
        raise SystemExit("[خطأ] لا يوجد عمود نصي مناسب.")

    def _cell_to_text(value):
        # Missing cells must become "" BEFORE stringification; converting
        # first would turn NaN/None into the literal tokens "nan"/"None".
        return "" if pd.isna(value) else str(value)

    as_text = df[parts].apply(lambda col: col.map(_cell_to_text))
    df["_COMBINED_TEXT_"] = as_text.agg(" ".join, axis=1)
    return label_col, "_COMBINED_TEXT_"

def نظّف_النص(text: str) -> str:
    """Normalise one résumé text into a single whitespace-collapsed line.

    Missing values become ``""`` and other non-string values are converted
    with ``str()``. Markup tags are replaced by spaces, the mojibake bullet
    sequence is repaired, real and escaped CR/LF become spaces, and runs of
    whitespace are collapsed before the result is stripped.
    """
    if isinstance(text, str):
        cleaned = text
    elif pd.isna(text):
        cleaned = ""
    else:
        cleaned = str(text)

    # Drop anything that looks like an HTML/XML tag.
    cleaned = re.sub(r"<[^>]+>", " ", cleaned)

    # Ordered literal substitutions: mojibake bullet, then escaped and real
    # line breaks.
    substitutions = (
        ("â€¢", "•"),
        ("\\r", " "),
        ("\\n", " "),
        ("\r", " "),
        ("\n", " "),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    return re.sub(r"\s+", " ", cleaned).strip()

# ======= دوال الرسم (مع ar_text) =======
def ارسم_أعمدة_الفئات(df: pd.DataFrame, label_col: str, out_path: str):
    """Save a bar chart of the number of rows per class to *out_path*.

    Plotting is best effort: any failure is reported as a warning on stdout
    and the function returns normally.

    Args:
        df: Dataset containing *label_col*.
        label_col: Name of the class column to count.
        out_path: Destination image path.
    """
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(12, 6))
        counts = df[label_col].value_counts()
        counts.plot(kind="bar", color="#38bdf8", ax=ax)
        ax.set_title(ar_text("عدد السير الذاتية لكل فئة"))
        ax.set_xlabel(ar_text("الفئة"))
        ax.set_ylabel(ar_text("العدد"))
        # Tick labels are class names, so they need Arabic shaping as well.
        ax.set_xticklabels([ar_text(t.get_text()) for t in ax.get_xticklabels()], rotation=45, ha="right")
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    except Exception as e:
        print("[تحذير] تعذّر الرسم (أعمدة):", e)
    finally:
        # Close the figure on the error path too, so a failed plot does not
        # leave an open figure behind.
        if fig is not None:
            plt.close(fig)

def ارسم_دائري_الفئات(df: pd.DataFrame, label_col: str, out_path: str, topk=7):
    """Save a pie chart of the *topk* most frequent classes to *out_path*.

    Classes outside the top *topk* are merged into one "أخرى" (other) slice.
    That slice is added only when it is non-empty. Plotting is best effort:
    any failure is reported as a warning on stdout and the function returns
    normally.

    Args:
        df: Dataset containing *label_col*.
        label_col: Name of the class column to count.
        out_path: Destination image path.
        topk: Number of largest classes that get their own slice.
    """
    fig = None
    try:
        counts = df[label_col].value_counts()
        top = counts.nlargest(topk)
        others = counts.sum() - top.sum()
        labels = [ar_text(s) for s in top.index.tolist()]
        sizes = top.tolist()
        # With topk or fewer classes nothing is left over; a zero-size slice
        # would still be drawn with its label and a "0.0%" annotation.
        if others > 0:
            labels.append(ar_text("أخرى"))
            sizes.append(others)
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=140)
        ax.set_title(ar_text(f"توزيع الفئات (أعلى {topk})"))
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    except Exception as e:
        print("[تحذير] تعذّر الرسم (دائري):", e)
    finally:
        # Close the figure on the error path too, so a failed plot does not
        # leave an open figure behind.
        if fig is not None:
            plt.close(fig)

def reshape_freq_for_ar(freq_counter):
    """Return a copy of *freq_counter* whose keys went through ``ar_text``.

    Values are kept unchanged. A key for which ``ar_text`` raises is kept as
    ``str(key)`` instead. If two keys map to the same result, the later one
    wins.
    """
    def _display_key(token):
        try:
            return ar_text(token)
        except Exception:
            return str(token)

    return {_display_key(token): count for token, count in freq_counter.items()}


# =================== Main ===================
def main():
    """Run the pipeline: load, clean, plot, split, fine-tune, evaluate, save.

    Reads the CSV at ``DATASET_PATH``, writes two class-distribution plots to
    ``PLOTS_DIR``, fine-tunes ``MODEL_NAME`` as a sequence classifier on the
    cleaned text, prints evaluation metrics, and saves the model, the
    tokenizer and a ``labels.json`` file to ``MODEL_DIR``.
    """
    print("[1] تحميل:", DATASET_PATH)
    df = حمّل_البيانات(DATASET_PATH)
    label_col, text_col = اكتشف_الأعمدة(df)

    print("\n[2] إزالة التكرارات…")
    df = df.drop_duplicates().copy()

    print("\n[3] تنظيف النصوص…")
    df["Cleaned_Resume"] = df[text_col].apply(نظّف_النص)

    print("\n[4] توزيع الفئات:")
    print(df[label_col].value_counts())
    ارسم_أعمدة_الفئات(df, label_col, os.path.join(PLOTS_DIR, "category_bar.png"))
    ارسم_دائري_الفئات(df, label_col, os.path.join(PLOTS_DIR, "category_pie.png"))

    # ====== Safe split: merge rare classes into "Other", adjust test size ======
    X_all = df["Cleaned_Resume"].copy()
    MIN_PER_CLASS = 2
    counts = df[label_col].value_counts()
    rare = set(counts[counts < MIN_PER_CLASS].index)
    y_all = df[label_col].apply(lambda z: "Other" if z in rare else z)

    labels = sorted(pd.Index(y_all.unique()).tolist())
    n_classes = len(labels)
    N = len(y_all)
    test_size = TEST_SIZE_DEFAULT

    # A stratified split needs at least two samples in every class (one per
    # side). Merging does not guarantee that: a single singleton class turns
    # into an "Other" class that still has only one member.
    use_stratify = n_classes > 1 and y_all.value_counts().min() >= 2
    if use_stratify:
        # scikit-learn also requires the test set to hold at least one sample
        # per class. With >= 2 samples per class, N >= 2 * n_classes, so
        # raising the test set to n_classes samples still leaves at least
        # n_classes samples for training.
        if int(np.ceil(test_size * N)) < n_classes:
            test_size = n_classes  # an int is an absolute number of test rows
            print(f"[INFO] رفع حجم مجموعة الاختبار إلى {test_size} لضمان عيّنة واحدة على الأقل من كل فئة ({n_classes})")
    elif n_classes > 1:
        print("[INFO] تعذّر التقسيم الطبقي (توجد فئة بأقل من عيّنتين)؛ سيُستخدم تقسيم عشوائي.")

    X_train, X_test, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=test_size,
        random_state=RANDOM_STATE,
        shuffle=True,
        stratify=y_all if use_stratify else None,
    )

    print("\n[Check] train counts:\n", y_train.value_counts())
    print("\n[Check] test  counts:\n", y_test.value_counts())

    # ====== HF datasets ======
    label2id = {lbl: i for i, lbl in enumerate(labels)}
    id2label = {i: lbl for i, lbl in enumerate(labels)}

    train_df = pd.DataFrame({"text": X_train.values, "label": [label2id[v] for v in y_train.values]})
    test_df  = pd.DataFrame({"text": X_test.values,  "label": [label2id[v] for v in y_test.values]})
    train_ds = Dataset.from_pandas(train_df, preserve_index=False)
    test_ds  = Dataset.from_pandas(test_df,  preserve_index=False)

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    def tok_fn(batch):
        return tok(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

    train_tok = train_ds.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
    test_tok  = test_ds.map(tok_fn,  batched=True).remove_columns(["text"]).with_format("torch")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(labels), id2label=id2label, label2id=label2id
    )

    args = TrainingArguments(
        output_dir=os.path.join(MODEL_DIR, "outputs"),
        learning_rate=LR,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=EPOCHS,
        logging_steps=50,
        seed=RANDOM_STATE,
        gradient_accumulation_steps=GRAD_ACCUM,
    )

    def compute_metrics(eval_pred):
        # Weighted averages, so each class counts in proportion to its support.
        logits, y_true_ids = eval_pred
        y_pred_ids = np.argmax(logits, axis=-1)
        acc = accuracy_score(y_true_ids, y_pred_ids)
        p, r, f1, _ = precision_recall_fscore_support(y_true_ids, y_pred_ids, average="weighted", zero_division=0)
        return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=test_tok,
        compute_metrics=compute_metrics,
    )

    print("\n[7] بدء التدريب…")
    trainer.train()

    print("\n[8] التقييم…")
    eval_res = trainer.evaluate()
    print(eval_res)

    # Map predicted ids back to class names for a readable per-class report.
    preds = np.argmax(trainer.predict(test_tok).predictions, axis=-1)
    y_true = [id2label[i] for i in test_df["label"]]
    y_pred = [id2label[i] for i in preds]
    print(f"\nالدقّة: {accuracy_score(y_true, y_pred):.4f}")
    print("\nتقرير التصنيف:")
    print(classification_report(y_true, y_pred, zero_division=0))

    print("\n[9] حفظ المخرجات")
    model.save_pretrained(MODEL_DIR)
    tok.save_pretrained(MODEL_DIR)
    with open(os.path.join(MODEL_DIR, "labels.json"), "w", encoding="utf-8") as f:
        json.dump({"labels": labels}, f, ensure_ascii=False, indent=2)
    print(f"تم الحفظ -> {MODEL_DIR}")

# Run the pipeline only when this file is executed as a script (or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Arabic FONT_PATH -> /Users/reemabalharith/Desktop/Capston Project/Amiri-Regular.ttf exists: True
[1] تحميل: /Users/reemabalharith/Desktop/Capston Project/arabic_cv_dataset_300.csv

[2] إزالة التكرارات…

[3] تنظيف النصوص…

[4] توزيع الفئات:
Category
تسويق رقمي                   39
هندسة برمجيات                39
ذكاء اصطناعي / علم بيانات    37
مالية / محاسبة               30
شبكات                        30
مبيعات                       30
أمن سيبراني                  29
موارد بشرية                  24
DevOps                       22
إدارة مشاريع                 20
Name: count, dtype: int64

[Check] train counts:
 Category
تسويق رقمي                   31
هندسة برمجيات                31
ذكاء اصطناعي / علم بيانات    30
مبيعات                       24
مالية / محاسبة               24
شبكات                        24
أمن سيبراني                  23
موارد بشرية                  19
DevOps                       18
إدارة مشاريع                 16
Name: count, dtype: int64

[Check] test  counts:
 Ca

Map: 100%|██████████████████████████| 240/240 [00:00<00:00, 17332.43 examples/s]
Map: 100%|████████████████████████████| 60/60 [00:00<00:00, 14378.01 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-mini-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[7] بدء التدريب…


Step,Training Loss
50,2.2152
100,1.981



[8] التقييم…


{'eval_loss': 1.811218023300171, 'eval_accuracy': 0.95, 'eval_precision': 0.9601851851851853, 'eval_recall': 0.95, 'eval_f1': 0.9479469060351413, 'eval_runtime': 0.2488, 'eval_samples_per_second': 241.2, 'eval_steps_per_second': 60.3, 'epoch': 4.0}

الدقّة: 0.9500

تقرير التصنيف:
                           precision    recall  f1-score   support

                   DevOps       1.00      1.00      1.00         4
              أمن سيبراني       1.00      0.83      0.91         6
             إدارة مشاريع       1.00      1.00      1.00         4
               تسويق رقمي       0.89      1.00      0.94         8
ذكاء اصطناعي / علم بيانات       1.00      1.00      1.00         7
                    شبكات       1.00      1.00      1.00         6
           مالية / محاسبة       1.00      1.00      1.00         6
                   مبيعات       0.75      1.00      0.86         6
              موارد بشرية       1.00      0.60      0.75         5
            هندسة برمجيات       1.00      1.00  