Modèle avec faux profils générés sur la base des codes ROME

Importations et S3 au début
Modèle Transformer clair
Fonctions utilitaires S3 centralisées
Génération et upload des faux profils
Chargement des modèles et des profils
Fonction de prédiction hybride
Boucle de prédiction lisible avec affichage complet

In [11]:
# ---------------------------
# 1. Importation des bibliothèques
# ---------------------------
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import math
import os
from dotenv import load_dotenv
import boto3
import io
import tempfile
import joblib

print("Bibliothèques importées")

# ---------------------------
# 2. Connect to Supabase through its S3-compatible API
# ---------------------------
# Try the '.env' next to the working directory first. If it is not there,
# fall back to load_dotenv() without a path, which lets python-dotenv search
# the parent directories (the previous fallback re-read the very same path).
if not load_dotenv('.env'):
    print("Load env from alternative path")
    load_dotenv()

if __debug__:
    # Only the lengths of the credentials are printed, never their values.
    print('Debug ON')
    print(
        "Environment data:",
        "\nS3_ENDPOINT_URL:", os.getenv("S3_ENDPOINT_URL"),
        "\nS3_ACCESS_KEY_ID (len):", len(os.getenv("S3_ACCESS_KEY_ID", "")), "bytes",
        "\nS3_SECRET_ACCESS_KEY (len):", len(os.getenv("S3_SECRET_ACCESS_KEY", "")), "bytes",
        "\nS3_REGION:", os.getenv("S3_REGION")
    )

try:
    s3_client = boto3.client(
        service_name='s3',
        region_name=os.getenv("S3_REGION"),
        endpoint_url=os.getenv("S3_ENDPOINT_URL"),
        aws_access_key_id=os.getenv("S3_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("S3_SECRET_ACCESS_KEY")
    )
except Exception as ex:
    print("Erreur création client S3 :", ex)
    # Every later step needs `s3_client`; continuing would only surface as
    # misleading NameError messages further down, so stop with the real cause.
    raise

# ---------------------------
# 3. Définition du modèle Transformer
# ---------------------------
class JobProfileTransformer(nn.Module):
    """Embed skill sequences and job ids as L2-normalised vectors of size ``emb_dim``.

    Profiles go through a skill embedding, a learned positional table and a
    Transformer encoder, then are mean-pooled. Jobs are a plain embedding
    lookup. Both outputs are normalised so their dot product can be used as
    a similarity score.
    """

    def __init__(self, n_skills, n_jobs, emb_dim=64, n_heads=4, n_layers=2, max_len=88):
        super().__init__()
        # Sub-modules are created in a fixed order (skill_emb, pos_emb,
        # encoder, job_emb) so that seeded initialisation stays reproducible.
        self.skill_emb = nn.Embedding(num_embeddings=n_skills, embedding_dim=emb_dim)
        self.pos_emb = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=emb_dim,
                nhead=n_heads,
                dim_feedforward=256,
                batch_first=True,
            ),
            num_layers=n_layers,
        )
        self.job_emb = nn.Embedding(num_embeddings=n_jobs, embedding_dim=emb_dim)

    def encode_profile(self, skills, weights=None):
        """Return one unit-length vector per row of ``skills``.

        Args:
            skills: 2-D integer tensor of skill indices (rows are profiles).
            weights: optional tensor with the same 2-D shape as ``skills``;
                each token embedding is multiplied by its weight.
        """
        _, seq_len = skills.shape
        table_len = self.pos_emb.size(1)
        if seq_len <= table_len:
            positions = self.pos_emb[:, :seq_len, :]
        else:
            # Sequences longer than the table reuse it cyclically.
            tiles = math.ceil(seq_len / table_len)
            positions = self.pos_emb.repeat(1, tiles, 1)[:, :seq_len, :]

        tokens = self.skill_emb(skills) + positions
        if weights is not None:
            tokens = tokens * weights.unsqueeze(-1)

        # Index 0 is treated as padding by the attention mask.
        # NOTE(review): the vocabulary built in this file starts at 0, so the
        # first skill would be masked too — confirm against the training code.
        padding_mask = skills.eq(0)
        encoded = self.encoder(tokens, src_key_padding_mask=padding_mask)
        # The mean runs over every position, masked ones included.
        pooled = encoded.mean(dim=1)
        return F.normalize(pooled, dim=1)

    def encode_job(self, job_ids):
        """Return the unit-length embedding of each job index in ``job_ids``."""
        return F.normalize(self.job_emb(job_ids), dim=1)

print("Modèle Transformer défini")

# ---------------------------
# 4. Fonctions utilitaires S3
# ---------------------------
def read_csv_from_s3(file_name, bucket_name="dlhybride"):
    """Download ``file_name`` from ``bucket_name`` and parse it as a UTF-8 CSV.

    Every column is read as ``str``. Returns the DataFrame, or ``None`` after
    printing the error if anything goes wrong.
    """
    try:
        s3_object = s3_client.get_object(Bucket=bucket_name, Key=file_name)
        payload = io.BytesIO(s3_object["Body"].read())
        frame = pd.read_csv(payload, encoding="utf-8", dtype=str)
        print(f"Fichier '{file_name}' chargé depuis '{bucket_name}'")
    except Exception as e:
        print(f"Erreur lecture '{file_name}' depuis '{bucket_name}':", e)
        frame = None
    return frame

def upload_df_to_s3(df, file_name="fake_profiles.csv", bucket_name="dlhybride"):
    """Serialise ``df`` as UTF-8 CSV (no index) and store it under ``file_name``.

    An existing object with the same key is deleted before the upload.
    Errors are printed, not raised; the function always returns ``None``.
    """
    try:
        # to_csv() without a target returns the CSV text directly.
        file_bytes = df.to_csv(index=False).encode("utf-8")

        # Restrict the listing to this key: an un-prefixed listing returns a
        # single page of results and can miss the file in a large bucket.
        listing = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=file_name)
        if any(obj['Key'] == file_name for obj in listing.get('Contents', [])):
            s3_client.delete_object(Bucket=bucket_name, Key=file_name)

        s3_client.put_object(
            Bucket=bucket_name,
            Key=file_name,
            Body=file_bytes,
            ContentType="text/csv"
        )
        print(f"Fichier '{file_name}' uploadé dans '{bucket_name}'")
    except Exception as ex:
        print("Erreur upload S3 :", ex)

def load_model_from_s3(file_name="modele_epoch4000.pkl", bucket_name="dlhybride"):
    """Download a joblib-serialised model from S3 and return the loaded object.

    The payload is written to a temporary ``.pkl`` file, loaded with
    ``joblib.load`` and the temporary file is removed afterwards.

    Raises:
        Exception: on any failure, with the original error as ``__cause__``.
    """
    tmp_model_path = None
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=file_name)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as tmp_file:
            tmp_model_path = tmp_file.name
            tmp_file.write(response["Body"].read())
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load models from a bucket you fully control.
        model_loaded = joblib.load(tmp_model_path)
        print(f"Modèle '{file_name}' chargé depuis S3 :", type(model_loaded))
        return model_loaded
    except Exception as e:
        # Single formatted message (not a (msg, exc) tuple) with explicit chaining.
        raise Exception(f"Erreur chargement modèle '{file_name}' : {e}") from e
    finally:
        # The temp file was created with delete=False, so clean it up here.
        if tmp_model_path is not None:
            try:
                os.remove(tmp_model_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask
                # the result or the original error.
                pass

def load_fake_profiles_from_s3(file_name="fake_profiles.csv", bucket_name="dlhybride"):
    """Load the fake-profiles CSV from S3 with every column read as ``str``.

    The ``skill_code`` column, when present, is stripped of surrounding
    whitespace. Returns the DataFrame, or ``None`` (after printing why) when
    the object does not exist or cannot be read.
    """
    try:
        # Restrict the listing to this key: an un-prefixed listing returns a
        # single page of results and can miss the file in a large bucket.
        listing = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=file_name)
        if not any(obj["Key"] == file_name for obj in listing.get("Contents", [])):
            print(f"Le fichier '{file_name}' n'existe pas dans '{bucket_name}'")
            return None

        response = s3_client.get_object(Bucket=bucket_name, Key=file_name)
        df = pd.read_csv(io.BytesIO(response["Body"].read()), dtype=str)
        if "skill_code" in df.columns:
            df["skill_code"] = df["skill_code"].str.strip()
        print(f"Fichier '{file_name}' chargé et nettoyé depuis '{bucket_name}'")
        return df
    except Exception as ex:
        print("Erreur lecture S3 :", ex)
        return None

# ---------------------------
# 5. Chargement des données métiers / compétences
# ---------------------------
df_jobs = read_csv_from_s3("df_competence_rome_eda_v2.csv")
if df_jobs is None:
    raise FileNotFoundError("Impossible de charger df_competence_rome_eda_v2.csv depuis S3")

# Skill codes are compared as stripped strings everywhere below.
df_jobs['code_ogr_competence'] = df_jobs['code_ogr_competence'].astype(str).str.strip()

# Skill code -> integer index, numbered in order of first appearance.
unique_skill_codes = df_jobs['code_ogr_competence'].unique()
skills_vocab = dict(zip(unique_skill_codes, range(len(unique_skill_codes))))

# Skill code -> label, taken from the first row where the code appears.
first_row_per_skill = df_jobs.drop_duplicates('code_ogr_competence')
skill_to_label = dict(zip(
    first_row_per_skill['code_ogr_competence'],
    first_row_per_skill['libelle_competence'],
))

# ROME code -> integer index, numbered in order of first appearance.
unique_rome_codes = df_jobs['code_rome'].unique()
jobs_vocab = dict(zip(unique_rome_codes, range(len(unique_rome_codes))))

# ROME code -> label, taken from the first row where the code appears.
first_row_per_job = df_jobs.drop_duplicates('code_rome')
job_labels = dict(zip(first_row_per_job['code_rome'], first_row_per_job['libelle_rome']))

# ROME code -> set of skill codes attached to that job.
job_to_skills = {
    rome: set(codes)
    for rome, codes in df_jobs.groupby('code_rome')['code_ogr_competence']
}

# ---------------------------
# 6. Génération de faux profils
# ---------------------------
def generate_fake_profile(job_code, min_ratio=0.5, max_ratio=0.9):
    """Draw a random subset of the skills attached to ``job_code``.

    A keep ratio is drawn uniformly in ``[min_ratio, max_ratio]``; at least
    one skill and at most all of them are kept.

    Args:
        job_code: key of the module-level ``job_to_skills`` mapping.
        min_ratio: lower bound of the fraction of skills to keep.
        max_ratio: upper bound of the fraction of skills to keep.

    Returns:
        A list of skill codes without duplicates; empty when the job has no
        skills.

    Raises:
        KeyError: if ``job_code`` is not in ``job_to_skills``.
    """
    # Sort first: set iteration order depends on the hash seed, which would
    # make the sample differ between runs even with a fixed random.seed().
    skills = sorted(job_to_skills[job_code])
    if not skills:
        return []
    keep_ratio = random.uniform(min_ratio, max_ratio)
    # Clamp to [1, len(skills)] so ratios above 1 cannot over-sample.
    n_keep = min(len(skills), max(1, int(len(skills) * keep_ratio)))
    return random.sample(skills, n_keep)

# Draw two random jobs and sample a skill subset for each of them.
fake_profiles_list = []
for _ in range(2):
    job = random.choice(list(job_to_skills.keys()))
    selected_skills = generate_fake_profile(job)
    fake_profiles_list.append(dict(
        job_code=job,
        job_label=job_labels.get(job, "?"),
        skills=selected_skills,
        skills_labels=[skill_to_label.get(s, "?") for s in selected_skills],
    ))

# Flatten to one row per (profile, skill); profile ids start at 1.
rows = [
    {
        "profile_id": profile_number,
        "job_code": profile["job_code"],
        "job_label": profile["job_label"],
        "skill_code": skill,
        "skill_label": label,
    }
    for profile_number, profile in enumerate(fake_profiles_list, start=1)
    for skill, label in zip(profile["skills"], profile["skills_labels"])
]
df_profiles = pd.DataFrame(rows)
upload_df_to_s3(df_profiles, "fake_profiles.csv")

# ---------------------------
# 7. Chargement du modèle et des faux profils
# ---------------------------
# Fetch the trained model, then the profiles that were just uploaded.
model_loaded = load_model_from_s3("modele_epoch4000.pkl")

fake_profiles = load_fake_profiles_from_s3("fake_profiles.csv")
if fake_profiles is None:
    raise FileNotFoundError("Impossible de charger fake_profiles.csv depuis S3")

# Same vocabulary with whitespace-stripped keys, used for profile lookups.
skills_vocab_clean = {}
for raw_code, skill_index in skills_vocab.items():
    skills_vocab_clean[raw_code.strip()] = skill_index

# ---------------------------
# 8. Fonction de prédiction hybride
# ---------------------------
def predict_hybrid(model, input_skills, skills_vocab, job_to_skills, jobs_vocab, job_labels, top_k=3, seuil=0.3, min_overlap=2):
    """Rank jobs for a list of skill codes using model similarity plus skill overlap.

    The score of a job is ``0.3 * dot(profile_vector, job_vector)`` plus
    ``0.7 * overlap / max_overlap``, where ``overlap`` is the number of input
    skills that belong to the job. Jobs with fewer than ``min_overlap`` shared
    skills are discarded.

    Args:
        model: module exposing ``encode_profile(skills, weights)``,
            ``encode_job(job_ids)`` and ``parameters()``.
        input_skills: iterable of skill codes describing the profile.
        skills_vocab: mapping skill code -> embedding index.
        job_to_skills: mapping job code -> collection of skill codes.
        jobs_vocab: mapping job code -> embedding index (expected to cover
            ``0 .. len(jobs_vocab) - 1``).
        job_labels: mapping job code -> human-readable label.
        top_k: maximum number of jobs to report.
        seuil: if the best score is below this value the result is prefixed
            with an "Indéfini" line.
        min_overlap: minimum number of shared skills for a job to be kept.

    Returns:
        A string with one ``"<code> - <label> - <score>%"`` line per job, or
        an "Indéfini (...)" message when nothing can be ranked.
    """
    device = next(model.parameters()).device
    ids = [skills_vocab[s] for s in input_skills if s in skills_vocab]
    if not ids:
        return "Indéfini (aucune compétence reconnue)"

    # Order job codes by their vocabulary index so that position i always
    # refers to embedding row i, whatever the dict's insertion order is.
    job_codes = sorted(jobs_vocab, key=jobs_vocab.get)

    skills_tensor = torch.tensor(ids).unsqueeze(0).to(device)
    weights = torch.ones(1, len(ids), dtype=torch.float, device=device)
    all_jobs = torch.arange(len(job_codes), device=device)

    # Inference only: disable dropout and gradient tracking, then restore
    # whatever train/eval mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            v_p = model.encode_profile(skills_tensor, weights)
            v_j = model.encode_job(all_jobs)
    finally:
        model.train(was_training)

    # Model similarity between the profile and every job.
    scores_dl = (v_p @ v_j.T).squeeze(0)

    # Overlap heuristic: number of input skills shared with each job.
    input_set = set(input_skills)
    overlap_counts = [len(input_set.intersection(job_to_skills.get(code, ()))) for code in job_codes]
    overlap_scores = torch.tensor(overlap_counts, dtype=torch.long, device=device)
    max_overlap = max(1, max(overlap_counts, default=0))  # avoid division by zero

    combined_scores = 0.3 * scores_dl + 0.7 * (overlap_scores / max_overlap)

    mask = overlap_scores >= min_overlap
    filtered_indices = torch.nonzero(mask, as_tuple=True)[0]
    filtered_scores = combined_scores[mask]

    if filtered_scores.numel() == 0:
        return "Indéfini (aucune compétence ne passe le filtre)"

    best_scores, best_idx = filtered_scores.topk(min(top_k, filtered_scores.numel()))
    best_jobs = [job_codes[i] for i in filtered_indices[best_idx].tolist()]

    lines = [
        f"{rome} - {job_labels.get(rome, '?')} - {round(score * 100, 1)}%"
        for rome, score in zip(best_jobs, best_scores.tolist())
    ]

    if best_scores[0] < seuil:
        return "Indéfini\n" + "\n".join(lines)

    return "\n".join(lines)
# ---------------------------
# 9. Prediction loop over the profiles
# ---------------------------
# (The earlier copy of this loop, kept as a triple-quoted string, was dead
# code and has been removed.)
has_expected_job = 'job_code' in fake_profiles.columns

# groupby(sort=False) visits profiles in order of first appearance and avoids
# re-filtering the whole frame once per profile.
for profile_id, subset in fake_profiles.groupby('profile_id', sort=False):
    user_skills = subset['skill_code'].tolist()
    expected_job = subset['job_code'].iloc[0] if has_expected_job else None

    recognized_skills = [s for s in user_skills if s in skills_vocab_clean]
    unrecognized_skills = [s for s in user_skills if s not in skills_vocab_clean]
    recognized_set = set(recognized_skills)

    print("="*60)
    print(f"Profil {profile_id} → compétences ({len(user_skills)}):")
    for code in recognized_skills:
        print(f" • {code} - {skill_to_label.get(code,'?')}")
    if unrecognized_skills:
        print(f"Compétences non reconnues ({len(unrecognized_skills)}): {unrecognized_skills}")
    if expected_job:
        print(f"\nMétier attendu : {expected_job} - {job_labels.get(expected_job,'?')}")

    print("\nTop-3 métiers proposés :")

    # Only the model call is guarded: one failing profile must not stop the
    # others, but display errors should not be hidden.
    try:
        prediction = predict_hybrid(
            model_loaded, recognized_skills, skills_vocab_clean,
            job_to_skills, jobs_vocab, job_labels, top_k=3
        )
    except Exception as e:
        print(f"Erreur lors de la prédiction : {e}")
        continue

    # predict_hybrid returns one newline-separated string; accept a list too.
    if isinstance(prediction, str):
        prediction_lines = prediction.strip().split("\n")
    else:
        prediction_lines = prediction

    for line in prediction_lines:
        if not line.strip():
            continue  # skip empty lines
        job_code = line.split(" - ")[0].strip()
        if job_code not in job_to_skills:
            # Status line such as "Indéfini ...": it is not a job, so do not
            # report a meaningless skill-overlap count for it.
            print(line)
            continue

        common_skills = recognized_set & set(job_to_skills[job_code])
        print(f"{line} | corrélation = {len(common_skills)} compétences")
        if common_skills:
            print("   Compétences en commun :")
            for cs in sorted(f"{c} - {skill_to_label.get(c, '?')}" for c in common_skills):
                print(f"     • {cs}")

print("\nPrédictions terminées\n")







Bibliothèques importées
Debug ON
Environment data: 
S3_ENDPOINT_URL: https://bhckzdwrhhfaxbidmwpm.supabase.co/storage/v1/s3 
S3_ACCESS_KEY_ID (len): 32 bytes 
S3_SECRET_ACCESS_KEY (len): 64 bytes 
S3_REGION: eu-west-3
Modèle Transformer défini
Fichier 'df_competence_rome_eda_v2.csv' chargé depuis 'dlhybride'
Fichier 'fake_profiles.csv' uploadé dans 'dlhybride'
Modèle 'modele_epoch4000.pkl' chargé depuis S3 : <class '__main__.JobProfileTransformer'>
Fichier 'fake_profiles.csv' chargé et nettoyé depuis 'dlhybride'
Profil 1 → compétences (24):
 • 117585 - Contrôler les conditions de stockage des produits
 • 404877 - Optimiser l'utilisation des ressources en eau
 • 482836 - Assurer la traçabilité des produits
 • 123046 - Surveiller l'état d'une plantation
 • 482927 - Assurer le suivi des commandes et la gestion des stocks
 • 483520 - Appliquer les normes d'hygiène et de sécurité alimentaire
 • 503014 - Sélectionner et appliquer des engrais organiques
 • 100256 - Dispenser les soins prévent