In [None]:
#Chargement des datasets source
import numpy as np
import requests
import io

base_url = "https://raw.githubusercontent.com/lmuxz/SCDA/master/data/"
suffixes = ["test", "test_label", "train", "train_label"]

# Maps each downloaded file name to the NumPy array it contains.
data_store_source = {}


for suffix in suffixes:
    file_name = f"kaggle_source_cate_0_{suffix}.npy"
    url = f"{base_url}{file_name}"

    # Always pass a timeout (seconds): without one, requests.get() may block
    # indefinitely if the server stops responding.
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        # numpy.load needs a file-like object, hence the BytesIO wrapper
        # around the downloaded bytes.
        data_store_source[file_name] = np.load(io.BytesIO(response.content))
        print(f"Chargé : {file_name} | Forme : {data_store_source[file_name].shape}")
    else:
        # Failed downloads are only reported; the key is then absent from
        # data_store_source.
        print(f"Échec pour {file_name} (Code : {response.status_code})")

# Example access:
# train_data_0 = data_store_source['kaggle_source_cate_0_train.npy']

Chargé : kaggle_source_cate_0_test.npy | Forme : (13686, 51)
Chargé : kaggle_source_cate_0_test_label.npy | Forme : (13686, 2)
Chargé : kaggle_source_cate_0_train.npy | Forme : (41058, 51)
Chargé : kaggle_source_cate_0_train_label.npy | Forme : (41058, 2)


In [None]:
# Passage en dataframe pandas des datasets source :
import pandas as pd

def create_dataframes(data_dict, key_prefix='kaggle_source_cate_0'):
    """Build labelled train/test DataFrames from a dict of NumPy arrays.

    Parameters
    ----------
    data_dict : mapping
        Must contain the keys ``{key_prefix}_train.npy``,
        ``{key_prefix}_train_label.npy``, ``{key_prefix}_test.npy`` and
        ``{key_prefix}_test_label.npy``.
    key_prefix : str, optional
        Common prefix of the four keys. The default reproduces the keys the
        function originally hard-coded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_final, test_final)``. Feature columns are named ``feat_0``,
        ``feat_1``, ... and are followed by one ``label_<j>`` column per
        column of the label array.

    Raises
    ------
    KeyError
        If one of the four expected keys is missing from ``data_dict``.
    """
    import numpy as np  # local import so the function is self-contained

    def _assemble(features, labels):
        # Features first, with explicit column names (feat_0, feat_1, ...).
        feat_df = pd.DataFrame(features).add_prefix('feat_')

        # Name label columns from the actual array width instead of assuming
        # exactly two columns; a 1-D label vector becomes a single column.
        labels = np.asarray(labels)
        labels = labels.reshape(len(labels), -1)
        label_cols = [f'label_{j}' for j in range(labels.shape[1])]
        label_df = pd.DataFrame(labels, columns=label_cols, index=feat_df.index)

        # Horizontal concatenation: feature columns followed by label columns.
        return pd.concat([feat_df, label_df], axis=1)

    # Look the four arrays up in the same order as before (train, then test).
    x_train = data_dict[f'{key_prefix}_train.npy']
    y_train = data_dict[f'{key_prefix}_train_label.npy']

    x_test = data_dict[f'{key_prefix}_test.npy']
    y_test = data_dict[f'{key_prefix}_test_label.npy']

    train_final = _assemble(x_train, y_train)
    test_final = _assemble(x_test, y_test)

    return train_final, test_final

# Build the labelled source train/test frames from the downloaded arrays.
train_df, test_df = create_dataframes(data_store_source)

# Sanity check on the resulting structure.
print(f"Colonnes disponibles : {train_df.columns.tolist()}")
print(f"Forme finale Train : {train_df.shape}")  # expected: (n, n_features + 2)

Colonnes disponibles : ['feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11', 'feat_12', 'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30', 'feat_31', 'feat_32', 'feat_33', 'feat_34', 'feat_35', 'feat_36', 'feat_37', 'feat_38', 'feat_39', 'feat_40', 'feat_41', 'feat_42', 'feat_43', 'feat_44', 'feat_45', 'feat_46', 'feat_47', 'feat_48', 'feat_49', 'feat_50', 'label_0', 'label_1']
Forme finale Train : (41058, 53)


In [None]:
# Source train split: features are every column except the two label
# columns; the target vector is the second label column ('label_1').
X_train_source = train_df.drop(['label_0', 'label_1'], axis=1)
y_train_source = train_df.loc[:, 'label_1']

# Source test split, same convention.
X_test_source = test_df.drop(['label_0', 'label_1'], axis=1)
y_test_source = test_df.loc[:, 'label_1']

In [None]:
##Chargement des datasets cible

base_url = "https://raw.githubusercontent.com/lmuxz/SCDA/master/data/"
# No "train_label" suffix here: only test labels are fetched for the target.
suffixes = ["test", "test_label", "train"]

# Maps each downloaded file name to the NumPy array it contains.
data_store_target = {}

for i in range(4):
    for suffix in suffixes:
        file_name = f"kaggle_target_cate_{i}_{suffix}.npy"
        url = f"{base_url}{file_name}"

        # Always pass a timeout (seconds): without one, requests.get() may
        # block indefinitely if the server stops responding.
        response = requests.get(url, timeout=30)

        if response.status_code == 200:
            # numpy.load needs a file-like object, hence the BytesIO wrapper
            # around the downloaded bytes.
            data_store_target[file_name] = np.load(io.BytesIO(response.content))
            print(f"Chargé : {file_name} | Forme : {data_store_target[file_name].shape}")
        else:
            # Failed downloads are only reported; the key is then absent from
            # data_store_target.
            print(f"Échec pour {file_name} (Code : {response.status_code})")

# Example access:
# train_data_0 = data_store_target['kaggle_target_cate_0_train.npy']

Chargé : kaggle_target_cate_0_test.npy | Forme : (20930, 51)
Chargé : kaggle_target_cate_0_test_label.npy | Forme : (20930, 2)
Chargé : kaggle_target_cate_0_train.npy | Forme : (62788, 51)
Chargé : kaggle_target_cate_1_test.npy | Forme : (20930, 51)
Chargé : kaggle_target_cate_1_test_label.npy | Forme : (20930, 2)
Chargé : kaggle_target_cate_1_train.npy | Forme : (62788, 51)
Chargé : kaggle_target_cate_2_test.npy | Forme : (20929, 51)
Chargé : kaggle_target_cate_2_test_label.npy | Forme : (20929, 2)
Chargé : kaggle_target_cate_2_train.npy | Forme : (62789, 51)
Chargé : kaggle_target_cate_3_test.npy | Forme : (20929, 51)
Chargé : kaggle_target_cate_3_test_label.npy | Forme : (20929, 2)
Chargé : kaggle_target_cate_3_train.npy | Forme : (62789, 51)


In [None]:
 # Passage en dataframe pandas des datasets cible :
# Per-index frames are collected here and concatenated once after the loop.
all_train_target_dfs, all_test_target_dfs = [], []
datasets_target_par_index = {}

for i in range(4):
    print(f"Traitement de l'index {i}...")

    # 1. Fetch the raw arrays for this index (train features, test features,
    #    test labels).
    X_train_raw = data_store_target[f'kaggle_target_cate_{i}_train.npy']
    X_test_raw  = data_store_target[f'kaggle_target_cate_{i}_test.npy']
    y_test_raw  = data_store_target[f'kaggle_target_cate_{i}_test_label.npy']

    # 2. Label column names follow the label array width (label_0, label_1, ...).
    col_labels = [f'label_{j}' for j in range(y_test_raw.shape[1])]

    # 3. Features get a 'feat_' prefix; the test frame additionally carries
    #    the label columns. 'source_index' records which index the rows
    #    came from and is appended as the last column.
    df_train_full = (
        pd.DataFrame(X_train_raw)
        .add_prefix('feat_')
        .assign(source_index=i)
    )
    df_test_full = pd.concat(
        [
            pd.DataFrame(X_test_raw).add_prefix('feat_'),
            pd.DataFrame(y_test_raw, columns=col_labels),
        ],
        axis=1,
    ).assign(source_index=i)

    datasets_target_par_index[i] = {'train': df_train_full, 'test': df_test_full}
    all_train_target_dfs.append(df_train_full)
    all_test_target_dfs.append(df_test_full)

# 4. Stack the per-index frames with a fresh 0..n-1 index.
df_final_train_target = pd.concat(all_train_target_dfs, ignore_index=True)
df_final_test_target  = pd.concat(all_test_target_dfs, ignore_index=True)

print(f"\nTerminé ! Colonnes créées pour les labels : {col_labels}")
print(f"Format final du train : {df_final_train_target.shape}")

Traitement de l'index 0...
Traitement de l'index 1...
Traitement de l'index 2...
Traitement de l'index 3...

Terminé ! Colonnes créées pour les labels : ['label_0', 'label_1']
Format final du train : (251154, 52)


In [None]:
# Target training features: no labels here, only the bookkeeping column
# 'source_index' has to be removed.
X_train_target = df_final_train_target.drop('source_index', axis=1)

# Target test split: features exclude both label columns and 'source_index';
# the evaluation vector is the second label column ('label_1').
X_test_target = df_final_test_target.drop(['label_0', 'label_1', 'source_index'], axis=1)
y_test_target = df_final_test_target.loc[:, 'label_1']

In [None]:
# Preview the first rows. `head` must be *called*: printing the attribute
# itself shows the bound-method repr (and with it the whole frame).
print(df_final_train_target.head())

<bound method NDFrame.head of         feat_0  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  \
0          0.0    44.0     0.0   243.0    10.0     2.0    28.0     1.0   
1          0.0    44.0     1.0   437.0    39.0     4.0    96.0     2.0   
2          0.0    44.0     0.0   450.0    10.0     4.0    96.0     1.0   
3          1.0    44.0     0.0   427.0    65.0     4.0    32.0     2.0   
4          1.0    44.0     0.0   161.0    65.0     2.0    94.0     1.0   
...        ...     ...     ...     ...     ...     ...     ...     ...   
251149     1.0    56.0     0.0   100.0    65.0     4.0    32.0     2.0   
251150     0.0    53.0     0.0   385.0    65.0     2.0    94.0     1.0   
251151     0.0    53.0     0.0   298.0    65.0     2.0    94.0     2.0   
251152     1.0    44.0     0.0   464.0    39.0     4.0    96.0     1.0   
251153     0.0    83.0     0.0   298.0    65.0     2.0    94.0     2.0   

          feat_8    feat_9  ...   feat_42   feat_43   feat_44   feat_45  \
0     

In [None]:
#Procédure UDA
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report

def perform_domain_adaptation(X_source, y_source, X_target, X_eval, y_target_oracle):
    """Train an importance-weighted XGBoost model on source data and report on target data.

    Source samples are re-weighted by the product of a global density-ratio
    estimate (from a source-vs-target domain classifier) and a local k-NN
    term, then used as ``sample_weight`` when fitting the classifier.

    Parameters
    ----------
    X_source : array-like of shape (n_source, n_features)
        Labelled source features used for training.
    y_source : array-like of shape (n_source,)
        Source labels; samples equal to 0 and 1 are counted to derive
        ``scale_pos_weight``.
    X_target : array-like of shape (n_target, n_features)
        Unlabelled target features, used only to compute the weights.
    X_eval : array-like
        Target features on which predictions are made for the report.
    y_target_oracle : array-like
        True labels for ``X_eval``; used only for the printed report.

    Returns
    -------
    XGBClassifier
        The fitted, re-weighted model.

    Raises
    ------
    ValueError
        If ``y_source`` contains no sample equal to 1 (the class ratio would
        be undefined).
    """
    # Plain arrays for the weighting step, so the domain classifier is fitted
    # and queried with the same input type.
    X_src_arr = np.asarray(X_source)
    X_tgt_arr = np.asarray(X_target)
    n_source = len(X_src_arr)

    # --- 1. Global importance weights via a domain classifier ---
    # Binary problem: source rows are labelled 0, target rows 1.
    X_domain = np.vstack([X_src_arr, X_tgt_arr])
    y_domain = np.hstack([np.zeros(n_source), np.ones(len(X_tgt_arr))])

    domain_clf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
    domain_clf.fit(X_domain, y_domain)

    # P(target | x) for every source sample.
    probs = domain_clf.predict_proba(X_src_arr)[:, 1]

    # w(x) = P(target | x) / P(source | x); the epsilon avoids division by zero.
    weights_global = probs / (1 - probs + 1e-6)

    # --- 2. Local refinement via k-NN ---
    # For every target point, find its 5 nearest source points, then count
    # how many times each source point was selected.
    knn_source = NearestNeighbors(n_neighbors=5).fit(X_src_arr)
    _, indices = knn_source.kneighbors(X_tgt_arr)

    # Vectorised count (replaces a Python loop over every target row).
    local_counts = np.bincount(indices.ravel(), minlength=n_source)
    weights_local = local_counts / (np.max(local_counts) + 1e-6)

    # --- 3. Combine and normalise ---
    # The local term boosts the global weight by a factor in [1, 2); the
    # result is rescaled to have mean 1.
    final_weights = weights_global * (1 + weights_local)
    final_weights = final_weights / np.mean(final_weights)

    # --- 4. Re-weighted XGBoost training ---
    # Class-imbalance ratio computed on the source labels. Counting with
    # comparisons works for integer and float labels alike.
    y_src_arr = np.asarray(y_source)
    n_negative = np.sum(y_src_arr == 0)
    n_positive = np.sum(y_src_arr == 1)
    if n_positive == 0:
        raise ValueError("y_source contains no positive (== 1) sample; cannot compute scale_pos_weight")
    ratio = n_negative / n_positive

    model = XGBClassifier(
        learning_rate=0.1,
        max_depth=9,
        scale_pos_weight=ratio,
        random_state=42
    )

    # Fit on the caller's source data, with the adaptation weights applied
    # through sample_weight.
    model.fit(X_source, y_source, sample_weight=final_weights)

    # --- 5. "Oracle" evaluation on the target domain ---
    y_pred_target = model.predict(X_eval)

    print("--- Rapport de Performance Oracle (Domaine Cible) ---")
    print(classification_report(y_target_oracle, y_pred_target))

    return model

# Example call (illustrative only):
# model_adapted = perform_domain_adaptation(X_src, y_src, X_tgt, X_eval, y_tgt_oracle)

In [None]:
# Weights are derived from source-train vs target-train features; the report
# is computed on the target test split.
model_adapted = perform_domain_adaptation(
    X_source=X_train_source,
    y_source=y_train_source,
    X_target=X_train_target,
    X_eval=X_test_target,
    y_target_oracle=y_test_target,
)



--- Rapport de Performance Oracle (Domaine Cible) ---
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     78301
           1       0.44      0.70      0.54      5417

    accuracy                           0.92     83718
   macro avg       0.71      0.82      0.75     83718
weighted avg       0.94      0.92      0.93     83718



In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import recall_score, precision_score, f1_score
import scipy.linalg
# --- Accumulators backing the comparison table (one entry per method) ---
noms_methodes, recalls, precisions, f1_scores = [], [], [], []

# --- Helper that records the scores of one method ---
def evaluer_et_stocker(nom, y_pred, y_true):
    """Append the name and the recall, precision and F1 of one method to the module-level lists.

    Parameters
    ----------
    nom : str
        Label stored in ``noms_methodes``.
    y_pred : array-like
        Predicted labels.
    y_true : array-like
        Reference labels.
    """
    noms_methodes.append(nom)
    # Same order as the table columns: recall, precision, F1.
    scorers = (
        (recalls, recall_score),
        (precisions, precision_score),
        (f1_scores, f1_score),
    )
    for bucket, scorer in scorers:
        bucket.append(scorer(y_true, y_pred))

# ==========================================
# 1. FONCTIONS D'ADAPTATION
# ==========================================

def compute_importance_weights(X_src, X_tgt):
    """Compute per-source-sample importance weights (global x local), normalised to mean 1.

    Parameters
    ----------
    X_src : array-like of shape (n_source, n_features)
        Source features; one weight is returned per row.
    X_tgt : array-like of shape (n_target, n_features)
        Target features.

    Returns
    -------
    numpy.ndarray of shape (n_source,)
        ``w_global * (1 + w_local)`` divided by its mean.
    """
    n_src = len(X_src)

    # Global term: a domain classifier separates source (0) from target (1);
    # the weight is the odds P(target | x) / P(source | x), with an epsilon
    # guarding against division by zero.
    X_dom = np.vstack([X_src, X_tgt])
    y_dom = np.hstack([np.zeros(n_src), np.ones(len(X_tgt))])
    clf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42).fit(X_dom, y_dom)
    probs = clf.predict_proba(X_src)[:, 1]
    w_global = probs / (1 - probs + 1e-6)

    # Local term: for every target point take its 5 nearest source points and
    # count how many times each source point is selected.
    knn = NearestNeighbors(n_neighbors=5).fit(X_src)
    _, indices = knn.kneighbors(X_tgt)
    # Vectorised count (replaces a Python loop over every target row).
    counts = np.bincount(np.asarray(indices).ravel(), minlength=n_src)
    w_local = counts / (np.max(counts) + 1e-6)

    # The local term boosts the global weight by a factor in [1, 2).
    final_w = w_global * (1 + w_local)
    return final_w / np.mean(final_w)

def coral_alignment(X_src, X_tgt):
    """Map source features so their second-order statistics follow the target's.

    The source is centred, multiplied by ``cov_src^{-1/2}`` and then by
    ``cov_tgt^{1/2}`` (each covariance has the identity added to it), and
    finally shifted by the target mean.

    Parameters
    ----------
    X_src : array-like of shape (n_source, n_features)
    X_tgt : array-like of shape (n_target, n_features)

    Returns
    -------
    Transformed source data with the same shape as ``X_src``.
    """
    # Column means of both domains; the target mean is re-used for the
    # final shift.
    mu_src = np.mean(X_src, axis=0)
    mu_tgt = np.mean(X_tgt, axis=0)

    centered_src = X_src - mu_src
    centered_tgt = X_tgt - mu_tgt

    # Feature covariances (columns are variables), each with the identity
    # matrix added.
    cov_s = np.cov(centered_src, rowvar=False) + np.eye(X_src.shape[1])
    cov_t = np.cov(centered_tgt, rowvar=False) + np.eye(X_tgt.shape[1])

    # X_src_new = X_src_centered @ cov_s^{-1/2} @ cov_t^{1/2}
    src_inv_sqrt = scipy.linalg.inv(scipy.linalg.sqrtm(cov_s))
    tgt_sqrt = scipy.linalg.sqrtm(cov_t)

    # sqrtm may return a complex array; only the real part is kept.
    aligned = np.real(centered_src @ src_inv_sqrt @ tgt_sqrt)
    return aligned + mu_tgt

# ==========================================
# 2. PRÉPARATION
# ==========================================

# Work on plain NumPy arrays (no-op for inputs that already are arrays).
X_train_source_np = np.asarray(X_train_source)
X_train_target_np = np.asarray(X_train_target)
X_test_target_np = np.asarray(X_test_target)

# A. UDA components, both derived from source-train vs target-train features.
weights_uda = compute_importance_weights(X_train_source_np, X_train_target_np)
X_train_source_coral = coral_alignment(X_train_source_np, X_train_target_np)

# XGBoost parameters; scale_pos_weight is the negative/positive ratio of the
# source training labels.
params = {
    'learning_rate': 0.1, 'max_depth': 9, 'random_state': 42,
    'scale_pos_weight': np.sum(y_train_source == 0) / np.sum(y_train_source == 1)
}

# ==========================================
# TRAINING AND EVALUATION
# ==========================================
# Each experiment is trained and scored exactly once, so the comparison table
# has one row per method. CORAL maps the *source* training features towards
# the target; the target test set is therefore evaluated as-is and is not
# re-aligned with its own statistics.

# 1. Baseline: raw source features, no sample weights.
m0 = XGBClassifier(**params).fit(X_train_source_np, y_train_source)
evaluer_et_stocker('Baseline', m0.predict(X_test_target_np), y_test_target)

# 2. Re-weighting: raw source features with importance weights w(x).
m1 = XGBClassifier(**params).fit(X_train_source_np, y_train_source, sample_weight=weights_uda)
evaluer_et_stocker('UDA (Re-weighting)', m1.predict(X_test_target_np), y_test_target)

# 3. CORAL: aligned source features, no sample weights.
m2 = XGBClassifier(**params).fit(X_train_source_coral, y_train_source)
evaluer_et_stocker('UDA (CORAL)', m2.predict(X_test_target_np), y_test_target)

# 4. Combined: aligned source features with importance weights.
m3 = XGBClassifier(**params).fit(X_train_source_coral, y_train_source, sample_weight=weights_uda)
evaluer_et_stocker('UDA (Combinée)', m3.predict(X_test_target_np), y_test_target)

# ==========================================
# COMPARISON TABLE
# ==========================================

df_comp = pd.DataFrame({
    'Méthode': noms_methodes,
    'Recall': recalls,
    'Précision': precisions,
    'F1-Score': f1_scores
})

print(df_comp)

              Méthode    Recall  Précision  F1-Score
0            Baseline  0.718663   0.427708  0.536263
1  UDA (Re-weighting)  0.696880   0.442919  0.541607
2         UDA (CORAL)  0.623777   0.543772  0.581033
3      UDA (Combinée)  0.594425   0.526919  0.558640
4         UDA (CORAL)  0.623777   0.543772  0.581033
5      UDA (Combinée)  0.594425   0.526919  0.558640


In [None]:
# Export the comparison table as LaTeX source (row index omitted).
print(df_comp.to_latex(index=False))

SyntaxError: incomplete input (ipython-input-7504277.py, line 1)