In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlops_rakuten.config import RAW_DATA_DIR

In [None]:
# Inventory of the raw CSV files, each with its size in megabytes.
print("Fichiers disponibles:")
print("*****************")
print("Fichiers texte :")
# A missing data directory simply yields an empty listing.
csv_paths = sorted(RAW_DATA_DIR.glob('*.csv')) if RAW_DATA_DIR.exists() else []
for csv_path in csv_paths:
    megabytes = csv_path.stat().st_size / 1024 ** 2
    print(f"  {csv_path.name}: {megabytes:.1f} MB")
print("*****************")

In [None]:
# Load the training features and their labels from the raw data directory.
X_train = pd.read_csv(RAW_DATA_DIR / 'X_train_update.csv')
Y_train = pd.read_csv(RAW_DATA_DIR / 'Y_train_CVw08PX.csv')


# Named frames, so the schema report below can loop over them.
dfs = {
    "X_train": X_train,
    "Y_train": Y_train,
}

print(f"Train: {X_train.shape}")

# Report the NUMBER of distinct classes: `.unique()` would print the whole
# array of codes in front of the word "catégories".
print(f" Classes: {Y_train['prdtypecode'].nunique()} catégories\n")

# Overview of the data schema
print("Schéma des données:")
print(X_train.columns)
print(Y_train.columns)

# Column name and dtype for every loaded frame
for name, df in dfs.items():
    print(f"\n{name}:")
    for col in df.columns:
        print(f"  - {col}: {df[col].dtype}")

In [None]:
# Join features and labels on the first column of each frame
# (per the original note, that first column is the index).
feature_key = X_train.columns[0]
label_key = Y_train.columns[0]
train_data = pd.merge(X_train, Y_train, left_on=feature_key, right_on=label_key)

# Report the dimensions of the merged frame
print(f"train_data: {train_data.shape}")

# Preview the first rows (last expression, rendered by the notebook)
train_data.head()


In [None]:
# Number of samples per product-type code, most frequent first
class_counts = (
    train_data
    .groupby('prdtypecode')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

print(f" Nombre de classes uniques: {len(class_counts)}")
print("Distribution:")
print(class_counts.head(15))

# Basic statistics over the per-class sample counts
counts = class_counts['count'].to_numpy()
smallest, largest = counts.min(), counts.max()
imbalance_ratio = largest / smallest
print("\n Statistiques classes:")
print(f"  Min samples: {smallest}")
print(f"  Max samples: {largest}")
print(f"  Mean: {counts.mean():.1f}")
print(f"  Median: {np.median(counts):.1f}")
print(f"  Std: {counts.std():.1f}")
print(f"  Ratio max/min: {imbalance_ratio:.1f}x")

# Flag the dataset when the largest class is more than 10x the smallest
if imbalance_ratio > 10:
    print("\n  DATASET TRÈS DÉSÉQUILIBRÉ")

In [None]:
# Two side-by-side bar charts of the class distribution
fig, (ax_all, ax_top) = plt.subplots(1, 2, figsize=(16, 5))
label_font = {"fontsize": 12}
title_font = {"fontsize": 14, "fontweight": "bold"}

# Left panel: every class, in the row order of class_counts
ax_all.bar(np.arange(len(class_counts)), class_counts['count'])
ax_all.set_xlabel('Classes (triées par fréquence)', **label_font)
ax_all.set_ylabel('Nombre de samples', **label_font)
ax_all.set_title('Distribution de toutes les classes', **title_font)
ax_all.grid(axis='y', alpha=0.3)

# Right panel: the first 15 rows of class_counts as horizontal bars
top_15 = class_counts.head(15)
bar_colors = plt.cm.viridis(np.linspace(0, 1, len(top_15)))
ax_top.barh(top_15['prdtypecode'].astype(str), top_15['count'], color=bar_colors)
ax_top.set_xlabel('Nombre de samples', **label_font)
ax_top.set_ylabel('Code produit', **label_font)
ax_top.set_title('Top 15 classes les plus fréquentes', **title_font)
# Put the first row at the top of the horizontal chart
ax_top.invert_yaxis()
ax_top.grid(axis='x', alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
def analyse_colonnes(df, colonnes):
    """Print length statistics, draw a histogram and show samples for columns.

    For every column name in ``colonnes`` this prints the percentage of
    missing values, descriptive statistics of the length (``len``) of the
    non-missing values, shows a histogram of those lengths, and prints the
    first three non-missing values.

    Args:
        df: DataFrame holding the columns to analyse. It is only read,
            never copied or modified.
        colonnes: Iterable of column names of ``df``. Non-missing values
            must support ``len()``.

    Returns:
        None. All results are printed or plotted.
    """
    for col in colonnes:
        print("=" * 80)
        print(f"STATISTIQUES - {col}")
        print("=" * 80)

        # Work on the single column: no need to copy the whole frame for
        # every analysed column.
        values = df[col]
        present = values.dropna()

        # Share of missing values
        missing_pct = values.isnull().mean() * 100
        print(f"{col} manquantes: {missing_pct:.1f}%")

        # Lengths are computed on the non-missing values only, so they stay
        # integers instead of being upcast to float by NaN alignment.
        col_len = present.apply(len).to_numpy()

        if len(col_len) > 0:
            print(f"\n Stats longueur {col}:")
            print(f"  Count: {len(col_len)}")
            print(f"  Min: {col_len.min()}")
            print(f"  Max: {col_len.max()}")
            print(f"  Mean: {col_len.mean():.1f}")
            print(f"  Median: {np.median(col_len):.1f}")
            print(f"  Std: {col_len.std():.1f}")

            # Histogram of the lengths, with the mean marked in red
            fig, ax = plt.subplots(figsize=(12, 5))
            ax.hist(col_len, bins=50, edgecolor='black', alpha=0.7)
            ax.axvline(np.mean(col_len), color='red', linestyle='--',
                      label=f'Moyenne: {np.mean(col_len):.1f}')
            ax.set_xlabel('Longueur (caractères)', fontsize=12)
            ax.set_ylabel('Fréquence', fontsize=12)
            ax.set_title(f'Distribution longueur {col}', fontsize=14, fontweight='bold')
            ax.legend()
            ax.grid(alpha=0.3)
            plt.show()

        # First three non-missing values as examples
        print(f"\n Exemples de {col}:")
        for val in present.head(3):
            print(f"  {val}")

In [None]:
# Run the analysis on the two text columns
text_columns = ["designation", "description"]
analyse_colonnes(train_data, text_columns)


In [None]:
# Per designation: number of rows sharing it, and number of distinct
# categories / descriptions among those rows.
by_designation = train_data.groupby("designation")
duplicates = (
    pd.DataFrame(
        {
            "count": by_designation.size(),
            "unique_categories": by_designation["prdtypecode"].nunique(),
            # dropna=False: a missing description counts as a value of its
            # own. Otherwise a group whose descriptions are all missing gets
            # 0 and falls into neither bucket printed below, and a group
            # mixing one text with missing rows is reported as "identical".
            "unique_descriptions": by_designation["description"].nunique(dropna=False),
        }
    )
    .rename_axis("designation")
    .reset_index()
)

# Keep only designations that occur more than once, most repeated first
duplicates = duplicates[duplicates["count"] > 1].sort_values("count", ascending=False)

same_description = duplicates["unique_descriptions"] == 1
different_descriptions = duplicates["unique_descriptions"] > 1

print("="*70)
print(" VUE D'ENSEMBLE")
print("="*70)
print(f"Total désignations dupliquées: {len(duplicates):,}")
print(f"Doublons avec descriptions identiques: {int(same_description.sum()):,}")
print(f"Doublons avec descriptions différentes: {int(different_descriptions.sum()):,}")
print()


In [None]:
# Identify duplicated designations: per designation, the number of rows
# sharing it and the number of distinct categories / descriptions.
by_designation = train_data.groupby("designation")
duplicates = (
    pd.DataFrame(
        {
            "count": by_designation.size(),
            "unique_categories": by_designation["prdtypecode"].nunique(),
            # dropna=False: a missing description counts as a value of its
            # own. With the default, groups whose descriptions are all
            # missing get 0, match neither "== 1" nor "> 1", and the cross
            # matrix rows no longer add up to the printed grand total.
            "unique_descriptions": by_designation["description"].nunique(dropna=False),
        }
    )
    .rename_axis("designation")
    .reset_index()
)

# Keep only designations that occur more than once, most repeated first
duplicates = duplicates[duplicates["count"] > 1].sort_values("count", ascending=False)
n_duplicates = len(duplicates)

# Boolean masks reused by every breakdown below
same_desc = duplicates["unique_descriptions"] == 1
diff_desc = duplicates["unique_descriptions"] > 1
same_cat = duplicates["unique_categories"] == 1
diff_cat = duplicates["unique_categories"] > 1

print(f"{'='*70}")
print("VUE D'ENSEMBLE - DOUBLONS")
print(f"{'='*70}")

# Global statistics
print("\n Statistiques Globales:")
print(f"   Total désignations dupliquées:         {n_duplicates:,}")
print(f"   Produits concernés (total):            {duplicates['count'].sum():,}")

# Breakdown by descriptions
print("\n Par Descriptions:")
print(f"   Descriptions identiques:               {int(same_desc.sum()):,}")
print(f"   Descriptions différentes:              {int(diff_desc.sum()):,}")

# Breakdown by categories
print("\n  Par Catégories:")
print(f"   Catégories identiques:                 {int(same_cat.sum()):,}")
print(f"   Catégories différentes (CONFLITS):     {int(diff_cat.sum()):,}")

# Cross matrix (descriptions x categories)
print("\n Matrice Croisée (Descriptions × Catégories):")
print(f"{'':40s} {'Cat=1':>15s} {'Cat>1':>15s} {'Total':>15s}")
print(f"{'-'*70}")

desc_1_cat_1 = int((same_desc & same_cat).sum())
desc_1_cat_n = int((same_desc & diff_cat).sum())
desc_n_cat_1 = int((diff_desc & same_cat).sum())
desc_n_cat_n = int((diff_desc & diff_cat).sum())

desc_1_total = desc_1_cat_1 + desc_1_cat_n
desc_n_total = desc_n_cat_1 + desc_n_cat_n

print(f"{'Desc identiques (=1)':40s} {desc_1_cat_1:>15,} {desc_1_cat_n:>15,} {desc_1_total:>15,}")
print(f"{'Desc différentes (>1)':40s} {desc_n_cat_1:>15,} {desc_n_cat_n:>15,} {desc_n_total:>15,}")
print(f"{'-'*70}")
print(f"{'Total':40s} {desc_1_cat_1 + desc_n_cat_1:>15,} {desc_1_cat_n + desc_n_cat_n:>15,} {n_duplicates:>15,}")

# Proportions (skipped when there is no duplicate, to avoid dividing by zero)
print("\n Proportions:")
if n_duplicates > 0:
    print(f"   Cas problématiques (Desc=1, Cat>1):   {desc_1_cat_n:>6,} ({desc_1_cat_n/n_duplicates*100:>5.1f}%)")
    print(f"   Variantes légitimes (Desc>1, Cat=1):  {desc_n_cat_1:>6,} ({desc_n_cat_1/n_duplicates*100:>5.1f}%)")
    print(f"   Ambiguïtés (Desc>1, Cat>1):           {desc_n_cat_n:>6,} ({desc_n_cat_n/n_duplicates*100:>5.1f}%)")
    print(f"   Parfaits doublons (Desc=1, Cat=1):    {desc_1_cat_1:>6,} ({desc_1_cat_1/n_duplicates*100:>5.1f}%)")

print(f"\n{'='*70}\n")
