In [2]:
import pandas as pd
import requests
from io import StringIO
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from shapely.geometry import Point
import folium

import os

# Raw CSV of the cleaned meteorite dataset, hosted on GitHub.
url = "https://raw.githubusercontent.com/Amina212004/MINI_PROJET_ML/refs/heads/data_branch/data/meteorites_final.csv"

# SECURITY: a personal access token used to be hard-coded on this line. Any token
# that was committed in a notebook must be treated as leaked and revoked on GitHub.
# The token is now read from the GITHUB_TOKEN environment variable; when it is not
# set, the request is sent without an Authorization header.
token = os.environ.get("GITHUB_TOKEN")

headers = {"Authorization": f"token {token}"} if token else {}
# A timeout prevents the cell from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page as CSV

df = pd.read_csv(StringIO(r.text))
df.head()


Unnamed: 0,name,year_period,year,recclass,continent,country,mass_cleaned,mass_bin,recclass_clean,fall,reclat,hemisphere,reclong,longitude_zone,period_continent
0,Aachen,19th Century,1880.0,L5,Europe,Belgium,21.0,10-100g,L5,Fell,50.775,North,6.08333,East,19th Century_Europe
1,Aarhus,20th Century,1951.0,H6,Europe,Denmark,720.0,100-1kg,H6,Fell,56.18333,North,10.23333,East,20th Century_Europe
2,Abee,20th Century,1952.0,EH4,North America,Canada,50370.0,>10kg,H4,Fell,54.21667,North,-113.0,West,20th Century_North America
3,Acapulco,20th Century,1976.0,Acapulcoite,North America,Mexico,1914.0,1-10kg,OTHER,Fell,16.88332,North,-99.9,West,20th Century_North America
4,Achiras,20th Century,1902.0,L6,South America,Argentina,780.0,100-1kg,L6,Fell,-33.16667,South,-64.95,West,20th Century_South America


In [22]:
import pandas as pd
import numpy as np
import folium
from mlxtend.frequent_patterns import apriori, association_rules

# =============================
# PARTIE 1 : APRIORI ET ANALYSE
# =============================

# -----------------------------
# Préparation Apriori
# -----------------------------
# One-hot encode the categorical attributes: every (column, value) pair becomes an
# item column named "<column>_<value>". dtype=bool is requested explicitly so that
# mlxtend receives a boolean frame on every pandas version (pandas < 2.0 would
# otherwise produce uint8 columns, which mlxtend handles more slowly and warns about).
df_apriori = pd.get_dummies(
    df[['year_period', 'mass_bin', 'continent', 'country', 'recclass_clean']],
    dtype=bool,
)

# Low support threshold so that patterns involving rare meteorite types are kept.
frequent_itemsets = apriori(df_apriori, min_support=0.005, use_colnames=True)

# Low confidence threshold so that more type-predicting rules are kept.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.2)


# 🔍 ANALYSE DES RÈGLES GÉNÉRÉES
# Banner and headline count for the freshly mined rule set.
print("\n" + "=" * 100)
print("üîç ANALYSE DES R√àGLES D'ASSOCIATION G√âN√âR√âES")
print("=" * 100)
print(f"\nüìä Total de r√®gles brutes : {len(rules)}")


def _rules_predicting(*prefixes):
    """Return the rules whose consequents contain an item matching any of ``prefixes``.

    Matching is a substring test on ``str(item)``, exactly as the per-category
    lambdas it replaces.
    """
    mask = rules['consequents'].apply(
        lambda items: any(prefix in str(item) for item in items for prefix in prefixes)
    )
    # astype(bool) keeps the mask a valid boolean indexer even when `rules` is empty.
    return rules[mask.astype(bool)]


def _share_of_rules(subset):
    """Percentage of all rules represented by ``subset``; 0.0 when no rule was mined."""
    return len(subset) / len(rules) * 100 if len(rules) else 0.0


# Count the rules per kind of consequent. A rule whose consequents mix several
# kinds of items is counted in each matching category, so shares can exceed 100%.
type_rules_count = _rules_predicting('recclass_clean_')
geo_rules_count = _rules_predicting('country_', 'continent_')
mass_rules_count = _rules_predicting('mass_bin_')
year_rules_count = _rules_predicting('year_period_')

print(f"\nüìã R√©partition par type de pr√©diction :")
print(f"   ‚Ä¢ R√®gles pr√©disant TYPE  : {len(type_rules_count):4d} ({_share_of_rules(type_rules_count):5.1f}%)")
print(f"   ‚Ä¢ R√®gles pr√©disant GEO   : {len(geo_rules_count):4d} ({_share_of_rules(geo_rules_count):5.1f}%)")
print(f"   ‚Ä¢ R√®gles pr√©disant MASSE : {len(mass_rules_count):4d} ({_share_of_rules(mass_rules_count):5.1f}%)")
print(f"   ‚Ä¢ R√®gles pr√©disant ANN√âE : {len(year_rules_count):4d} ({_share_of_rules(year_rules_count):5.1f}%)")

print(f"\nüéØ Exemples de r√®gles pr√©disant un TYPE (top 5 par confiance) :")
if not type_rules_count.empty:
    top_type_rules = type_rules_count.sort_values('confidence', ascending=False).head(5)
    print(top_type_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_string(index=False))
else:
    print("   ‚ö†Ô∏è  Aucune r√®gle pr√©disant un type trouv√©e !")

print("=" * 100 + "\n")

# -----------------------------
# Création de mappings pour optimisation
# -----------------------------
# Lookup from each year present in the data to its period label (rows without a
# year are ignored).
YEAR_TO_PERIOD = df.dropna(subset=['year']).set_index('year')['year_period'].to_dict()


def _parse_mass_bin(label):
    """Convert a closed mass-bin label into ``(low, high)`` bounds in grams.

    Handles labels such as '10-100g', '100-1kg' and '1-10kg'. Returns ``None`` for
    labels that are not of the form '<low>-<high>' (e.g. the open-ended '>10kg').

    The previous inline parsing removed 'g' before looking for 'kg', so 'kg' was
    never found and every kilogram bin failed to parse and was silently dropped.

    Raises:
        ValueError: if a bound is not numeric.
        AttributeError: if ``label`` is not a string.
    """
    def to_grams(text, bare_factor):
        # `bare_factor` is the multiplier applied when the bound carries no unit.
        text = text.strip()
        if text.endswith('kg'):
            return float(text[:-2]) * 1000.0
        if text.endswith('g'):
            return float(text[:-1])
        return float(text) * bare_factor

    bounds = label.split('-')
    if len(bounds) != 2:
        return None

    upper_factor = 1000.0 if bounds[1].strip().endswith('kg') else 1.0
    high = to_grams(bounds[1], 1.0)
    # A unit-less lower bound first inherits the upper bound's unit ('1-10kg' means
    # 1 kg to 10 kg). If that makes it exceed the upper bound it was in grams
    # ('100-1kg' means 100 g to 1 kg).
    low = to_grams(bounds[0], upper_factor)
    if low > high:
        low = to_grams(bounds[0], 1.0)
    return (low, high)


# Bounds in grams for every closed mass bin found in the data.
MASS_BIN_RANGES = {}

for mb in df['mass_bin'].dropna().unique():
    try:
        parsed_range = _parse_mass_bin(mb)
    except (ValueError, AttributeError):
        # Unparseable label: leave it out of the range lookup.
        continue
    if parsed_range is not None:
        MASS_BIN_RANGES[mb] = parsed_range


# -----------------------------
# Filtrer les tautologies géographiques
# -----------------------------
def is_geographic_tautology(row):
    """Tell whether a rule merely links a continent to a country (in either direction).

    Args:
        row: mapping with 'antecedents' and 'consequents', each an iterable of items.

    Returns:
        True when the antecedents mention a continent item and the consequents a
        country item, or the antecedents a country item and the consequents a
        continent item; False otherwise. Matching is a substring test on str(item).
    """
    def mentions(items, marker):
        return any(marker in str(entry) for entry in items)

    lhs, rhs = row['antecedents'], row['consequents']

    # continent -> country
    if mentions(lhs, 'continent_') and mentions(rhs, 'country_'):
        return True
    # country -> continent
    return mentions(lhs, 'country_') and mentions(rhs, 'continent_')


def is_type_prediction_rule(row):
    """Tell whether a rule predicts a meteorite type.

    Returns True when at least one item of row['consequents'] contains the
    'recclass_clean_' marker (substring test on str(item)), False otherwise.
    """
    for consequent in row['consequents']:
        if 'recclass_clean_' in str(consequent):
            return True
    return False


# -----------------------------
# Filtrer règles selon critères (VERSION OPTIMISÉE + NETTOYÉE)
# -----------------------------
def filter_rules(rules, years=None, mass_bins=None, continents=None):
    """Select the association rules relevant to the user's criteria.

    Each criterion is translated into the one-hot item names used by the rules
    ('year_period_<period>', 'mass_bin_<bin>', 'continent_<name>'). Items are
    grouped per dimension: a rule is kept when its antecedents contain at least
    one requested item of EVERY dimension the user constrained.

    The previous implementation put all items in one set and required the
    antecedents to contain all of them. Items of the same dimension are mutually
    exclusive one-hot columns, so a year range spanning two periods, several
    continents, or several mass bins could never match any rule. With a single
    item per dimension the result is unchanged.

    Args:
        rules: DataFrame of association rules with 'antecedents' and 'consequents'.
        years: iterable of single years or (start, end) pairs, resolved through
            YEAR_TO_PERIOD; years absent from that mapping are ignored.
        mass_bins: iterable of bin labels or (low, high) pairs; a pair selects the
            bins of MASS_BIN_RANGES lying entirely inside it.
        continents: iterable of continent names.

    Returns:
        The matching rules with geographic tautologies removed. Type-predicting
        rules are returned when there are any; otherwise the remaining
        non-tautological rules. When no criterion resolves to an item, all
        non-tautological type-predicting rules are returned.
    """
    # Years -> period items.
    year_items = set()
    for y in years or []:
        if isinstance(y, (list, tuple)) and len(y) == 2:
            year_items.update(
                f'year_period_{period}'
                for yr, period in YEAR_TO_PERIOD.items()
                if y[0] <= yr <= y[1]
            )
        elif y in YEAR_TO_PERIOD:
            year_items.add(f'year_period_{YEAR_TO_PERIOD[y]}')

    # Masses -> bin items.
    mass_items = set()
    for m in mass_bins or []:
        if isinstance(m, (list, tuple)) and len(m) == 2:
            mass_items.update(
                f'mass_bin_{mb}'
                for mb, (low, high) in MASS_BIN_RANGES.items()
                if m[0] <= low and high <= m[1]
            )
        else:
            mass_items.add(f'mass_bin_{m}')

    # Continents -> continent items.
    continent_items = {f'continent_{c}' for c in continents or []}

    # Only dimensions that produced at least one item constrain the search.
    criteria_groups = [group for group in (year_items, mass_items, continent_items) if group]

    if not criteria_groups:
        # No usable criterion: every non-tautological type-predicting rule.
        filtered = rules[~rules.apply(is_geographic_tautology, axis=1)]
        filtered = filtered[filtered.apply(is_type_prediction_rule, axis=1)]
        return filtered

    def matches_all_dimensions(antecedents):
        # OR within a dimension, AND across dimensions.
        return all(not group.isdisjoint(antecedents) for group in criteria_groups)

    filtered = rules[rules['antecedents'].apply(matches_all_dimensions)]

    # Drop continent <-> country tautologies.
    filtered = filtered[~filtered.apply(is_geographic_tautology, axis=1)]

    # Prefer rules that predict a meteorite type.
    type_rules = filtered[filtered.apply(is_type_prediction_rule, axis=1)]
    if not type_rules.empty:
        return type_rules

    # Fallback: non-tautological rules even though none predicts a type.
    return filtered


# -----------------------------
# Type le plus probable avec score combiné
# -----------------------------
def get_most_probable_type(filtered_rules, df_subset):
    """Pick the most likely meteorite type from the rules, or from raw frequencies.

    Each rule adds ``confidence * support`` to the score of every type item
    ('recclass_clean_<type>') in its consequents.

    Args:
        filtered_rules: DataFrame with 'consequents', 'confidence' and 'support'.
        df_subset: DataFrame used as fallback when no rule yields a type; its
            'recclass_clean' column is read if present.

    Returns:
        (top_type, prob, type_scores):
        - with scored rules: the best-scoring type, its share of the total score,
          and the score dictionary;
        - otherwise: the most frequent 'recclass_clean' value of ``df_subset`` and
          its relative frequency, with an empty score dictionary;
        - ("Unknown", 0.0, {}) when the fallback has no usable value (empty frame,
          missing column, or a column holding only missing values, which
          previously raised inside ``idxmax``).
    """
    type_scores = {}

    if not filtered_rules.empty:
        for _, row in filtered_rules.iterrows():
            weight = row['confidence'] * row['support']
            for item in row['consequents']:
                if 'recclass_clean_' in item:
                    type_name = item.replace('recclass_clean_', '')
                    type_scores[type_name] = type_scores.get(type_name, 0) + weight

    if type_scores:
        top_type = max(type_scores, key=type_scores.get)
        prob = type_scores[top_type] / sum(type_scores.values())
        return top_type, prob, type_scores

    # Fallback: raw frequencies, computed once (value_counts ignores missing values).
    if not df_subset.empty and 'recclass_clean' in df_subset.columns:
        shares = df_subset['recclass_clean'].value_counts(normalize=True)
        if not shares.empty:
            return shares.idxmax(), shares.max(), type_scores

    return "Unknown", 0.0, type_scores


# -----------------------------
# Prédire valeurs manquantes selon type (VERSION CONTEXTUELLE)
# -----------------------------
def predict_missing_criteria(df, top_type, user_years=None, user_mass=None, user_continents=None):
    """Fill in the criteria the user left out, given the predicted type.

    The rows of ``top_type`` are first narrowed by whatever the user did specify,
    then each missing criterion is predicted as the most frequent value in that
    contextual subset.

    Args:
        df: dataset with 'recclass_clean', 'year', 'year_period', 'mass_cleaned',
            'mass_bin' and 'continent' columns.
        top_type: value of 'recclass_clean' to restrict to.
        user_years: iterable of single years or inclusive (start, end) pairs.
            Pair bounds may be any numbers (the old ``range``-based expansion
            raised TypeError on non-integer bounds).
        user_mass: iterable of 'mass_bin' labels or inclusive (low, high) pairs
            compared against 'mass_cleaned'.
        user_continents: iterable of continent names.

    Returns:
        (year_pred, mass_pred, continent_pred). A criterion supplied by the user
        is returned unchanged; a missing one is the mode of 'year_period',
        'mass_bin' or 'continent' in the subset, or None when the subset is empty
        or the column holds only missing values (previously an error).
    """
    df_type = df[df['recclass_clean'] == top_type].copy()

    # Narrow by the criteria the user already supplied.
    if user_years:
        year_mask = pd.Series(False, index=df_type.index, dtype=bool)
        for y in user_years:
            if isinstance(y, (list, tuple)) and len(y) == 2:
                year_mask |= df_type['year'].between(y[0], y[1])
            else:
                year_mask |= df_type['year'].isin([y])
        df_type = df_type[year_mask]

    if user_mass:
        mass_mask = pd.Series(False, index=df_type.index, dtype=bool)
        for m in user_mass:
            if isinstance(m, (list, tuple)) and len(m) == 2:
                mass_mask |= df_type['mass_cleaned'].between(m[0], m[1])
            else:
                mass_mask |= (df_type['mass_bin'] == m)
        df_type = df_type[mass_mask]

    if user_continents:
        df_type = df_type[df_type['continent'].isin(user_continents)]

    def most_common(column):
        # mode() drops missing values, so it can be empty even on a non-empty frame.
        modes = df_type[column].mode()
        return modes.iloc[0] if not modes.empty else None

    # Predict from the contextual subset only what the user did not provide.
    year_pred = user_years if user_years else most_common('year_period')
    mass_pred = user_mass if user_mass else most_common('mass_bin')
    continent_pred = user_continents if user_continents else most_common('continent')

    return year_pred, mass_pred, continent_pred


# -----------------------------
# Infos selon critères RÉELS (utilisateur + prédictions)
# -----------------------------
def get_type_info(df, top_type, user_years=None, user_mass=None, user_continents=None,
                  pred_year=None, pred_mass=None, pred_continent=None):
    """Collect the meteorites of ``top_type`` that satisfy the effective criteria.

    Year and mass filters come from the user only. The continent filter uses the
    user's continents or, when none were given, the predicted continent.
    ``pred_year`` and ``pred_mass`` are accepted for interface compatibility but
    are not applied to the filtering.

    Args:
        df: dataset with 'recclass_clean', 'year', 'mass_cleaned', 'mass_bin',
            'continent', 'name' and 'country' columns.
        top_type: value of 'recclass_clean' to restrict to.
        user_years: iterable of single years or inclusive (start, end) pairs.
            Pair bounds may be any numbers (the old ``range``-based expansion
            raised TypeError on non-integer bounds).
        user_mass: iterable of 'mass_bin' labels or inclusive (low, high) pairs
            compared against 'mass_cleaned'.
        user_continents: iterable of continent names.
        pred_year, pred_mass: unused (see above).
        pred_continent: continent applied when ``user_continents`` is empty.

    Returns:
        (names, countries, sample_years, mass_bin, df_type): the names, the
        distinct non-missing countries, the non-missing years, the most frequent
        mass bin (None when there is none) and the filtered DataFrame itself.
    """
    df_type = df[df['recclass_clean'] == top_type].copy()

    # Effective criteria: the user's own, plus the predicted continent as fallback.
    final_years = user_years
    final_mass = user_mass
    final_continents = user_continents if user_continents else ([pred_continent] if pred_continent else None)

    # Year filter.
    if final_years:
        year_mask = pd.Series(False, index=df_type.index, dtype=bool)
        for y in final_years:
            if isinstance(y, (list, tuple)) and len(y) == 2:
                year_mask |= df_type['year'].between(y[0], y[1])
            else:
                year_mask |= df_type['year'].isin([y])
        df_type = df_type[year_mask]

    # Mass filter.
    if final_mass:
        mass_mask = pd.Series(False, index=df_type.index, dtype=bool)
        for m in final_mass:
            if isinstance(m, (list, tuple)) and len(m) == 2:
                mass_mask |= df_type['mass_cleaned'].between(m[0], m[1])
            else:
                mass_mask |= (df_type['mass_bin'] == m)
        df_type = df_type[mass_mask]

    # Continent filter.
    if final_continents:
        df_type = df_type[df_type['continent'].isin(final_continents)]

    names = df_type['name'].tolist()
    countries = df_type['country'].dropna().unique().tolist()
    sample_years = df_type['year'].dropna().tolist()
    # mode() drops missing values, so it can be empty even on a non-empty frame.
    mass_modes = df_type['mass_bin'].mode()
    mass_bin = mass_modes.iloc[0] if not mass_modes.empty else None

    return names, countries, sample_years, mass_bin, df_type


# -----------------------------
# Statistiques de qualité des règles
# -----------------------------
def get_rules_statistics(filtered_rules):
    """Summarise a set of filtered rules.

    Returns a dict with the number of rules ('total'), how many predict a type
    ('type_rules'), how many do not ('geo_rules'), and the mean 'confidence' and
    'lift'. All values are 0 when ``filtered_rules`` is empty.
    """
    summary = {
        'total': 0,
        'type_rules': 0,
        'geo_rules': 0,
        'mean_confidence': 0,
        'mean_lift': 0
    }
    if filtered_rules.empty:
        return summary

    rule_count = len(filtered_rules)
    predicts_type = filtered_rules.apply(is_type_prediction_rule, axis=1)
    type_count = len(filtered_rules[predicts_type])

    summary['total'] = rule_count
    summary['type_rules'] = type_count
    # Everything that does not predict a type is reported as geographic/other.
    summary['geo_rules'] = rule_count - type_count
    summary['mean_confidence'] = filtered_rules['confidence'].mean()
    summary['mean_lift'] = filtered_rules['lift'].mean()
    return summary


# -----------------------------
# Traitement d'une sélection utilisateur (AMÉLIORÉ)
# -----------------------------
def process_user_selection(sel, rules, df):
    """Run the whole pipeline for one user selection.

    Steps: filter the rules by the selection, predict the most probable type
    (the full ``df`` is handed over as the frequency fallback), predict the
    criteria the user left out, gather the matching meteorites, and compute
    statistics on the rules used.

    Args:
        sel: dict read through the keys 'years', 'mass' and 'continents'.
        rules: DataFrame of association rules.
        df: meteorite dataset.

    Returns:
        dict bundling the selection, the filtered rules, the predicted type with
        its probability and scores, the predicted criteria, the matching names,
        countries, years, dominant mass bin and points, and the rule statistics.
    """
    years = sel.get('years')
    mass = sel.get('mass')
    continents = sel.get('continents')

    # Rules relevant to the selection, then the type they point to.
    matching_rules = filter_rules(rules, years, mass, continents)
    best_type, best_prob, all_scores = get_most_probable_type(matching_rules, df)

    # Criteria the user did not provide.
    predicted = predict_missing_criteria(df, best_type, years, mass, continents)
    year_pred, mass_pred, cont_pred = predicted

    # Matching meteorites, with the predictions passed along.
    type_info = get_type_info(
        df, best_type,
        years, mass, continents,
        year_pred, mass_pred, cont_pred
    )
    names, countries, sample_years, mass_bin, df_points = type_info

    rule_stats = get_rules_statistics(matching_rules)

    return dict(
        selection=sel,
        filtered_rules=matching_rules,
        top_type=best_type,
        probability=best_prob,
        type_scores=all_scores,
        year_pred=year_pred,
        mass_pred=mass_pred,
        continent_pred=cont_pred,
        names=names,
        countries=countries,
        sample_years=sample_years,
        mass_bin=mass_bin,
        df_points=df_points,
        stats=rule_stats,
    )


# -----------------------------
# Carte interactive multi-exemples
# -----------------------------
def plot_examples_on_map(examples_info, colors=None, map_file='map.html'):
    """Draw the points of several examples on one folium map, one colour each.

    Args:
        examples_info: iterable of dicts; each provides a 'df_points' DataFrame
            whose rows are read through 'reclat', 'reclong', 'name',
            'mass_cleaned' and 'country'.
        colors: marker colours, cycled over the examples. ``None`` or an empty
            sequence selects the default palette (an empty sequence previously
            caused a ZeroDivisionError).
        map_file: path of the HTML file to write.

    Returns:
        The folium.Map, after it has been saved to ``map_file``.
    """
    # Local import: keeps this function self-contained.
    import html

    if not colors:
        colors = ['hotpink', 'purple', 'orange', 'blue', 'green', 'darkred', 'yellow', 'red']

    m = folium.Map(location=[0, 0], zoom_start=2)

    for i, info in enumerate(examples_info):
        df_points = info['df_points']
        color = colors[i % len(colors)]

        for _, row in df_points.iterrows():
            # Rows without coordinates cannot be placed on the map.
            if pd.notna(row.get('reclat')) and pd.notna(row.get('reclong')):
                # The popup string is rendered as HTML, so values coming from the
                # dataset are escaped before being interpolated.
                name = html.escape(str(row['name']))
                mass = html.escape(str(row.get('mass_cleaned', 'N/A')))
                country = html.escape(str(row.get('country', 'N/A')))
                folium.CircleMarker(
                    location=[row['reclat'], row['reclong']],
                    radius=4,
                    popup=f"<b>{name}</b><br>{mass} g<br>{country}",
                    color=color,
                    fill=True,
                    fill_color=color,
                    fill_opacity=0.7
                ).add_to(m)

    m.save(map_file)
    print(f"‚úÖ Carte sauvegard√©e : {map_file}")
    return m


# -----------------------------
# EXÉCUTION : Exemples utilisateur
# -----------------------------
# Example selections: a criterion set to None is left for the pipeline to predict.
user_selections = [
    {"years": [(1994, 2006)], "mass": ['10-100g'], "continents": ['Africa']},
    {"years": [2000], "mass": None, "continents": ['Asia']},
    {"years": [(1900, 1950)], "mass": None, "continents": None},
]


def _print_section(title):
    """Print a blank line, then ``title`` framed by two 100-character rules."""
    print(f"\n{'‚îÄ' * 100}")
    print(title)
    print(f"{'‚îÄ' * 100}")


print("\n" + "=" * 100)
print(" " * 30 + "üå† ANALYSE DES S√âLECTIONS UTILISATEUR üå†")
print("=" * 100)

# One result dict per selection, reused afterwards to draw the map.
examples_info = []

for i, sel in enumerate(user_selections, 1):
    # Example banner.
    print("\n" + '‚ñà' * 100)
    print('‚ñà', f'EXEMPLE {i}'.center(96), '‚ñà')
    print('‚ñà' * 100)

    # Input criteria.
    _print_section("üì• CRIT√àRES D'ENTR√âE :")
    print(f"   ‚Ä¢ Ann√©es      : {sel.get('years') or '‚ùå Non sp√©cifi√© (sera pr√©dit)'}")
    print(f"   ‚Ä¢ Masse       : {sel.get('mass') or '‚ùå Non sp√©cifi√© (sera pr√©dit)'}")
    print(f"   ‚Ä¢ Continents  : {sel.get('continents') or '‚ùå Non sp√©cifi√© (sera pr√©dit)'}")

    # Full pipeline for this selection.
    result = process_user_selection(sel, rules, df)
    examples_info.append(result)

    # Predicted type.
    _print_section("üéØ PR√âDICTION DU TYPE :")
    print(f"   ‚Ä¢ Type probable       : {result['top_type']}")
    print(f"   ‚Ä¢ Probabilit√©         : {result['probability']:.2%}")
    print(f"   ‚Ä¢ M√©t√©orites trouv√©es : {len(result['df_points'])} points sur la carte")

    # Final criteria: the user's value when given, the prediction otherwise.
    _print_section("üìä CRIT√àRES FINAUX (utilis√©s pour filtrer les points sur la carte) :")

    print(f"\n   üìÖ ANN√âES :")
    if not sel.get('years'):
        print(f"      üîÆ Pr√©diction         : {result['year_pred']}")
        if result['sample_years']:
            print(f"      üìù Exemples d'ann√©es  : {result['sample_years'][:15]}")
    else:
        print(f"      ‚úì Crit√®re utilisateur : {sel['years']}")

    print(f"\n   ‚öñÔ∏è  MASSE :")
    if not sel.get('mass'):
        print(f"      üîÆ Pr√©diction         : {result['mass_pred']}")
    else:
        print(f"      ‚úì Crit√®re utilisateur : {sel['mass']}")

    print(f"\n   üåç CONTINENT :")
    if not sel.get('continents'):
        print(f"      üîÆ Pr√©diction (appliqu√©e au filtrage) : {result['continent_pred']}")
        print(f"      ‚ÑπÔ∏è  Les points de la carte sont filtr√©s selon cette pr√©diction")
    else:
        print(f"      ‚úì Crit√®re utilisateur : {sel['continents']}")

    # Geographic information (at most 15 countries listed).
    _print_section("üó∫Ô∏è  INFORMATIONS G√âOGRAPHIQUES :")
    print(f"   ‚Ä¢ Nombre de pays : {len(result['countries'])}")
    if not result['countries']:
        print(f"   ‚Ä¢ ‚ö†Ô∏è  Aucun pays trouv√© avec ces crit√®res")
    else:
        print(f"   ‚Ä¢ Pays (√©chantillon) : {', '.join(result['countries'][:15])}")
        if len(result['countries']) > 15:
            print(f"     ... et {len(result['countries']) - 15} autres pays")

    # Example meteorites (at most 15 names listed).
    _print_section("‚òÑÔ∏è  EXEMPLES DE M√âT√âORITES :")
    print(f"   ‚Ä¢ Nombre total : {len(result['names'])}")
    if not result['names']:
        print(f"   ‚Ä¢ ‚ö†Ô∏è  Aucune m√©t√©orite trouv√©e avec ces crit√®res")
    else:
        print(f"   ‚Ä¢ √âchantillon  : {', '.join(result['names'][:15])}")
        if len(result['names']) > 15:
            print(f"     ... et {len(result['names']) - 15} autres m√©t√©orites")

    # Quality statistics of the rules that were used.
    _print_section("üìã R√àGLES D'ASSOCIATION UTILIS√âES :")

    if result['stats']['total'] <= 0:
        print(f"   ‚ö†Ô∏è  Aucune r√®gle d'association trouv√©e car la combinaison des criteres est trop rare")
        print(f"   ‚ÑπÔ∏è  Utilisation des fr√©quences brutes du dataset (fallback mode)")
    else:
        print(f"   ‚úÖ {result['stats']['total']} r√®gle(s) trouv√©e(s)")
        print(f"   üìä Statistiques :")
        print(f"      ‚Ä¢ R√®gles pr√©disant un TYPE      : {result['stats']['type_rules']}")
        print(f"      ‚Ä¢ R√®gles g√©ographiques/autres   : {result['stats']['geo_rules']}")
        print(f"      ‚Ä¢ Confiance moyenne             : {result['stats']['mean_confidence']:.2%}")
        print(f"      ‚Ä¢ Lift moyen                    : {result['stats']['mean_lift']:.2f}")

        print(f"\n   üîç Top 10 r√®gles (tri√©es par confiance puis support) :")
        shown_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
        top_rules = result['filtered_rules'][shown_columns]
        top_rules = top_rules.sort_values(['confidence', 'support'], ascending=False).head(10)
        print(top_rules.to_string(index=False))

    print("\n" + '‚ñà' * 100 + "\n")

# Génération de la carte
print("\n" + "=" * 100)
print(" " * 35 + "üó∫Ô∏è  G√âN√âRATION DE LA CARTE INTERACTIVE üó∫Ô∏è")
print("=" * 100 + "\n")


def _criterion_text(user_value, predicted_value):
    """Show the user's value when given, otherwise the prediction with its prefix."""
    if user_value:
        return str(user_value)
    return f"Pr√©dit: {predicted_value}"


# Legend: one label per example, in the order the examples were processed.
print("üìç L√©gende des couleurs sur la carte :")
colors_legend = ['üî¥ Rose','üü£ Violet' ,'üü† Orange', 'üîµ Bleu', 'üü¢ Vert']
for i, (sel, color) in enumerate(zip(user_selections, colors_legend), 1):
    result = examples_info[i-1]
    years_str = _criterion_text(sel.get('years'), result['year_pred'])
    mass_str = _criterion_text(sel.get('mass'), result['mass_pred'])
    cont_str = _criterion_text(sel.get('continents'), result['continent_pred'])
    print(f"   {color} - Exemple {i} : Ann√©es={years_str}, Masse={mass_str}, Continent={cont_str}")

print()
plot_examples_on_map(examples_info, map_file='map.html')

print("\n" + "=" * 100)
print("‚úÖ Analyse termin√©e ! Ouvrez 'map.html' pour voir la carte interactive.")
print("=" * 100 + "\n")


üîç ANALYSE DES R√àGLES D'ASSOCIATION G√âN√âR√âES

üìä Total de r√®gles brutes : 2552

üìã R√©partition par type de pr√©diction :
   ‚Ä¢ R√®gles pr√©disant TYPE  :  245 (  9.6%)
   ‚Ä¢ R√®gles pr√©disant GEO   : 1609 ( 63.0%)
   ‚Ä¢ R√®gles pr√©disant MASSE :  821 ( 32.2%)
   ‚Ä¢ R√®gles pr√©disant ANN√âE :  935 ( 36.6%)

üéØ Exemples de r√®gles pr√©disant un TYPE (top 5 par confiance) :
                                                antecedents                                                     consequents  support  confidence      lift
                  (year_period_20th Century, country_Kenya)                           (continent_Africa, recclass_clean_H6) 0.005638    0.800000 56.257269
(year_period_20th Century, continent_Africa, country_Kenya)                                             (recclass_clean_H6) 0.005638    0.800000  7.907368
                  (year_period_20th Century, country_Kenya)                                             (recclass_clean_H6) 0.005638    0.8

In [9]:
# =============================
# PARTIE 2 : ÉVALUATION QUALITÉ
# =============================

def evaluate_rules_quality(rules):
    """Grade every non-tautological rule as 'Forte', 'Moyenne' or 'Faible'.

    Geographic tautologies are removed first. A rule is 'Forte' when
    support >= 0.01, confidence >= 0.7 and lift >= 1.2; otherwise 'Moyenne' when
    support >= 0.005, confidence >= 0.5 and lift >= 1.0; otherwise 'Faible'.
    A summary and the five strong rules with the highest lift are printed.

    Returns:
        A copy of the non-tautological rules with an added 'quality' column.
    """
    tautology_mask = rules.apply(is_geographic_tautology, axis=1)
    eval_df = rules[~tautology_mask].copy()

    print(f"\n‚ö†Ô∏è  {len(rules) - len(eval_df)} tautologies g√©ographiques exclues de l'√©valuation")

    support = eval_df['support']
    confidence = eval_df['confidence']
    lift = eval_df['lift']

    is_strong = (support >= 0.01) & (confidence >= 0.7) & (lift >= 1.2)
    is_medium = (support >= 0.005) & (confidence >= 0.5) & (lift >= 1.0)

    # The first matching grade wins; whatever matches neither grade is 'Faible'.
    eval_df['quality'] = np.select([is_strong, is_medium], ['Forte', 'Moyenne'], default='Faible')

    summary = eval_df['quality'].value_counts()
    print("\n" + "=" * 80)
    print("√âVALUATION DE LA QUALIT√â DES R√àGLES")
    print("=" * 80)
    print("\nüìä R√©sum√© des r√®gles (hors tautologies) :")
    print(summary)

    print("\nüîç Top 5 r√®gles fortes (lift √©lev√©) :")
    strong_rules = eval_df[eval_df['quality'] == 'Forte']
    strong_rules = strong_rules.sort_values('lift', ascending=False).head(5)
    if strong_rules.empty:
        print("   Aucune r√®gle forte trouv√©e.")
    else:
        print(strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_string())

    return eval_df

# Run the evaluation on the full rule set; the graded copy is kept in `evaluated_rules`.
evaluated_rules = evaluate_rules_quality(rules)


‚ö†Ô∏è  845 tautologies g√©ographiques exclues de l'√©valuation

√âVALUATION DE LA QUALIT√â DES R√àGLES

üìä R√©sum√© des r√®gles (hors tautologies) :
quality
Faible     1090
Moyenne     430
Forte       187
Name: count, dtype: int64

üîç Top 5 r√®gles fortes (lift √©lev√©) :
                                               antecedents                 consequents   support  confidence      lift
1794  (country_Oman, recclass_clean_OTHER, continent_Asia)  (year_period_21st Century)  0.011934    0.989610  3.566753
645                   (country_Oman, recclass_clean_OTHER)  (year_period_21st Century)  0.011934    0.989610  3.566753
520                       (country_Oman, mass_bin_10-100g)  (year_period_21st Century)  0.023523    0.967784  3.488085
1611      (country_Oman, continent_Asia, mass_bin_10-100g)  (year_period_21st Century)  0.023523    0.967784  3.488085
60                                          (country_Oman)  (year_period_21st Century)  0.088580    0.945503  3.407782


In [3]:
# Spot check: rows from 1950, in the 10-100g bin, located in Europe.
df.loc[df['year'].eq(1950) & df['mass_bin'].eq('10-100g') & df['continent'].eq('Europe')]

Unnamed: 0,name,year_period,year,recclass,continent,country,mass_cleaned,mass_bin,recclass_clean,fall,reclat,hemisphere,reclong,longitude_zone,period_continent
4934,Barcis,20th Century,1950.0,"Pallasite, PMG",Europe,Italy,87.0,10-100g,OTHER,Found,46.1,North,12.35,East,20th Century_Europe
