In [5]:
# %load explore.py
"""
Script d'analyse exploratoire des donn√©es
Alternative au notebook pour une ex√©cution rapide
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Plot configuration: apply the 'seaborn-v0_8-darkgrid' matplotlib style
# and make "husl" the default seaborn color palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

def load_data(path='../data/car-details.csv'):
    """Load the car dataset from a CSV file and print a short preview.

    Args:
        path: Location of the CSV file handed to ``pd.read_csv``. Defaults to
            '../data/car-details.csv', so existing zero-argument callers
            keep their behavior.

    Returns:
        The loaded pandas DataFrame.
    """
    print("üìÇ Chargement des donn√©es...")
    df = pd.read_csv(path)
    print(f"‚úÖ Donn√©es charg√©es : {df.shape[0]} lignes, {df.shape[1]} colonnes")
    # Show the first rows so the user can eyeball the parsed columns.
    print(df.head())
    return df

def basic_info(df):
    """Print a general overview of the DataFrame.

    The report lists the column names, the shape, the dtype of every column,
    the number of missing values per column and the count of duplicated rows.
    Nothing is returned.
    """
    banner = "="*50
    print("\n" + banner)
    print("üìä INFORMATIONS G√âN√âRALES")
    print(banner)

    column_names = df.columns.tolist()
    print(f"\nüìã Colonnes : {column_names}")

    shape = df.shape
    print(f"\nüìè Dimensions : {shape}")

    print("\nüî¢ Types de donn√©es :")
    print(df.dtypes)

    print("\n‚ùì Valeurs manquantes :")
    missing_per_column = df.isnull().sum()
    print(missing_per_column)

    duplicate_count = df.duplicated().sum()
    print(f"\nüîÅ Doublons : {duplicate_count}")

def statistical_summary(df):
    """Print descriptive statistics for the DataFrame.

    ``df.describe()`` is always printed. When a 'selling_price' column is
    present, its minimum, maximum, mean and median follow, formatted with
    thousands separators and two decimals.
    """
    rule = "="*50
    print("\n" + rule)
    print("üìà STATISTIQUES DESCRIPTIVES")
    print(rule)
    print(df.describe())

    # The price breakdown only makes sense when the column exists.
    if 'selling_price' not in df.columns:
        return

    prices = df['selling_price']
    print("\nüí∞ Prix :")
    print(f"  - Minimum : {prices.min():,.2f}")
    print(f"  - Maximum : {prices.max():,.2f}")
    print(f"  - Moyenne : {prices.mean():,.2f}")
    print(f"  - M√©diane : {prices.median():,.2f}")

def categorical_analysis(df):
    """Print the value distribution of each object-dtype column.

    For every column selected by ``select_dtypes(include=['object'])`` the
    first ten entries of its ``value_counts()`` are shown, followed by the
    number of distinct values.
    """
    separator = "="*50
    print("\n" + separator)
    print("üè∑Ô∏è  VARIABLES CAT√âGORIELLES")
    print(separator)

    object_columns = df.select_dtypes(include=['object']).columns

    for name in object_columns:
        column = df[name]
        print(f"\nüìä Distribution de '{name}' :")
        top_values = column.value_counts().head(10)
        print(top_values)
        distinct_count = column.nunique()
        print(f"   Nombre de valeurs uniques : {distinct_count}")

def correlation_analysis(df):
    """Print the correlation matrix of the numeric columns.

    Only the section banner is printed unless at least two numeric columns
    exist. When 'selling_price' is among them, the correlation of every
    other numeric column with it is listed in descending order.
    """
    line = "="*50
    print("\n" + line)
    print("üîó CORR√âLATIONS")
    print(line)

    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # A correlation matrix needs at least two numeric columns.
    if len(numeric_cols) <= 1:
        return

    print("\nMatrice de corr√©lation :")
    corr_matrix = df[numeric_cols].corr()
    print(corr_matrix)

    if 'selling_price' not in corr_matrix.columns:
        return

    print("\nüí∞ Corr√©lation avec le Prix :")
    ranked = corr_matrix['selling_price'].sort_values(ascending=False)
    for feature, value in ranked.items():
        # Skip the price's correlation with itself.
        if feature == 'selling_price':
            continue
        print(f"  - {feature}: {value:.4f}")

def visualizations(df, output_dir='visualizations'):
    """Generate the exploratory charts and save them as PNG files.

    Each chart is produced only when the columns it needs are present in
    ``df``:

    * price_distribution.png -- histogram of 'selling_price'
    * price_by_company.png -- mean 'selling_price' for the ten most
      frequent 'company' values
    * price_vs_kms.png -- scatter of 'km_driven' against 'selling_price'
    * price_by_year.png -- mean 'selling_price' per 'year'
    * fuel_type_distribution.png -- pie chart of the 'fuel' value counts

    Args:
        df: DataFrame to plot.
        output_dir: Directory that receives the PNG files; it is created
            when missing. Defaults to 'visualizations', so existing
            single-argument callers keep their behavior.
    """
    print("\n" + "="*50)
    print("üìä G√âN√âRATION DES VISUALISATIONS")
    print("="*50)

    # Make sure the target directory exists before any figure is written.
    import os
    os.makedirs(output_dir, exist_ok=True)

    def save_figure(filename):
        """Save the active figure into output_dir, close it and report it."""
        plt.savefig(os.path.join(output_dir, filename), dpi=150, bbox_inches='tight')
        # Close the figure so repeated plots do not accumulate in memory.
        plt.close()
        print(f"‚úÖ Graphique sauvegard√© : {filename}")

    has_price = 'selling_price' in df.columns

    # 1. Price distribution
    if has_price:
        plt.figure(figsize=(10, 6))
        plt.hist(df['selling_price'], bins=50, edgecolor='black', alpha=0.7)
        plt.xlabel('Prix')
        plt.ylabel('Fr√©quence')
        plt.title('Distribution des Prix')
        save_figure('price_distribution.png')

    # 2. Mean price per brand, restricted to the ten most frequent brands
    if has_price and 'company' in df.columns:
        top_companies = df['company'].value_counts().head(10).index
        df_top = df[df['company'].isin(top_companies)]

        plt.figure(figsize=(12, 6))
        df_top.groupby('company')['selling_price'].mean().sort_values().plot(kind='barh')
        plt.xlabel('Prix Moyen')
        plt.ylabel('Marque')
        plt.title('Prix Moyen par Marque (Top 10)')
        plt.tight_layout()
        save_figure('price_by_company.png')

    # 3. Price versus mileage
    if has_price and 'km_driven' in df.columns:
        plt.figure(figsize=(10, 6))
        plt.scatter(df['km_driven'], df['selling_price'], alpha=0.5)
        plt.xlabel('Kilom√©trage')
        plt.ylabel('Prix')
        plt.title('Prix vs Kilom√©trage')
        save_figure('price_vs_kms.png')

    # 4. Mean price per year
    if has_price and 'year' in df.columns:
        plt.figure(figsize=(10, 6))
        df.groupby('year')['selling_price'].mean().plot(kind='line', marker='o')
        plt.xlabel('Ann√©e')
        plt.ylabel('Prix Moyen')
        plt.title('Prix Moyen par Ann√©e')
        plt.grid(True, alpha=0.3)
        save_figure('price_by_year.png')

    # 5. Fuel type distribution
    if 'fuel' in df.columns:
        plt.figure(figsize=(8, 8))
        df['fuel'].value_counts().plot(kind='pie', autopct='%1.1f%%')
        plt.title('Distribution du Type de Carburant')
        # Drop the automatic y-label that the pie plot would otherwise show.
        plt.ylabel('')
        save_figure('fuel_type_distribution.png')

    print(f"\nüìÅ Tous les graphiques sont dans le dossier '{output_dir}/'")

def main():
    """Run the whole exploratory analysis: load the data, then each report."""
    separator = "="*50
    print("\n" + separator)
    print("üöó ANALYSE EXPLORATOIRE - CarPriceML")
    print(separator)

    # Load the dataset once; every step below works on the same DataFrame.
    df = load_data()

    # Analysis steps, executed in this order.
    analysis_steps = (
        basic_info,
        statistical_summary,
        categorical_analysis,
        correlation_analysis,
        visualizations,
    )
    for step in analysis_steps:
        step(df)

    print("\n" + separator)
    print("‚úÖ ANALYSE TERMIN√âE")
    print(separator)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


üöó ANALYSE EXPLORATOIRE - CarPriceML
üìÇ Chargement des donn√©es...
‚úÖ Donn√©es charg√©es : 6926 lignes, 16 colonnes
                           name  company  model           edition  year  \
0        Maruti Swift Dzire VDI   Maruti  Swift         Dzire VDI  2014   
1  Skoda Rapid 1.5 TDI Ambition    Skoda  Rapid  1.5 TDI Ambition  2014   
2      Honda City 2017-2020 EXi    Honda   City     2017-2020 EXi  2006   
3     Hyundai i20 Sportz Diesel  Hyundai    i20     Sportz Diesel  2010   
4        Maruti Swift VXI BSIII   Maruti  Swift         VXI BSIII  2007   

    owner    fuel seller_type transmission  km_driven  mileage_mpg  engine_cc  \
0   First  Diesel  Individual       Manual     145500        55.00     1248.0   
1  Second  Diesel  Individual       Manual     120000        49.70     1498.0   
2   Third  Petrol  Individual       Manual     140000        41.60     1497.0   
3   First  Diesel  Individual       Manual     127000        54.06     1396.0   
4   First  Petrol  Ind