In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_parquet('../data/parquet/full_2020.csv.parquet', engine='pyarrow')


In [4]:
df.head()

Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_suffixe,adresse_nom_voie,adresse_code_voie,code_postal,code_commune,nom_commune,code_departement,ancien_code_commune,ancien_nom_commune,id_parcelle,ancien_id_parcelle,numero_volume,lot1_numero,lot1_surface_carrez,lot2_numero,lot2_surface_carrez,lot3_numero,lot3_surface_carrez,lot4_numero,lot4_surface_carrez,lot5_numero,lot5_surface_carrez,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,code_nature_culture,nature_culture,code_nature_culture_speciale,nature_culture_speciale,surface_terrain,longitude,latitude
0,2020-1,2020-01-07,1,Vente,8000.0,,,FORTUNAT,B063,1250.0,1072,Ceyzériat,1,,,01072000AK0216,,,,,,,,,,,,,0,,,,,T,terres,,,1061.0,5.323532,46.171941
1,2020-2,2020-01-02,1,Vente,2175.0,,,TERRES DES CINQ SAULES,B124,1290.0,1203,Laiz,1,,,012030000B0004,,,,,,,,,,,,,0,,,,,BT,taillis simples,,,85.0,4.893454,46.251858
2,2020-2,2020-01-02,1,Vente,2175.0,,,BOIS DU CHAMP RION,B006,1290.0,1203,Laiz,1,,,012030000B0173,,,,,,,,,,,,,0,,,,,T,terres,,,1115.0,4.90021,46.235277
3,2020-2,2020-01-02,1,Vente,2175.0,,,EN COROBERT,B025,1290.0,1203,Laiz,1,,,012030000B0477,,,,,,,,,,,,,0,,,,,T,terres,,,1940.0,4.882112,46.246554
4,2020-2,2020-01-02,1,Vente,2175.0,,,TERRES DES CINQ SAULES,B124,1290.0,1203,Laiz,1,,,012030000C0068,,,,,,,,,,,,,0,,,,,T,terres,,,1148.0,4.894481,46.251841


In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3522416 entries, 0 to 3522415
Data columns (total 40 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   id_mutation                   object 
 1   date_mutation                 object 
 2   numero_disposition            int64  
 3   nature_mutation               object 
 4   valeur_fonciere               float64
 5   adresse_numero                float64
 6   adresse_suffixe               object 
 7   adresse_nom_voie              object 
 8   adresse_code_voie             object 
 9   code_postal                   float64
 10  code_commune                  object 
 11  nom_commune                   object 
 12  code_departement              object 
 13  ancien_code_commune           float64
 14  ancien_nom_commune            object 
 15  id_parcelle                   object 
 16  ancien_id_parcelle            object 
 17  numero_volume                 object 
 18  lot1_numero           

: 

In [3]:
def optimize_dataframe(df, parse_dates=None, category_thresh=0.05, verbose=True):
    """
    Optimise les types d'un DataFrame pour réduire l'utilisation mémoire :
    - convertit les objets en catégories si nombre de modalités faible
    - convertit les float64 en float32
    - convertit les int64 en int32
    - convertit les colonnes de dates

    Parameters:
    - df : DataFrame à optimiser
    - parse_dates : liste de colonnes à parser comme dates
    - category_thresh : seuil max de ratio modalité/nb lignes pour transformer en 'category'
    - verbose : affiche la mémoire gagnée

    Returns:
    - df optimisé
    """

    initial_memory = df.memory_usage(deep=True).sum() / 1024**2

    # Dates
    if parse_dates:
        for col in parse_dates:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # Float → float32
    float_cols = df.select_dtypes(include=['float64']).columns
    for col in float_cols:
        df[col] = df[col].astype('float32')

    # Int → int32
    int_cols = df.select_dtypes(include=['int64']).columns
    for col in int_cols:
        if df[col].isnull().any():
            df[col] = df[col].astype('Int32')
        else:
            df[col] = df[col].astype('int32')

    # Object → category si peu de modalités
    for col in df.select_dtypes(include='object').columns:
        if df[col].nunique(dropna=False) / len(df) <= category_thresh:
            df[col] = df[col].astype('category')

    final_memory = df.memory_usage(deep=True).sum() / 1024**2

    if verbose:
        print(f"💾 Mémoire utilisée : {initial_memory:.2f} Mo → {final_memory:.2f} Mo ({100 * (1 - final_memory/initial_memory):.1f}% gagné)")

    return df

# Application test sur ech_annonces_ventes_68.csv
optimized_df1 = optimize_dataframe(df, parse_dates=['date_mutation'], verbose=True)
optimized_df1.info()


💾 Mémoire utilisée : 3899.09 Mo → 1085.58 Mo (72.2% gagné)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3522416 entries, 0 to 3522415
Data columns (total 40 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   id_mutation                   object        
 1   date_mutation                 datetime64[ns]
 2   numero_disposition            int32         
 3   nature_mutation               category      
 4   valeur_fonciere               float32       
 5   adresse_numero                float32       
 6   adresse_suffixe               category      
 7   adresse_nom_voie              object        
 8   adresse_code_voie             category      
 9   code_postal                   float32       
 10  code_commune                  category      
 11  nom_commune                   category      
 12  code_departement              category      
 13  ancien_code_commune           float32       
 14  ancien_nom_commune     

In [8]:
display(df.head())

Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_suffixe,adresse_nom_voie,adresse_code_voie,code_postal,code_commune,nom_commune,code_departement,ancien_code_commune,ancien_nom_commune,id_parcelle,ancien_id_parcelle,numero_volume,lot1_numero,lot1_surface_carrez,lot2_numero,lot2_surface_carrez,lot3_numero,lot3_surface_carrez,lot4_numero,lot4_surface_carrez,lot5_numero,lot5_surface_carrez,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,code_nature_culture,nature_culture,code_nature_culture_speciale,nature_culture_speciale,surface_terrain,longitude,latitude
0,2020-1,2020-01-07,1,Vente,8000.0,,,FORTUNAT,B063,1250.0,1072,Ceyzériat,1,,,01072000AK0216,,,,,,,,,,,,,0,,,,,T,terres,,,1061.0,5.323532,46.17194
1,2020-2,2020-01-02,1,Vente,2175.0,,,TERRES DES CINQ SAULES,B124,1290.0,1203,Laiz,1,,,012030000B0004,,,,,,,,,,,,,0,,,,,BT,taillis simples,,,85.0,4.893454,46.251858
2,2020-2,2020-01-02,1,Vente,2175.0,,,BOIS DU CHAMP RION,B006,1290.0,1203,Laiz,1,,,012030000B0173,,,,,,,,,,,,,0,,,,,T,terres,,,1115.0,4.90021,46.235275
3,2020-2,2020-01-02,1,Vente,2175.0,,,EN COROBERT,B025,1290.0,1203,Laiz,1,,,012030000B0477,,,,,,,,,,,,,0,,,,,T,terres,,,1940.0,4.882112,46.246555
4,2020-2,2020-01-02,1,Vente,2175.0,,,TERRES DES CINQ SAULES,B124,1290.0,1203,Laiz,1,,,012030000C0068,,,,,,,,,,,,,0,,,,,T,terres,,,1148.0,4.894481,46.251842


In [12]:
optimized_df2 = optimized_df1.copy()
for col in optimized_df2.select_dtypes(include='category'):
    optimized_df2[col] = optimized_df2[col].astype(str)

optimized_df2.to_parquet('../data/raw/optimized_2020.parquet', index=False)

In [2]:
df2 = pd.read_parquet('../data/raw/optimized_2020.parquet')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3522416 entries, 0 to 3522415
Data columns (total 40 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   id_mutation                   object        
 1   date_mutation                 datetime64[ns]
 2   numero_disposition            int32         
 3   nature_mutation               object        
 4   valeur_fonciere               float32       
 5   adresse_numero                float32       
 6   adresse_suffixe               object        
 7   adresse_nom_voie              object        
 8   adresse_code_voie             object        
 9   code_postal                   float32       
 10  code_commune                  object        
 11  nom_commune                   object        
 12  code_departement              object        
 13  ancien_code_commune           float32       
 14  ancien_nom_commune            object        
 15  id_parcelle                   ob