In [None]:
! pip install -r requirements.txt

In [None]:
import os
from pathlib import Path
from typing import List, Optional

import duckdb

In [None]:
# 1. Configuration
# Name of the DuckDB database passed to duckdb.connect() below.
DB_NAME = "amazing.duckdb"
# Folder that get_file_paths() scans for *.csv files.
data_folder = Path("./data")

# Notebook-wide connection; every later cell reuses it and the last cell closes it.
con = duckdb.connect(DB_NAME)

In [None]:
def get_file_paths(folder: Optional[Path] = None) -> List[str]:
    """Return the absolute paths of the CSV files in a folder, sorted by path.

    Args:
        folder: Directory to scan (non-recursively). When omitted, the
            notebook-level ``data_folder`` is used, which keeps existing
            zero-argument calls working unchanged.

    Returns:
        A sorted list of resolved path strings, one per ``*.csv`` file.
        Directories whose name happens to end in ``.csv`` are skipped because
        they cannot be loaded as CSV files.
    """
    base = data_folder if folder is None else Path(folder)
    return sorted(str(p.resolve()) for p in base.glob("*.csv") if p.is_file())

def load_new_files(con, file_paths: List[str]):
    """Append every not-yet-loaded CSV file to ``all_events``.

    A file is identified by its base name: if that name is already present in
    ``loaded_files`` the file is skipped, otherwise its rows are inserted into
    ``all_events`` and its name is recorded in ``loaded_files``.

    Both inserts for a file run inside one transaction, so a failure part-way
    through (bad CSV, interrupted kernel, ...) leaves neither the rows nor the
    bookkeeping entry behind and the file can simply be retried.

    Args:
        con: Open DuckDB connection; both tables must already exist.
        file_paths: Paths of the CSV files to consider, processed in order.

    Raises:
        Exception: Whatever ``con.execute`` raises for a failing file is
            re-raised after the transaction has been rolled back.
    """
    total = len(file_paths)
    print(f"Début du chargement de {total} fichier(s)...\n")
    for i, path in enumerate(file_paths, 1):
        filename = os.path.basename(path)

        print(f"[{i}/{total}] Vérification de {filename}...")
        already_loaded = con.execute(
            "SELECT 1 FROM loaded_files WHERE filename = ?", [filename]
        ).fetchone()

        if already_loaded:
            print(f"{filename} déjà chargé. Ignoré.\n")
            continue

        print(f"⬆Chargement de {filename} dans all_events...")
        # The path is embedded in a SQL string literal, so single quotes must
        # be doubled; otherwise a path such as "it's.csv" breaks the statement.
        sql_path = path.replace("'", "''")

        con.execute("BEGIN TRANSACTION")
        try:
            con.execute(f"""
                INSERT INTO all_events
                SELECT * FROM read_csv_auto('{sql_path}', AUTO_DETECT=TRUE, SAMPLE_SIZE=-1)
            """)
            con.execute("INSERT INTO loaded_files VALUES (?)", [filename])
        except Exception:
            # Undo the partial load so a retry does not duplicate rows.
            con.execute("ROLLBACK")
            raise
        else:
            con.execute("COMMIT")
        print(f"{filename} ajouté avec succès à la base.\n")

    print("Chargement terminé.\n")

def init_loaded_table(con):
    """Ensure the ``loaded_files`` bookkeeping table exists.

    The table has a single ``filename`` primary-key column; the statement uses
    ``IF NOT EXISTS``, so calling this again on an existing table changes nothing.

    Args:
        con: Connection object on which the DDL statement is executed.
    """
    print("Initialisation de la table 'loaded_files'...")
    ddl = """
        CREATE TABLE IF NOT EXISTS loaded_files (
            filename TEXT PRIMARY KEY
        );
    """
    con.execute(ddl)
    print("Table 'loaded_files' prête.\n")

def create_all_events_table(con):
    """Ensure the ``all_events`` table exists.

    The statement uses ``IF NOT EXISTS``, so an existing table is left
    untouched. Columns: ``event_time`` (TIMESTAMP), ``price`` (DOUBLE) and
    TEXT columns for the event type, product, category, brand, user and
    session identifiers.

    Args:
        con: Connection object on which the DDL statement is executed.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS all_events (
            event_time TIMESTAMP,
            event_type TEXT,
            product_id TEXT,
            category_id TEXT,
            category_code TEXT,
            brand TEXT,
            price DOUBLE,
            user_id TEXT,
            user_session TEXT
        );
    """
    con.execute(ddl)


In [None]:

# Create the bookkeeping and event tables (both statements use IF NOT EXISTS,
# so re-running this cell is safe).
init_loaded_table(con)
create_all_events_table(con)

# Load every CSV from the data folder that is not yet recorded in loaded_files.
files = get_file_paths()
load_new_files(con, files)


In [None]:
# Total number of rows in all_events. Despite its name, nb_users holds a row
# (log) count, not a count of distinct users.
(nb_users,) = con.execute("SELECT COUNT(*) FROM all_events").fetchone()

print(f"Taille de la table all_events : {nb_users} logs")

# Preview of at most 10 events stored for one hard-coded user id.
df_purchase = con.execute(
    "SELECT * FROM all_events WHERE user_id = '535135317' LIMIT 10"
).fetch_df()
print(df_purchase)


In [11]:
# Close the DuckDB connection opened in the configuration cell.
con.close()