In [16]:
! pip install -r requirements.txt



In [17]:
import os
from pathlib import Path
from typing import List, Optional

import duckdb

In [18]:
# 1. Configuration
# Both paths below are relative, so they depend on the notebook's working directory.
# Database file handed to duckdb.connect() below.
DB_NAME = "amazing.duckdb"
# Folder that get_file_paths() scans for "*.csv" files.
data_folder = Path("./data")

# Connection passed to the table-creation, loading and query code in the following cells.
con = duckdb.connect(DB_NAME)

In [19]:
def get_file_paths(folder: Optional[Path] = None, pattern: str = "*.csv") -> List[str]:
    """Return the absolute paths of the data files to ingest, sorted lexicographically.

    Args:
        folder: Directory to scan. When omitted, the module-level ``data_folder``
            is used, so existing zero-argument calls behave as before.
        pattern: Glob pattern selecting the files; defaults to ``"*.csv"``.

    Returns:
        Resolved (absolute) paths as strings in ascending order. The list is
        empty when nothing matches.
    """
    root = data_folder if folder is None else Path(folder)
    # Keep regular files only: a directory whose name happens to match the
    # pattern cannot be read as a CSV file.
    return sorted(str(p.resolve()) for p in root.glob(pattern) if p.is_file())

def load_new_files(con, file_paths: List[str]):
    """Append every not-yet-ingested CSV file to ``all_events``.

    A file is identified by its base name: if that name is already present in
    ``loaded_files`` the file is skipped, otherwise its rows are inserted into
    ``all_events`` and its name is recorded in ``loaded_files``.

    The two inserts run inside one transaction so that a failure (or a kernel
    interrupt) cannot leave rows in ``all_events`` without the matching
    ``loaded_files`` entry, which would duplicate the data on the next run.

    Args:
        con: Open DuckDB connection; both tables must already exist.
        file_paths: Paths of the CSV files to consider, processed in order.

    Raises:
        Any exception raised while loading a file is re-raised after the
        transaction for that file has been rolled back. Files committed
        earlier in the same call stay loaded.
    """
    total = len(file_paths)
    print(f"Début du chargement de {total} fichier(s)...\n")
    for i, path in enumerate(file_paths, 1):
        filename = os.path.basename(path)

        print(f"[{i}/{total}] Vérification de {filename}...")
        already_loaded = con.execute(
            "SELECT 1 FROM loaded_files WHERE filename = ?", [filename]
        ).fetchone()

        if already_loaded:
            print(f"{filename} déjà chargé. Ignoré.\n")
            continue

        print(f"⬆Chargement de {filename} dans all_events...")
        # The path is embedded in a SQL string literal: double any single quote
        # so a path such as "it's.csv" cannot terminate the literal early.
        quoted_path = str(path).replace("'", "''")
        con.execute("BEGIN TRANSACTION")
        try:
            # NOTE: SELECT * inserts by position, so the CSV columns must be in
            # the same order as the columns of all_events.
            con.execute(f"""
                INSERT INTO all_events
                SELECT * FROM read_csv_auto('{quoted_path}', AUTO_DETECT=TRUE, SAMPLE_SIZE=-1)
            """)
            con.execute("INSERT INTO loaded_files VALUES (?)", [filename])
        except BaseException:
            # BaseException so that a kernel interrupt (KeyboardInterrupt) also
            # rolls back the partial load; the error is always re-raised.
            con.execute("ROLLBACK")
            raise
        con.execute("COMMIT")
        print(f"{filename} ajouté avec succès à la base.\n")

    print("Chargement terminé.\n")

def init_loaded_table(con):
    """Create the ``loaded_files`` bookkeeping table if it does not exist yet.

    The table has a single ``filename`` column declared as primary key.
    Progress messages are printed before and after the statement runs.

    Args:
        con: Connection object exposing ``execute(sql)``.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS loaded_files (
            filename TEXT PRIMARY KEY
        );
    """
    print("Initialisation de la table 'loaded_files'...")
    con.execute(ddl)
    print("Table 'loaded_files' prête.\n")

def create_all_events_table(con):
    """Create the ``all_events`` table if it does not exist yet.

    Columns: ``event_time`` (TIMESTAMP), ``price`` (DOUBLE) and seven TEXT
    columns (``event_type``, ``product_id``, ``category_id``,
    ``category_code``, ``brand``, ``user_id``, ``user_session``).

    Args:
        con: Connection object exposing ``execute(sql)``.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS all_events (
            event_time TIMESTAMP,
            event_type TEXT,
            product_id TEXT,
            category_id TEXT,
            category_code TEXT,
            brand TEXT,
            price DOUBLE,
            user_id TEXT,
            user_session TEXT
        );
    """
    con.execute(ddl)


In [20]:

# Table initialisation (both helpers issue CREATE TABLE IF NOT EXISTS)
init_loaded_table(con)
create_all_events_table(con)

# Load the CSV files found by get_file_paths(); load_new_files skips any file
# whose base name is already recorded in loaded_files.
files = get_file_paths()
load_new_files(con, files)

# Test: generate the all_events table a second time. Because of the
# IF NOT EXISTS clause this call does not recreate the existing table.
create_all_events_table(con)


Initialisation de la table 'loaded_files'...
Table 'loaded_files' prête.

Début du chargement de 1 fichier(s)...

[1/1] Vérification de 2019-Oct.csv...
⬆Chargement de 2019-Oct.csv dans all_events...
2019-Oct.csv ajouté avec succès à la base.

Chargement terminé.



In [21]:
# NOTE: despite its name, nb_users holds the total number of rows in
# all_events (COUNT(*)), not a number of distinct users.
nb_users = con.execute("SELECT COUNT(*) FROM all_events").fetchone()[0]

print(f"Taille de la table all_events : {nb_users} logs")

# Sample of at most 10 events for one hard-coded user_id.
# NOTE(review): the query does not filter on event_type, so df_purchase holds
# every kind of event for that user, not only purchases. There is no ORDER BY,
# so presumably the rows returned are not guaranteed to be stable — confirm.
df_purchase = con.execute("SELECT * FROM all_events WHERE user_id = '535135317' LIMIT 10").fetch_df()
print(df_purchase)


Taille de la table all_events : 42448764 logs
           event_time event_type product_id          category_id  \
0 2019-10-02 09:37:59       view    1801761  2053013554415534427   
1 2019-10-02 09:38:24       view    1801881  2053013554415534427   
2 2019-10-02 09:39:09       view    1801881  2053013554415534427   
3 2019-10-02 09:40:12       view    1801881  2053013554415534427   
4 2019-10-02 09:42:53       view    1004659  2053013555631882655   
5 2019-10-02 09:46:19       view   26400629  2053013563651392361   
6 2019-10-02 09:53:27       view   12600013  2053013554751078769   
7 2019-10-02 09:54:17   purchase   12600013  2053013554751078769   
8 2019-10-02 09:54:45       view   12600013  2053013554751078769   

              category_code    brand    price    user_id  \
0      electronics.video.tv    artel  1714.07  535135317   
1      electronics.video.tv  samsung   506.27  535135317   
2      electronics.video.tv  samsung   506.27  535135317   
3      electronics.video.tv  sams

In [22]:
# Close the DuckDB connection
con.close()