In [None]:
pip install -r requirements.txt

In [4]:
import duckdb
import os
from pathlib import Path
import pandas as pd
import numpy as np
from typing import List
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Configuration
DB_NAME = "amazing.duckdb"
data_folder = Path("./data")

con = duckdb.connect(DB_NAME)

In [6]:
def get_file_paths() -> List[str]:
    return sorted([str(p.resolve()) for p in data_folder.glob("*.csv")])

In [7]:
def load_new_files(con, file_paths: List[str]):
    for path in file_paths:
        filename = os.path.basename(path)
        already_loaded = con.execute("SELECT 1 FROM loaded_files WHERE filename = ?", [filename]).fetchone()
        if already_loaded:
            continue
        con.execute(f"""
            INSERT INTO all_events
            SELECT * FROM read_csv_auto('{path}', AUTO_DETECT=TRUE, SAMPLE_SIZE=-1)
        """)
        con.execute("INSERT INTO loaded_files VALUES (?)", [filename])

In [8]:
def init_loaded_table(con):
    con.execute("""
        CREATE TABLE IF NOT EXISTS loaded_files (
            filename TEXT PRIMARY KEY
        );
    """)

In [9]:
def create_all_events_table(con):
    con.execute("""
        CREATE TABLE IF NOT EXISTS all_events (
            event_time TIMESTAMP,
            event_type TEXT,
            product_id TEXT,
            category_id TEXT,
            category_code TEXT,
            brand TEXT,
            price DOUBLE,
            user_id TEXT,
            user_session TEXT
        );
    """)

In [10]:
def diagnostic(con):
    print(con.execute("SELECT COUNT(*) FROM all_events").fetchall())
    print(con.execute("SELECT * FROM all_events LIMIT 5").fetchdf())

In [11]:
def generate_user_features(con):
    print("Génération des features users...")
    con.execute("""
        CREATE OR REPLACE TABLE user_features AS
        SELECT
            user_id,
            COUNT(*) AS nb_events,
            COUNT(DISTINCT user_session) AS nb_sessions,

            -- Détail des types d'événements
            COUNT(CASE WHEN event_type = 'view' THEN 1 END) AS nb_view,
            COUNT(CASE WHEN event_type = 'cart' THEN 1 END) AS nb_cart,
            COUNT(CASE WHEN event_type = 'remove_from_cart' THEN 1 END) AS nb_remove,
            COUNT(CASE WHEN event_type = 'purchase' THEN 1 END) AS nb_achats,

            SUM(CASE WHEN event_type = 'purchase' THEN price ELSE 0 END) AS total_depense,
            AVG(CASE WHEN event_type = 'purchase' THEN price ELSE NULL END) AS avg_price,
            COUNT(DISTINCT category_code) AS nb_categories_visitees,
            MODE() WITHIN GROUP (ORDER BY category_code) AS category_top

        FROM (
            SELECT * FROM all_events
            WHERE user_id IS NOT NULL
            LIMIT 100000
        )
        GROUP BY user_id;
    """)
    print(" Table user_features créée")

In [12]:
nb_users = con.execute("SELECT COUNT(*) FROM all_events").fetchone()[0]
print(f"Taille de la table user_features : {nb_users} utilisateurs")

Taille de la table user_features : 411709736 utilisateurs


In [13]:
df_acheteurs = con.execute("""
    SELECT * FROM user_features WHERE nb_achats > 0
""").fetchdf()

df_curieux = con.execute("""
    SELECT * FROM user_features
    WHERE nb_view > 0 AND nb_achats = 0
""").fetchdf()


In [14]:
df_user_features = con.execute("SELECT * FROM user_features").fetchdf()
df_filtered = df_user_features.query("nb_events >= 5 and nb_achats > 0").copy()
features = df_filtered.select_dtypes(include=["number"]).fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df_filtered["cluster"] = kmeans.fit_predict(X_scaled)
df_filtered[["user_id", "cluster"]].head()


Unnamed: 0,user_id,cluster
31,520175111,3
46,515384420,4
111,512607090,1
120,545340283,1
127,521005401,1


In [15]:
con.execute("DESCRIBE user_features").fetchdf()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,user_id,VARCHAR,YES,,,
1,nb_events,BIGINT,YES,,,
2,nb_sessions,BIGINT,YES,,,
3,nb_view,BIGINT,YES,,,
4,nb_cart,BIGINT,YES,,,
5,nb_remove,BIGINT,YES,,,
6,nb_achats,BIGINT,YES,,,
7,total_depense,DOUBLE,YES,,,
8,avg_price,DOUBLE,YES,,,
9,nb_categories_visitees,BIGINT,YES,,,


In [16]:
df_cat_visits = con.execute("""
    SELECT
        user_id,
        category_code,
        COUNT(*) AS nb_visites
    FROM all_events
    WHERE event_type = 'view'
      AND category_code IS NOT NULL
    GROUP BY user_id, category_code
    ORDER BY user_id, nb_visites DESC
""").fetchdf()

In [18]:
print("👛 Aperçu des acheteurs :")
print(df_acheteurs.head())

print("Aperçu des visiteurs :")
print(df_curieux.head())

👛 Aperçu des acheteurs :
     user_id  nb_events  nb_sessions  nb_view  nb_cart  nb_remove  nb_achats  \
0  520175111         20            3       19        0          0          1   
1  555461983          4            3        3        0          0          1   
2  515384420         10            1        4        5          0          1   
3  524102815          2            1        1        0          0          1   
4  512607090          7            1        6        0          0          1   

   total_depense  avg_price  nb_categories_visitees            category_top  
0          38.35      38.35                       2      electronics.clocks  
1          61.48      61.48                       0                    None  
2        1747.79    1747.79                       1  electronics.smartphone  
3         223.43     223.43                       1  electronics.smartphone  
4         134.42     134.42                       1  electronics.smartphone  
Aperçu des visiteurs :
   