In [14]:
pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [15]:
import duckdb
import os
from pathlib import Path
import pandas as pd
import numpy as np
from typing import List
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [16]:
# Configuration
DB_NAME = "amazing.duckdb"
data_folder = Path("./data")

con = duckdb.connect(DB_NAME)

In [17]:
def get_file_paths() -> List[str]:
    return sorted([str(p.resolve()) for p in data_folder.glob("*.csv")])

In [18]:
def load_new_files(con, file_paths: List[str]):
    print(f"Début du chargement de {len(file_paths)} fichier(s)...\n")
    for i, path in enumerate(file_paths, 1):
        filename = os.path.basename(path)

        print(f"[{i}/{len(file_paths)}] Vérification de {filename}...")
        already_loaded = con.execute(
            "SELECT 1 FROM loaded_files WHERE filename = ?", [filename]
        ).fetchone()

        if already_loaded:
            print(f"{filename} déjà chargé. Ignoré.\n")
            continue

        print(f"⬆Chargement de {filename} dans all_events...")
        con.execute(f"""
            INSERT INTO all_eventsp
            SELECT * FROM read_csv_auto('{path}', AUTO_DETECT=TRUE, SAMPLE_SIZE=-1)
        """)
        con.execute("INSERT INTO loaded_files VALUES (?)", [filename])
        print(f"{filename} ajouté avec succès à la base.\n")

    print("Chargement terminé.\n")

In [19]:
def init_loaded_table(con):
    print("📁 Initialisation de la table 'loaded_files'...")
    con.execute("""
        CREATE TABLE IF NOT EXISTS loaded_files (
            filename TEXT PRIMARY KEY
        );
    """)
    print("✅ Table 'loaded_files' prête.\n")

In [20]:
def create_all_events_table(con):
    con.execute("""
        CREATE TABLE IF NOT EXISTS all_events (
            event_time TIMESTAMP,
            event_type TEXT,
            product_id TEXT,
            category_id TEXT,
            category_code TEXT,
            brand TEXT,
            price DOUBLE,
            user_id TEXT,
            user_session TEXT
        );
    """)

In [21]:
def diagnostic(con):
    print(con.execute("SELECT COUNT(*) FROM all_events").fetchall())
    print(con.execute("SELECT * FROM all_events LIMIT 5").fetchdf())

In [22]:
def generate_user_features(con):
    print("Génération des features users...")
    con.execute("""
        CREATE OR REPLACE TABLE user_features AS
        SELECT
            user_id,
            COUNT(*) AS nb_events,
            COUNT(DISTINCT user_session) AS nb_sessions,

            -- Détail des types d'événements
            COUNT(CASE WHEN event_type = 'view' THEN 1 END) AS nb_view,
            COUNT(CASE WHEN event_type = 'cart' THEN 1 END) AS nb_cart,
            COUNT(CASE WHEN event_type = 'remove_from_cart' THEN 1 END) AS nb_remove,
            COUNT(CASE WHEN event_type = 'purchase' THEN 1 END) AS nb_achats,

            SUM(CASE WHEN event_type = 'purchase' THEN price ELSE 0 END) AS total_depense,
            AVG(CASE WHEN event_type = 'purchase' THEN price ELSE NULL END) AS avg_price,
            COUNT(DISTINCT category_code) AS nb_categories_visitees,
            MODE() WITHIN GROUP (ORDER BY category_code) AS category_top

        FROM (
            SELECT * FROM all_events
            WHERE user_id IS NOT NULL
            LIMIT 100000
        )
        GROUP BY user_id;
    """)
    print(" Table user_features créée")

In [27]:

# 2. Initialisation des tables AVANT d’y accéder
init_loaded_table(con)
create_all_events_table(con)

# 3. Chargement des fichiers
files = get_file_paths()
load_new_files(con, files)

create_all_events_table(con)
diagnostic(con)
generate_user_features(con)

📁 Initialisation de la table 'loaded_files'...
✅ Table 'loaded_files' prête.

Début du chargement de 7 fichier(s)...

[1/7] Vérification de 2019-Dec.csv...
2019-Dec.csv déjà chargé. Ignoré.

[2/7] Vérification de 2019-Nov.csv...
2019-Nov.csv déjà chargé. Ignoré.

[3/7] Vérification de 2019-Oct.csv...
2019-Oct.csv déjà chargé. Ignoré.

[4/7] Vérification de 2020-Apr.csv...
2020-Apr.csv déjà chargé. Ignoré.

[5/7] Vérification de 2020-Feb.csv...
2020-Feb.csv déjà chargé. Ignoré.

[6/7] Vérification de 2020-Jan.csv...
2020-Jan.csv déjà chargé. Ignoré.

[7/7] Vérification de 2020-Mar.csv...
2020-Mar.csv déjà chargé. Ignoré.

Chargement terminé.

[(411709736,)]
           event_time event_type product_id          category_id  \
0 2019-12-01 00:00:00       view    1005105  2232732093077520756   
1 2019-12-01 00:00:00       view   22700068  2232732091643068746   
2 2019-12-01 00:00:01       view    2402273  2232732100769874463   
3 2019-12-01 00:00:02   purchase   26400248  205301355305657984

In [28]:
nb_users = con.execute("SELECT COUNT(*) FROM all_events").fetchone()[0]

print(f"Taille de la table user_features : {nb_users} events")


Taille de la table user_features : 411709736 events


In [29]:
df_acheteurs = con.execute("""
    SELECT * FROM user_features WHERE nb_achats > 0
""").fetchdf()

df_curieux = con.execute("""
    SELECT * FROM user_features
    WHERE nb_view > 0 AND nb_achats = 0
""").fetchdf()


In [30]:
df_user_features = con.execute("SELECT * FROM user_features").fetchdf()
df_filtered = df_user_features.query("nb_events >= 5 and nb_achats > 0").copy()
features = df_filtered.select_dtypes(include=["number"]).fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df_filtered["cluster"] = kmeans.fit_predict(X_scaled)
df_filtered[["user_id", "cluster"]].head()


Unnamed: 0,user_id,cluster
44,579976256,1
88,579982842,3
204,528451755,1
216,513539969,1
243,577914229,1


In [31]:
con.execute("DESCRIBE user_features").fetchdf()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,user_id,VARCHAR,YES,,,
1,nb_events,BIGINT,YES,,,
2,nb_sessions,BIGINT,YES,,,
3,nb_view,BIGINT,YES,,,
4,nb_cart,BIGINT,YES,,,
5,nb_remove,BIGINT,YES,,,
6,nb_achats,BIGINT,YES,,,
7,total_depense,DOUBLE,YES,,,
8,avg_price,DOUBLE,YES,,,
9,nb_categories_visitees,BIGINT,YES,,,


In [32]:
df_cat_visits = con.execute("""
    SELECT
        user_id,
        category_code,
        COUNT(*) AS nb_visites
    FROM all_events
    WHERE event_type = 'view'
      AND category_code IS NOT NULL
    GROUP BY user_id, category_code
    ORDER BY user_id, nb_visites DESC
""").fetchdf()

In [33]:
print("Aperçu des acheteurs :")
print(df_acheteurs.head())

print("Aperçu des visiteurs :")
print(df_curieux.head())

Aperçu des acheteurs :
     user_id  nb_events  nb_sessions  nb_view  nb_cart  nb_remove  nb_achats  \
0  579976256         15            2       11        3          0          1   
1  573380565          4            2        2        1          0          1   
2  579982842         20            1       15        3          0          2   
3  562265691          3            1        1        1          0          1   
4  528451755         13            2       11        1          0          1   

   total_depense  avg_price  nb_categories_visitees  \
0         405.84     405.84                       3   
1         867.43     867.43                       1   
2          72.02      36.01                       7   
3         131.25     131.25                       1   
4         110.17     110.17                       3   

                   category_top  
0  appliances.personal.massager  
1      construction.tools.light  
2                 apparel.shirt  
3      construction.tools.lig