LECTURE DES CSV

In [15]:
# --- Import des bibliothèques ---
import pandas as pd
from pathlib import Path

# --- Définition du chemin vers les données brutes (data/raw) ---
# --- Vérification de l’existence du dossier contenant les fichiers sources ---
DATA_DIR = Path.cwd().parent / "data" / "raw"
DATA_DIR.exists(), DATA_DIR


(True,
 WindowsPath('c:/Data/Semaine-9 février 2026/olist-data-cleaning/data/raw'))

In [16]:
# --- Inventaire des fichiers sources (CSV --> "raw") ---
csv_files = sorted(DATA_DIR.glob("*.csv"))
[len(csv_files), [f.name for f in csv_files]]

[9,
 ['olist_customers_dataset.csv',
  'olist_geolocation_dataset.csv',
  'olist_order_items_dataset.csv',
  'olist_order_payments_dataset.csv',
  'olist_order_reviews_dataset.csv',
  'olist_orders_dataset.csv',
  'olist_products_dataset.csv',
  'olist_sellers_dataset.csv',
  'product_category_name_translation.csv']]

--- TABLE "CENTRALE" --> ORDERS --- 

Inspection rapide du dataset
Objectif : comprendre la structure brute, sans interprétation métier

In [17]:
# --- Aperçu des premières lignes du dataframe ORDERS [Méthode head()]---
orders = pd.read_csv(DATA_DIR /"olist_orders_dataset.csv")
orders.head(3).transpose()


Unnamed: 0,0,1,2
order_id,e481f51cbdc54678b7cc49136f2d6af7,53cdb2fc8bc7dce0b6741e2150273451,47770eb9100c2d0c44946d9cf07ec65d
customer_id,9ef432eb6251297304e76186b10a928d,b0830fb4747a6c6d20dea0b8c802d7ef,41ce2a54c0b03bf3443c3d931a367089
order_status,delivered,delivered,delivered
order_purchase_timestamp,2017-10-02 10:56:33,2018-07-24 20:41:37,2018-08-08 08:38:49
order_approved_at,2017-10-02 11:07:15,2018-07-26 03:24:27,2018-08-08 08:55:23
order_delivered_carrier_date,2017-10-04 19:55:00,2018-07-26 14:31:00,2018-08-08 13:50:00
order_delivered_customer_date,2017-10-10 21:25:13,2018-08-07 15:27:45,2018-08-17 18:06:29
order_estimated_delivery_date,2017-10-18 00:00:00,2018-08-13 00:00:00,2018-09-04 00:00:00


In [18]:
# --- Affichage des dimensions [Attribut shape] ---
orders.shape

(99441, 8)

In [19]:
# --- Affichage de la liste des noms de colonne [Attribut columns] ---
orders.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'],
      dtype='object')

In [20]:
# --- Vérification du type des données ---
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB


In [21]:
# --- Identification des colonnes avec valeurs manquantes ---
orders.isna().sum().sort_values(ascending=False).head(10)

order_delivered_customer_date    2965
order_delivered_carrier_date     1783
order_approved_at                 160
order_id                            0
order_purchase_timestamp            0
order_status                        0
customer_id                         0
order_estimated_delivery_date       0
dtype: int64

In [22]:
# --- Identification des colonnes à forte cardinalité (nombreuses valeurs uniques) ---
orders.nunique().sort_values(ascending=False).head(10)

order_id                         99441
customer_id                      99441
order_purchase_timestamp         98875
order_delivered_customer_date    95664
order_approved_at                90733
order_delivered_carrier_date     81018
order_estimated_delivery_date      459
order_status                         8
dtype: int64

EXPLORATION DU DATASET

In [23]:
# --- Chargement de tous les CSV dans un dictionnaire dfs ---
import pandas as pd

dfs = {}

for f in csv_files:
    name = f.stem.replace("olist_", "").replace("_dataset", "")
    dfs[name] = pd.read_csv(f)

list(dfs.keys())


['customers',
 'geolocation',
 'order_items',
 'order_payments',
 'order_reviews',
 'orders',
 'products',
 'sellers',
 'product_category_name_translation']

In [24]:
# --- Dimensions de chaque table ---
{name: df.shape for name, df in dfs.items()}

{'customers': (99441, 5),
 'geolocation': (1000163, 5),
 'order_items': (112650, 7),
 'order_payments': (103886, 5),
 'order_reviews': (99224, 7),
 'orders': (99441, 8),
 'products': (32951, 9),
 'sellers': (3095, 4),
 'product_category_name_translation': (71, 2)}

In [25]:
# --- Construction d’un tableau récapitulatif des datasets / Vue d’ensemble de la structure des tables ---

# - nombre de lignes et de colonnes
# - volume total de valeurs manquantes
# - répartition des types de données


summary_rows = []

for name, df in dfs.items():
    dtypes = df.dtypes.astype(str).value_counts().to_dict()
    summary_rows.append({
        "table": name,
        "rows": df.shape[0],
        "cols": df.shape[1],
        "null_cells": int(df.isna().sum().sum()),
        "dtypes_counts": dtypes,
    })

summary = pd.DataFrame(summary_rows).sort_values("table").reset_index(drop=True)
summary

Unnamed: 0,table,rows,cols,null_cells,dtypes_counts
0,customers,99441,5,0,"{'object': 4, 'int64': 1}"
1,geolocation,1000163,5,0,"{'float64': 2, 'object': 2, 'int64': 1}"
2,order_items,112650,7,0,"{'object': 4, 'float64': 2, 'int64': 1}"
3,order_payments,103886,5,0,"{'object': 2, 'int64': 2, 'float64': 1}"
4,order_reviews,99224,7,145903,"{'object': 6, 'int64': 1}"
5,orders,99441,8,4908,{'object': 8}
6,product_category_name_translation,71,2,0,{'object': 2}
7,products,32951,9,2448,"{'float64': 7, 'object': 2}"
8,sellers,3095,4,0,"{'object': 3, 'int64': 1}"


In [26]:
# --- Structure détaillée de chaque dataset : ---

# - dimensions (lignes, colonnes)
# - liste des colonnes et types 

for name, df in dfs.items():
    print("\n" + "="*80)
    print(f"{name.upper()}  |  shape={df.shape}")
    print("- Columns & dtypes:")
    print(df.dtypes)


CUSTOMERS  |  shape=(99441, 5)
- Columns & dtypes:
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

GEOLOCATION  |  shape=(1000163, 5)
- Columns & dtypes:
geolocation_zip_code_prefix      int64
geolocation_lat                float64
geolocation_lng                float64
geolocation_city                object
geolocation_state               object
dtype: object

ORDER_ITEMS  |  shape=(112650, 7)
- Columns & dtypes:
order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object

ORDER_PAYMENTS  |  shape=(103886, 5)
- Columns & dtypes:
order_id                 object
payment_sequential        int64
payment_type             object
payment_installments      int64
payment_value         

In [27]:
# --- Détection automatique des colonnes pouvant contenir des dates --- 
# --- Objectif : repérer les colonnes à convertir plus tard en format datetime ---

date_candidates = {}

for name, df in dfs.items():
    cols = [c for c in df.columns if "date" in c.lower() or "timestamp" in c.lower()]
    date_candidates[name] = cols

date_candidates

{'customers': [],
 'geolocation': [],
 'order_items': ['shipping_limit_date'],
 'order_payments': [],
 'order_reviews': ['review_creation_date', 'review_answer_timestamp'],
 'orders': ['order_purchase_timestamp',
  'order_delivered_carrier_date',
  'order_delivered_customer_date',
  'order_estimated_delivery_date'],
 'products': [],
 'sellers': [],
 'product_category_name_translation': []}

CONTRÔLE DE LA QUALITÉ DES DATASETS

In [28]:
# --- Contrôle d’unicité des clés primaires / Vérification de l’absence de doublons ---

primary_keys = {
    "customers": ["customer_id"],
    "orders": ["order_id"],
    "order_items": ["order_id", "order_item_id"],
    "products": ["product_id"],
    "sellers": ["seller_id"],
    "order_reviews": ["review_id"],
}

pk_checks = []

for table, cols in primary_keys.items():
    df = dfs[table]
    duplicated = df.duplicated(subset=cols).sum()
    pk_checks.append({
        "table": table,
        "pk": cols,
        "duplicates": duplicated
    })

pd.DataFrame(pk_checks)


Unnamed: 0,table,pk,duplicates
0,customers,[customer_id],0
1,orders,[order_id],0
2,order_items,"[order_id, order_item_id]",0
3,products,[product_id],0
4,sellers,[seller_id],0
5,order_reviews,[review_id],814


In [29]:
# --- Vérification des valeurs nulles sur colonnes critiques ---
critical_columns = {
    "customers": ["customer_id", "customer_unique_id"],
    "orders": ["order_id", "customer_id"],
    "order_items": ["order_id", "product_id", "seller_id"],
    "products": ["product_id"],
    "sellers": ["seller_id"],
}

null_checks = []

for table, cols in critical_columns.items():
    df = dfs[table]
    for col in cols:
        null_checks.append({
            "table": table,
            "column": col,
            "null_count": int(df[col].isna().sum())
        })

pd.DataFrame(null_checks)


Unnamed: 0,table,column,null_count
0,customers,customer_id,0
1,customers,customer_unique_id,0
2,orders,order_id,0
3,orders,customer_id,0
4,order_items,order_id,0
5,order_items,product_id,0
6,order_items,seller_id,0
7,products,product_id,0
8,sellers,seller_id,0


--- Vérification de la cohérence des jointures (existence des FK) ---

In [30]:
# --- orders → customers ---
orders_customers_missing = (
    dfs["orders"]["customer_id"]
    .isin(dfs["customers"]["customer_id"])
    .value_counts()
)

orders_customers_missing

customer_id
True    99441
Name: count, dtype: int64

In [31]:
# --- order_items → orders ---
order_items_orders_missing = (
    dfs["order_items"]["order_id"]
    .isin(dfs["orders"]["order_id"])
    .value_counts()
)

order_items_orders_missing

order_id
True    112650
Name: count, dtype: int64

In [32]:
# --- order_items → products ---
order_items_products_missing = (
    dfs["order_items"]["product_id"]
    .isin(dfs["products"]["product_id"])
    .value_counts()
)

order_items_products_missing

product_id
True    112650
Name: count, dtype: int64

In [33]:
# --- Vérification des doublons existants (hors PK) ---
duplicate_rows = {
    table: int(df.duplicated().sum())
    for table, df in dfs.items()
}

duplicate_rows

{'customers': 0,
 'geolocation': 261831,
 'order_items': 0,
 'order_payments': 0,
 'order_reviews': 0,
 'orders': 0,
 'products': 0,
 'sellers': 0,
 'product_category_name_translation': 0}

In [34]:
# --- Initialisation d’un journal d’observations pour documenter les problèmes de qualité détectés dans les données ---

quality_observations = []

def observe(table, observation):
    quality_observations.append({
        "table": table,
        "observation": observation
    })