# Start analysis

## Imports

In [5]:
from pathlib import Path
import os
import csv
import pandas as pd
import datetime
import sqlite3
import logging


## Locate the data folder and CSV file

In [None]:
# Locate the source CSV under a relative data folder.
# ./data is tried first, then ../data (for when the notebook runs from inside ./notebook).
cwd = Path.cwd()
data_dir_candidates = [cwd / "data", cwd.parent / "data"]  # supports both project root and notebook/ as CWD
# Only an actual directory counts as a hit; if neither exists, default to ./data and create it
# so the error message below points at a folder the user can drop the file into.
data_dir = next((p for p in data_dir_candidates if p.is_dir()), data_dir_candidates[0])
data_dir.mkdir(parents=True, exist_ok=True)

expected_csv_name = "Retail_supply_chain - Retails Order Full Dataset.csv"
csv_path = data_dir / expected_csv_name

if csv_path.is_file():
    print(f"OK: CSV found -> {csv_path}")
else:
    # Fallback: look for a similarly named CSV anywhere under the chosen data_dir.
    # rglob yields matches in filesystem order, so sort them to make the pick reproducible.
    candidates = sorted(data_dir.rglob("Retail_supply_chain*Full*Dataset*.csv"))
    if candidates:
        csv_path = candidates[0]
        print(f"Notice: expected file not found, using -> {csv_path}")
    else:
        raise FileNotFoundError(
            f"CSV not found. Expected at: {csv_path}\n"
            f"Place the file in the 'data/' folder at project root."
        )



OK: CSV found -> c:\Users\pc\Desktop\PROJECTS\Supply_chain_analysis\data\Retail_supply_chain - Retails Order Full Dataset.csv


In [6]:
# Make sure a SQLite database file is present in the relative data folder (./data, else ../data).

cwd = Path.cwd()
data_dir_candidates = [cwd / "data", cwd.parent / "data"]
# Take the first candidate that already exists; with none present, default to ./data (created next).
data_dir = next(filter(Path.exists, data_dir_candidates), data_dir_candidates[0])
data_dir.mkdir(parents=True, exist_ok=True)

db_path = data_dir.joinpath("Retail_supply_chain.db")
# Nothing is written here: the connection is opened and closed again straight away.
conn = sqlite3.connect(db_path.as_posix())
conn.close()

print(f"OK: DB created -> {db_path}  | Exists: {db_path.exists()}")


OK: DB created -> c:\Users\pc\Desktop\PROJECTS\Supply_chain_analysis\data\Retail_supply_chain.db  | Exists: True


In [8]:
# Helpers
def to_iso_date(s: str, formats=("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%d-%m-%Y", "%m-%d-%Y")):
    """Normalize a date string to ISO 8601 (YYYY-MM-DD).

    Parameters
    ----------
    s : value to convert; it is passed through ``str()`` and stripped first.
    formats : ``strptime`` patterns tried in order; the first that matches wins.
        With the default order, day-first is tried before month-first, so an
        ambiguous value such as "03/04/2015" is read as 3 April 2015. Pass a
        narrower tuple (e.g. ``("%m/%d/%Y",)``) when the source format is known.

    Returns
    -------
    ``None`` for ``None`` or blank input; the ISO date string when a pattern
    matches; otherwise the stripped input unchanged (so unparseable values are
    kept rather than dropped).
    """
    if s is None:
        return None
    s = str(s).strip()
    if not s:
        return None
    for fmt in formats:
        try:
            return datetime.datetime.strptime(s, fmt).date().isoformat()
        except ValueError:
            # strptime signals "pattern does not match" with ValueError; try the next pattern.
            continue
    return s  # keep as-is if unparseable but non-empty

def parse_float(s):
    """Parse a loosely formatted number into a float.

    Handles an optional euro sign, percent sign and (non-breaking) spaces, and
    both European ("1.234,56") and US ("1,234.56") separator conventions.

    Separator rules:
    - both ',' and '.' present: whichever appears last is the decimal mark,
      the other one is a thousands separator;
    - the same separator repeated ("1,234,567" / "1.234.567"): it can only be
      a thousands separator, so it is removed;
    - a single ',' on its own: treated as the decimal mark ("261,96" -> 261.96,
      and therefore "1,000" -> 1.0);
    - a single '.' on its own: standard decimal point.

    Returns ``None`` for ``None``, blank input, or text that is not a number.
    """
    if s is None:
        return None
    s = str(s).strip()
    if not s:
        return None
    # Remove currency/percent and spaces (incl. non-breaking)
    for ch in ["€", "%", "\u00A0", " "]:
        s = s.replace(ch, "")
    # Normalize thousands/decimal separators
    if "," in s and "." in s:
        if s.rfind(",") > s.rfind("."):
            # European style: '.' groups thousands, ',' is decimal (1.234,56)
            s = s.replace(".", "").replace(",", ".")
        else:
            # US style: ',' groups thousands, '.' is decimal (1,234.56)
            s = s.replace(",", "")
    elif s.count(",") > 1:
        # Repeated commas cannot be decimal marks (1,234,567)
        s = s.replace(",", "")
    elif s.count(".") > 1:
        # Repeated dots cannot be decimal marks (1.234.567)
        s = s.replace(".", "")
    elif "," in s:
        # Assume a lone ',' is decimal (e.g., 261,96)
        s = s.replace(",", ".")
    try:
        return float(s)
    except ValueError:
        return None  # fallback: treat as missing

def parse_int(s):
    """Convert a raw field to an int by rounding whatever parse_float makes of it.

    All separator and symbol handling is delegated to parse_float, so a value is
    interpreted exactly as parse_float interprets it before being rounded.
    Returns None when parse_float yields None or when the rounded value cannot
    be turned into an int.
    """
    number = parse_float(s)
    try:
        return None if number is None else int(round(number))
    except Exception:
        # Values that cannot be rounded to an int are treated as missing.
        return None

# Paths (relative-safe: supports running from project root or ./notebook)
cwd = Path.cwd()
data_dir_candidates = [cwd / "data", cwd.parent / "data"]
# Only an actual directory counts as a hit; otherwise fall back to ./data.
data_dir = next((p for p in data_dir_candidates if p.is_dir()), data_dir_candidates[0])
db_path = data_dir / "Retail_supply_chain.db"
expected_csv_name = "Retail_supply_chain - Retails Order Full Dataset.csv"
csv_path = data_dir / expected_csv_name
if not csv_path.is_file():
    # Fallback: similarly named CSV anywhere under data_dir. rglob yields matches in
    # filesystem order, so sort them to make the pick reproducible across machines.
    candidates = sorted(data_dir.rglob("Retail_supply_chain*Full*Dataset*.csv"))
    if not candidates:
        raise FileNotFoundError(f"CSV not found at {csv_path}; place the file in the 'data/' folder.")
    csv_path = candidates[0]

# Schema
# DDL script for the staging table `stg_orders`, one column per CSV field.
# The table is dropped and recreated on every run, so previous contents are discarded.
# Dates are stored as TEXT; sales/discount/profit as REAL; row_id/quantity/postal_code as INTEGER.
# NOTE(review): postal_code as INTEGER cannot retain leading zeros — confirm that is acceptable.
# Four secondary indexes are created on order_date, state, region and (category, sub_category).
schema_sql = """
DROP TABLE IF EXISTS stg_orders;
CREATE TABLE stg_orders (
    row_id               INTEGER,
    order_id             TEXT,
    order_date           TEXT,   -- ISO 8601 YYYY-MM-DD
    ship_date            TEXT,   -- ISO 8601 YYYY-MM-DD
    ship_mode            TEXT,
    customer_id          TEXT,
    customer_name        TEXT,
    segment              TEXT,
    country              TEXT,
    city                 TEXT,
    state                TEXT,
    postal_code          INTEGER,
    region               TEXT,
    retail_sales_people  TEXT,
    product_id           TEXT,
    category             TEXT,
    sub_category         TEXT,
    product_name         TEXT,
    returned             TEXT,
    sales                REAL,
    quantity             INTEGER,
    discount             REAL,
    profit               REAL
);
CREATE INDEX IF NOT EXISTS idx_stg_orders_order_date ON stg_orders(order_date);
CREATE INDEX IF NOT EXISTS idx_stg_orders_state ON stg_orders(state);
CREATE INDEX IF NOT EXISTS idx_stg_orders_region ON stg_orders(region);
CREATE INDEX IF NOT EXISTS idx_stg_orders_category ON stg_orders(category, sub_category);
"""

# Load
# Recreate stg_orders from schema_sql, then bulk-insert every CSV row after per-column conversion.

def _clean_text(value):
    """Return the stripped text of a CSV field, or None when it is missing or blank."""
    return (value or "").strip() or None

# (CSV header, converter) pairs, listed in the exact column order of insert_sql below.
csv_columns = [
    ("Row ID", parse_int),
    ("Order ID", _clean_text),
    ("Order Date", to_iso_date),
    ("Ship Date", to_iso_date),
    ("Ship Mode", _clean_text),
    ("Customer ID", _clean_text),
    ("Customer Name", _clean_text),
    ("Segment", _clean_text),
    ("Country", _clean_text),
    ("City", _clean_text),
    ("State", _clean_text),
    ("Postal Code", parse_int),
    ("Region", _clean_text),
    ("Retail Sales People", _clean_text),
    ("Product ID", _clean_text),
    ("Category", _clean_text),
    ("Sub-Category", _clean_text),
    ("Product Name", _clean_text),
    ("Returned", _clean_text),
    ("Sales", parse_float),
    ("Quantity", parse_int),
    ("Discount", parse_float),
    ("Profit", parse_float),
]

insert_sql = """
INSERT INTO stg_orders (
    row_id, order_id, order_date, ship_date, ship_mode, customer_id, customer_name,
    segment, country, city, state, postal_code, region, retail_sales_people,
    product_id, category, sub_category, product_name, returned, sales, quantity, discount, profit
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

conn = sqlite3.connect(db_path.as_posix())
try:
    cur = conn.cursor()
    cur.executescript(schema_sql)
    conn.commit()

    # utf-8-sig drops a leading BOM if present; newline="" lets the csv module handle line endings.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        # A header mismatch (renamed column, wrong delimiter) would otherwise load as
        # silent all-NULL columns, so surface it before inserting anything.
        present_headers = set(reader.fieldnames or [])
        missing_headers = [header for header, _convert in csv_columns if header not in present_headers]
        if missing_headers:
            print(f"Warning: CSV columns missing (loaded as NULL) -> {missing_headers}")
        rows = [
            tuple(convert(r.get(header)) for header, convert in csv_columns)
            for r in reader
        ]

    cur.executemany(insert_sql, rows)
    conn.commit()

    total_rows = cur.execute("SELECT COUNT(*) FROM stg_orders;").fetchone()[0]
    print(f"OK: loaded {total_rows} rows into stg_orders")
finally:
    # Close the handle even when reading or inserting fails, so the DB file is not left open.
    conn.close()



OK: loaded 9994 rows into stg_orders


In [10]:
# Phase 2: build Rsc_cleaned from stg_orders (deduplicate -> normalize -> materialize -> index),
# then print a few QA figures about the result.

# Resolve DB path (supports running from project root or ./notebook)
cwd = Path.cwd()
data_dir_candidates = [cwd / "data", cwd.parent / "data"]
data_dir = next((p for p in data_dir_candidates if p.is_dir()), data_dir_candidates[0])
db_path = data_dir / "Retail_supply_chain.db"

# Cleaning / standardization SQL
cleaning_sql = """
-- 1) Deduplicate on row_id (keep most recent by order_date; fallback by rowid).
--    Rows whose row_id is NULL are not duplicates of each other: the second
--    partition key gives each of them its own partition so none is dropped.
DROP TABLE IF EXISTS stg_orders_dedup;
CREATE TABLE stg_orders_dedup AS
WITH ranked AS (
  SELECT
    s.*,
    ROW_NUMBER() OVER (
      PARTITION BY s.row_id, CASE WHEN s.row_id IS NULL THEN s.rowid END
      ORDER BY s.order_date DESC, s.rowid ASC
    ) AS rn
  FROM stg_orders s
)
SELECT
  row_id, order_id, order_date, ship_date, ship_mode, customer_id, customer_name,
  segment, country, city, state, postal_code, region, retail_sales_people,
  product_id, category, sub_category, product_name, returned, sales, quantity, discount, profit
FROM ranked
WHERE rn = 1;

CREATE INDEX IF NOT EXISTS idx_dedup_order_date ON stg_orders_dedup(order_date);
CREATE INDEX IF NOT EXISTS idx_dedup_region_state ON stg_orders_dedup(region, state);

-- 2) Normalization view (text trimming, casing, returned standardization, numeric sanitization)
DROP VIEW IF EXISTS clean_orders;
CREATE VIEW clean_orders AS
WITH base AS (
  SELECT
    CAST(row_id AS INTEGER)                  AS row_id,
    TRIM(order_id)                           AS order_id,
    TRIM(order_date)                         AS order_date,   -- ISO TEXT yyyy-mm-dd
    TRIM(ship_date)                          AS ship_date,    -- ISO TEXT yyyy-mm-dd
    TRIM(ship_mode)                          AS ship_mode,
    TRIM(customer_id)                        AS customer_id,
    TRIM(segment)                            AS segment,
    UPPER(TRIM(country))                     AS country,
    TRIM(city)                               AS city,
    TRIM(state)                              AS state,
    CAST(postal_code AS INTEGER)             AS postal_code,
    UPPER(TRIM(region))                      AS region,
    TRIM(product_id)                         AS product_id,
    UPPER(TRIM(category))                    AS category,
    UPPER(TRIM(sub_category))                AS sub_category,
    TRIM(product_name)                       AS product_name,
    CASE
      WHEN returned IS NULL OR TRIM(returned) = '' THEN 'NO'
      WHEN UPPER(TRIM(returned)) IN ('Y','YES','TRUE','T','1','RETURNED') THEN 'YES'
      WHEN UPPER(TRIM(returned)) IN ('N','NO','FALSE','F','0','NOT RETURNED','NONE','NOT') THEN 'NO'
      ELSE UPPER(TRIM(returned))
    END                                      AS returned,
    CAST(sales    AS REAL)                   AS sales,
    CASE WHEN CAST(quantity AS REAL) < 0 THEN 0 ELSE CAST(quantity AS INTEGER) END AS quantity,
    CASE
      WHEN discount IS NULL THEN NULL
      WHEN discount > 1.0 AND discount <= 100.0 THEN ROUND(discount/100.0, 4)
      WHEN discount < 0.0 THEN 0.0
      ELSE CAST(discount AS REAL)
    END                                      AS discount,
    CAST(profit   AS REAL)                   AS profit
  FROM stg_orders_dedup
)
SELECT
  row_id, order_id, order_date, ship_date, ship_mode, customer_id, segment, country,
  city, state, postal_code, region, product_id, category, sub_category, product_name,
  returned, sales, quantity, discount, profit
FROM base;

-- 3) Materialize for BI consumption as Rsc_cleaned (PII excluded)
DROP TABLE IF EXISTS Rsc_cleaned;
CREATE TABLE Rsc_cleaned AS
SELECT * FROM clean_orders;

-- 4) Indexes on Rsc_cleaned
CREATE UNIQUE INDEX IF NOT EXISTS ux_rsc_row_id ON Rsc_cleaned(row_id);
CREATE INDEX IF NOT EXISTS idx_rsc_order_date ON Rsc_cleaned(order_date);
CREATE INDEX IF NOT EXISTS idx_rsc_region_state ON Rsc_cleaned(region, state);
CREATE INDEX IF NOT EXISTS idx_rsc_category_sub ON Rsc_cleaned(category, sub_category);
"""

# sqlite3's `with conn:` only commits or rolls back; it does not close the connection.
# One connection is therefore opened for both the script and the QA queries and closed explicitly.
conn = sqlite3.connect(db_path.as_posix())
try:
    # Execute cleaning
    with conn:
        conn.executescript(cleaning_sql)
    print("OK: cleaning executed -> table 'Rsc_cleaned' created/refreshed.")

    # QA checks (concise)
    cur = conn.cursor()
    # Row counts at each stage: raw staging, after dedup, final cleaned table.
    counts = cur.execute("""
        SELECT
          (SELECT COUNT(*) FROM stg_orders),
          (SELECT COUNT(*) FROM stg_orders_dedup),
          (SELECT COUNT(*) FROM Rsc_cleaned)
    """).fetchone()
    # total vs distinct row_id: equal numbers mean row_id is unique.
    uniq = cur.execute("""
        SELECT COUNT(*) AS total, COUNT(DISTINCT row_id) AS distinct_row_id
        FROM Rsc_cleaned
    """).fetchone()
    # Distribution of the standardized returned flag.
    returned = cur.execute("""
        SELECT returned, COUNT(*) FROM Rsc_cleaned
        GROUP BY returned ORDER BY 2 DESC
    """).fetchall()
    # Rows still outside the expected ranges after sanitization (discount in 0..1, quantity >= 0).
    num_sanity = cur.execute("""
        SELECT
          SUM(CASE WHEN discount < 0 OR discount > 1 THEN 1 ELSE 0 END),
          SUM(CASE WHEN quantity < 0 THEN 1 ELSE 0 END)
        FROM Rsc_cleaned
    """).fetchone()
    date_range = cur.execute("""
        SELECT MIN(order_date), MAX(order_date) FROM Rsc_cleaned
    """).fetchone()
finally:
    conn.close()

print("Counts (stg_raw, stg_dedup, rsc_cleaned):", counts)
print("Row ID uniqueness (total, distinct):", uniq)
print("Returned distribution:", returned)
print("Numeric sanity (bad_discount, bad_qty):", num_sanity)
print("Order date range (min, max):", date_range)


OK: cleaning executed -> table 'Rsc_cleaned' created/refreshed.
Counts (stg_raw, stg_dedup, rsc_cleaned): (9994, 9994, 9994)
Row ID uniqueness (total, distinct): (9994, 9994)
Returned distribution: [('NO', 9194), ('YES', 800)]
Numeric sanity (bad_discount, bad_qty): (0, 0)
Order date range (min, max): ('2014-01-02', '2017-12-30')


In [11]:
# Build dimensional views/tables from Rsc_cleaned for downstream KPI queries (SQLite),
# then print a few QA figures about the model.

# Resolve DB path (supports running from project root or ./notebook)
cwd = Path.cwd()
data_dir_candidates = [cwd / "data", cwd.parent / "data"]
data_dir = next((p for p in data_dir_candidates if p.is_dir()), data_dir_candidates[0])
db_path = data_dir / "Retail_supply_chain.db"

model_sql = """
-- =========================
-- Star-lite modeling (SQLite)
-- Source: Rsc_cleaned
-- Creates: dim_date, dim_geo, dim_product (VIEWs) and fct_sales (TABLE)
-- =========================

-- 0) Safety drops
DROP VIEW IF EXISTS dim_date;
DROP VIEW IF EXISTS dim_geo;
DROP VIEW IF EXISTS dim_product;
DROP TABLE IF EXISTS fct_sales;

-- 1) Date dimension (derived from order_date)
CREATE VIEW dim_date AS
WITH base AS (
  SELECT DISTINCT order_date
  FROM Rsc_cleaned
  WHERE order_date IS NOT NULL
)
SELECT
  order_date                                  AS date_key,             -- TEXT YYYY-MM-DD
  SUBSTR(order_date, 1, 4)                    AS year,
  SUBSTR(order_date, 6, 2)                    AS month,
  SUBSTR(order_date, 9, 2)                    AS day,
  (SUBSTR(order_date, 1, 4) || '-' || SUBSTR(order_date, 6, 2)) AS year_month,
  CASE SUBSTR(order_date, 6, 2)
    WHEN '01' THEN 'Q1' WHEN '02' THEN 'Q1' WHEN '03' THEN 'Q1'
    WHEN '04' THEN 'Q2' WHEN '05' THEN 'Q2' WHEN '06' THEN 'Q2'
    WHEN '07' THEN 'Q3' WHEN '08' THEN 'Q3' WHEN '09' THEN 'Q3'
    ELSE 'Q4'
  END AS quarter
FROM base;

-- 2) Geography dimension (Region/State/City)
CREATE VIEW dim_geo AS
SELECT DISTINCT
  UPPER(TRIM(region)) AS region,
  TRIM(state)         AS state,
  TRIM(city)          AS city
FROM Rsc_cleaned
WHERE region IS NOT NULL
  AND state  IS NOT NULL;

-- 3) Product dimension (Category/Sub-Category/Product)
--    Exactly one row per product_id: fct_sales joins on product_id alone, so a
--    product_id that appears with several names/categories must not yield
--    several dimension rows (that would multiply fact rows in the join).
--    MIN() picks one value per attribute deterministically.
CREATE VIEW dim_product AS
SELECT
  TRIM(product_id)               AS product_id,
  MIN(UPPER(TRIM(category)))     AS category,
  MIN(UPPER(TRIM(sub_category))) AS sub_category,
  MIN(TRIM(product_name))        AS product_name
FROM Rsc_cleaned
WHERE product_id IS NOT NULL
GROUP BY TRIM(product_id);

-- 4) Fact table (one row per order-product)
CREATE TABLE fct_sales AS
SELECT
  r.row_id                       AS row_id,         -- surrogate PK (unique)
  r.order_id                     AS order_id,
  r.order_date                   AS date_key,       -- join to dim_date.date_key
  r.region                       AS region,         -- join to dim_geo.region
  r.state                        AS state,          -- join to dim_geo.state
  r.city                         AS city,           -- optional join on city
  r.product_id                   AS product_id,     -- join to dim_product.product_id
  r.category                     AS category,       -- redundant for ease in BI
  r.sub_category                 AS sub_category,   -- redundant for ease in BI
  r.ship_mode                    AS ship_mode,
  r.returned                     AS returned,       -- 'YES'/'NO'
  r.sales                        AS sales,
  r.quantity                     AS quantity,
  r.discount                     AS discount,       -- 0..1
  r.profit                       AS profit
FROM Rsc_cleaned r;

-- 5) Indexes for BI performance
CREATE UNIQUE INDEX IF NOT EXISTS ux_fct_row_id ON fct_sales(row_id);
CREATE INDEX IF NOT EXISTS idx_fct_date ON fct_sales(date_key);
CREATE INDEX IF NOT EXISTS idx_fct_region_state ON fct_sales(region, state);
CREATE INDEX IF NOT EXISTS idx_fct_product ON fct_sales(product_id);
CREATE INDEX IF NOT EXISTS idx_fct_category_sub ON fct_sales(category, sub_category);

-- 6) Integrity checks are run from Python after this script.
"""

# sqlite3's `with conn:` only commits or rolls back; it does not close the connection.
# One connection is therefore opened for both the script and the QA queries and closed explicitly.
conn = sqlite3.connect(db_path.as_posix())
try:
    # Execute modeling
    with conn:
        conn.executescript(model_sql)
    print("OK: modeling created (dim_date, dim_geo, dim_product, fct_sales).")

    # Quick QA
    cur = conn.cursor()
    cnts = cur.execute("""
        SELECT
          (SELECT COUNT(*) FROM Rsc_cleaned),
          (SELECT COUNT(*) FROM fct_sales)
    """).fetchone()
    geo_keys = cur.execute("""
        SELECT
          (SELECT COUNT(DISTINCT region) FROM dim_geo),
          (SELECT COUNT(DISTINCT state)  FROM dim_geo),
          (SELECT COUNT(DISTINCT city)   FROM dim_geo)
    """).fetchone()
    prod_keys = cur.execute("""
        SELECT
          (SELECT COUNT(DISTINCT product_id)  FROM dim_product),
          (SELECT COUNT(DISTINCT category)    FROM dim_product),
          (SELECT COUNT(DISTINCT sub_category) FROM dim_product)
    """).fetchone()
    # Number of dim_product rows in excess of one per product_id; anything above 0
    # means joins on product_id would duplicate fact rows.
    prod_dupes = cur.execute("""
        SELECT COUNT(*) - COUNT(DISTINCT product_id) FROM dim_product
    """).fetchone()[0]
    date_minmax = cur.execute("""
        SELECT MIN(date_key), MAX(date_key) FROM dim_date
    """).fetchone()
    null_checks = cur.execute("""
        SELECT
          SUM(CASE WHEN date_key IS NULL THEN 1 ELSE 0 END),
          SUM(CASE WHEN region   IS NULL THEN 1 ELSE 0 END),
          SUM(CASE WHEN state    IS NULL THEN 1 ELSE 0 END),
          SUM(CASE WHEN product_id IS NULL THEN 1 ELSE 0 END)
        FROM fct_sales
    """).fetchone()
finally:
    conn.close()

print("Counts (Rsc_cleaned, fct_sales):", cnts)               # expect equal (row-per-row)
print("dim_geo distinct (region, state, city):", geo_keys)    # sanity for mapping
print("dim_product distinct (product_id, category, subcat):", prod_keys)
print("dim_product duplicate product_id rows:", prod_dupes)   # expect 0 (join key must be unique)
print("dim_date range (min, max):", date_minmax)
print("fct_sales NULLs (date_key, region, state, product_id):", null_checks)


OK: modeling created (dim_date, dim_geo, dim_product, fct_sales).
Counts (Rsc_cleaned, fct_sales): (9994, 9994)
dim_geo distinct (region, state, city): (4, 49, 531)
dim_product distinct (product_id, category, subcat): (1862, 3, 17)
dim_date range (min, max): ('2014-01-02', '2017-12-30')
fct_sales NULLs (date_key, region, state, product_id): (0, 0, 0, 0)
