<a href="https://colab.research.google.com/github/silvia-denanni/DI-Bootcamp-nov25/blob/main/Mini_project_1_Brazillian_E_commerce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import os, pandas as pd
from sqlalchemy import create_engine, inspect
from pathlib import Path

# --- Configuration -----------------------------------------------------------
DATA_DIR = "/content/olist_manual"   # where the uploaded CSVs are stored
DB_PATH  = "/content/olist.sqlite"   # SQLite file built from the CSVs
os.makedirs(DATA_DIR, exist_ok=True)

# Single source of truth: CSV file name -> SQLite table name.
# The list of required files is derived from it so the two can never drift.
mapping = {
    "olist_customers_dataset.csv": "olist_customers",
    "olist_sellers_dataset.csv": "olist_sellers",
    "olist_order_reviews_dataset.csv": "olist_order_reviews",
    "olist_order_items_dataset.csv": "olist_order_items",
    "olist_products_dataset.csv": "olist_products",
    "olist_geolocation_dataset.csv": "olist_geolocation",
    "product_category_name_translation.csv": "product_category_name_translation",
    "olist_orders_dataset.csv": "olist_orders",
    "olist_order_payments_dataset.csv": "olist_order_payments",
}
required = list(mapping)


def _missing_csvs():
    """Return the required CSV file names that are not present in DATA_DIR."""
    return [fn for fn in required if not (Path(DATA_DIR) / fn).exists()]


# --- Upload (only when needed) -----------------------------------------------
# Prompting unconditionally makes "Restart & Run All" block on an interactive
# widget even when every file is already in DATA_DIR, so only ask for an upload
# when something is actually missing.
uploads = {}
missing = _missing_csvs()
if missing:
    print("Missing CSVs, please upload:", missing)
    uploads = files.upload()  # multi-select the 9 CSVs on your computer
    for name, data in uploads.items():
        with open(os.path.join(DATA_DIR, name), "wb") as f:
            f.write(data)
    missing = _missing_csvs()

# Verify that every expected file is now available before touching the database.
assert not missing, f"Missing files: {missing}"

# --- Build the SQLite database -----------------------------------------------
engine = create_engine(f"sqlite:///{DB_PATH}", echo=False)

# if_exists="replace" makes this cell idempotent: re-running rebuilds each table
# from its CSV instead of appending duplicate rows.
# NOTE(review): pandas infers the *_zip_code_prefix columns as integers, so any
# leading zeros are presumably lost (e.g. 9790 in the preview) - confirm whether
# these should be read with dtype=str.
for fname, tname in mapping.items():
    pd.read_csv(Path(DATA_DIR) / fname).to_sql(
        tname, con=engine, if_exists="replace", index=False
    )

print("Tables:", inspect(engine).get_table_names())

# Sanity check: preview the first rows of one table (rendered as the cell output).
pd.read_sql_query("SELECT * FROM olist_customers LIMIT 5;", con=engine)


Saving olist_customers_dataset.csv to olist_customers_dataset.csv
Saving olist_geolocation_dataset.csv to olist_geolocation_dataset.csv
Saving olist_order_items_dataset.csv to olist_order_items_dataset.csv
Saving olist_order_payments_dataset.csv to olist_order_payments_dataset.csv
Saving olist_order_reviews_dataset.csv to olist_order_reviews_dataset.csv
Saving olist_orders_dataset.csv to olist_orders_dataset.csv
Saving olist_products_dataset.csv to olist_products_dataset.csv
Saving olist_sellers_dataset.csv to olist_sellers_dataset.csv
Saving product_category_name_translation.csv to product_category_name_translation.csv
Tables: ['olist_customers', 'olist_geolocation', 'olist_order_items', 'olist_order_payments', 'olist_order_reviews', 'olist_orders', 'olist_products', 'olist_sellers', 'product_category_name_translation']


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [None]:
# Q1 - Share of orders purchased in January 2018 that received a 5-star review.
#
# The inner query produces one row per order with a 0/1 flag; the outer query
# aggregates the flags. Orders without any review are kept (LEFT JOIN) and count
# as "not 5-star" in the denominator.
q1 = '''
SELECT
    SUM(has_five_star)                                          AS orders_with_review_5,
    COUNT(*)                                                    AS total_orders,
    -- 100.0 forces floating-point division; NULLIF avoids dividing by zero
    -- when no order falls inside the date window.
    ROUND(100.0 * SUM(has_five_star) / NULLIF(COUNT(*), 0), 2)  AS percentage_5_star
FROM (
    SELECT
        o.order_id,
        -- An order may have several reviews: the flag is 1 if any of them
        -- scored 5, and 0 otherwise (including orders with no review at all).
        MAX(CASE WHEN r.review_score = 5 THEN 1 ELSE 0 END) AS has_five_star
    FROM olist_orders o
    LEFT JOIN olist_order_reviews r
           ON r.order_id = o.order_id
    WHERE date(o.order_purchase_timestamp) >= date('2018-01-01')
      AND date(o.order_purchase_timestamp) <  date('2018-02-01')
    GROUP BY o.order_id
) per_order;
'''
df_q1 = pd.read_sql_query(q1, con=engine)
df_q1  # last expression -> rich DataFrame display

   orders_with_review_5  total_orders  percentage_5_star
0                  4077          7269              56.09


In [None]:
# Q2 - Year-over-year growth in orders and in unique customers.
#
# Two fixes compared with the naive version:
#   * customer_id in olist_orders behaves as a per-order key (counting it gives
#     exactly the number of orders), so distinct customers are counted through
#     olist_customers.customer_unique_id instead.
#   * The percentage uses 100.0 so SQLite performs floating-point division;
#     integer / integer would truncate the result before ROUND sees it.
#
# Caveat: yearly totals are compared as-is, so a year that is only partially
# covered by the data will distort the growth figures.
q2 = """
WITH yearly AS (
    SELECT
        strftime('%Y', o.order_purchase_timestamp) AS yr,
        -- DISTINCT keeps the order count correct even if the join fans out.
        COUNT(DISTINCT o.order_id)                 AS orders,
        COUNT(DISTINCT c.customer_unique_id)       AS unique_customers
    FROM olist_orders o
    LEFT JOIN olist_customers c
           ON c.customer_id = o.customer_id
    GROUP BY strftime('%Y', o.order_purchase_timestamp)
),
with_previous AS (
    SELECT
        yr,
        orders,
        unique_customers,
        LAG(orders)           OVER (ORDER BY yr) AS prev_orders,
        LAG(unique_customers) OVER (ORDER BY yr) AS prev_customers
    FROM yearly
)
SELECT
    yr,
    orders,
    unique_customers,

    /* NULLIF(value, 0) turns the divisor into NULL if it is zero, so the
       division yields NULL instead of failing. The first year has no previous
       year, so its growth columns are NULL as well. */

    ROUND(100.0 * (orders - prev_orders)
          / NULLIF(prev_orders, 0), 2)              AS orders_yoy_pct,
    ROUND(100.0 * (unique_customers - prev_customers)
          / NULLIF(prev_customers, 0), 2)           AS customers_yoy_pct
FROM with_previous
ORDER BY yr;
"""
df_q2 = pd.read_sql_query(q2, engine)
df_q2


Unnamed: 0,yr,orders,unique_customers,orders_yoy_pct,customers_yoy_pct
0,2016,329,329,,
1,2017,45101,45101,13608.51,13608.51
2,2018,54011,54011,19.76,19.76


In [None]:
# Delivered orders per calendar month, pivoted into one column per year.
#
# Changes compared with the previous version of this query:
#   * the helper subquery no longer sorts its rows (the outer GROUP BY discards
#     that order anyway) and uses SELECT DISTINCT for the de-duplication that
#     the old GROUP BY customer_id, order_id, delivered_date performed;
#   * the derived columns are named delivered_year / delivered_month so the
#     output alias "month" no longer shadows a source column;
#   * an unparseable month now yields a NULL label instead of the integer 0,
#     keeping the "month" column purely textual.
sql = '''
WITH delivered AS (
    SELECT DISTINCT
        customer_id,
        order_id,
        order_delivered_customer_date,
        strftime('%Y', order_delivered_customer_date) AS delivered_year,
        strftime('%m', order_delivered_customer_date) AS delivered_month
    FROM olist_orders
    WHERE order_status = 'delivered'
      AND order_delivered_customer_date IS NOT NULL
)
SELECT
    delivered_month AS month_no,
    CASE delivered_month
        WHEN '01' THEN 'Jan'
        WHEN '02' THEN 'Feb'
        WHEN '03' THEN 'Mar'
        WHEN '04' THEN 'Apr'
        WHEN '05' THEN 'May'
        WHEN '06' THEN 'Jun'
        WHEN '07' THEN 'Jul'
        WHEN '08' THEN 'Aug'
        WHEN '09' THEN 'Sep'
        WHEN '10' THEN 'Oct'
        WHEN '11' THEN 'Nov'
        WHEN '12' THEN 'Dec'
    END AS month,
    SUM(CASE WHEN delivered_year = '2016' THEN 1 ELSE 0 END) AS Year2016,
    SUM(CASE WHEN delivered_year = '2017' THEN 1 ELSE 0 END) AS Year2017,
    SUM(CASE WHEN delivered_year = '2018' THEN 1 ELSE 0 END) AS Year2018
FROM delivered
GROUP BY delivered_month
ORDER BY month_no ASC
'''

df_sql = pd.read_sql_query(sql, con=engine)
df_sql.head(12)  # one row per calendar month

   month_no month  Year2016  Year2017  Year2018
0        01   Jan         0       283      6597
1        02   Feb         0      1351      5850
2        03   Mar         0      2382      6824
3        04   Apr         0      1849      7850
4        05   May         0      3751      7111
5        06   Jun         0      3223      6829
6        07   Jul         0      3455      5839
7        08   Aug         0      4302      8314
8        09   Sep         0      3965        56
9        10   Oct       205      4494         3
10       11   Nov        58      4670         0
11       12   Dec         4      7205         0


In [None]:
# Q3 - Order count, average order value and total spend per customer.
#
# Order value = sum of (price + freight_value) over the order's items.
#
# Customers are identified by olist_customers.customer_unique_id. Grouping by
# olist_orders.customer_id produced exactly one order per group (so the average
# always equalled the total), which indicates customer_id is a per-order key.
# NOTE(review): the first result column is therefore customer_unique_id rather
# than customer_id - adjust any downstream cell that relied on the old name.
q3 = """
WITH order_values AS (
    SELECT
        oi.order_id,
        SUM(oi.price + oi.freight_value) AS order_value
    FROM olist_order_items oi
    GROUP BY oi.order_id
),
customer_orders AS (
    SELECT
        -- Fall back to the order-level id if an order has no customer row,
        -- so no order is silently dropped.
        COALESCE(c.customer_unique_id, o.customer_id) AS customer_unique_id,
        ov.order_value
    FROM olist_orders o
    JOIN order_values ov
      ON ov.order_id = o.order_id
    LEFT JOIN olist_customers c
      ON c.customer_id = o.customer_id
)
SELECT
    co.customer_unique_id,
    COUNT(*)                      AS orders_count,
    ROUND(AVG(co.order_value), 2) AS avg_order_value,
    ROUND(SUM(co.order_value), 2) AS total_spent
FROM customer_orders co
GROUP BY co.customer_unique_id
ORDER BY total_spent DESC;  -- optional: see top spenders first
"""
df_q3 = pd.read_sql_query(q3, engine)
df_q3.head(20)  # show a sample


Unnamed: 0,customer_id,orders_count,avg_order_value,total_spent
0,1617b1357756262bfa56ab541c47bc16,1,13664.08,13664.08
1,ec5b2ba62e574342386871631fafd3fc,1,7274.88,7274.88
2,c6e2731c5b391845f6800c97401a43a9,1,6929.31,6929.31
3,f48d464a0baaea338cb25f816991ab1f,1,6922.21,6922.21
4,3fd6777bbce08a352fddd04e4a7cc8f6,1,6726.66,6726.66
5,05455dfa7cd02f13d132aa7a6a9729c6,1,6081.54,6081.54
6,df55c14d1476a9a3467f131269c2477f,1,4950.34,4950.34
7,e0a2412720e9ea4f26c1ac985f6a7358,1,4809.44,4809.44
8,24bbf5fd2f2e1b359ee7de94defc4a15,1,4764.34,4764.34
9,3d979689f636322c62418b6346b1c6d2,1,4681.78,4681.78
