**AI-Assisted Hypotheses**



1.   The number of monthly active customers has shown consistent growth over time, with higher engagement during promotional or seasonal periods. We expect continued month-over-month increases driven by repeat buyers.
2.   Average revenue per active customer has increased, reflecting improved conversion rates and higher average order values.
3. Returning customers generate a disproportionately higher share of total revenue.
4. The female demographic purchasing jeans represents a key growth segment with stronger repeat engagement and higher lifetime value compared to other apparel categories.
5. The American and United Kingdom demographics will contribute the most to revenue



Importing the data, creating schema.

In [None]:
from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
import os

project_id = "tw-467-assignment-1"  # your project
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
client = bigquery.Client(project=project_id)
print(f"✅ Connected to project: {project_id}")

source_dataset = "bigquery-public-data.thelook_ecommerce"
target_dataset = f"{project_id}.thelook_ecommerce"

dataset = bigquery.Dataset(target_dataset)
dataset.location = "US"
client.create_dataset(dataset, exists_ok=True)
print(f"✅ Dataset created or already exists: {target_dataset}")

tables = list(client.list_tables(source_dataset))
print(f"📋 Found {len(tables)} tables in {source_dataset}")

for table in tables:
    src_table = f"{source_dataset}.{table.table_id}"
    dest_table = f"{target_dataset}.{table.table_id}"
    print(f"📦 Copying {src_table} → {dest_table} ...")

    query = f"""
    CREATE OR REPLACE TABLE `{dest_table}` AS
    SELECT * FROM `{src_table}`;
    """
    client.query(query).result()
    print(f"✅ Copied: {dest_table}")

print("🎉 All tables copied successfully!")

✅ Connected to project: tw-467-assignment-1
✅ Dataset created or already exists: tw-467-assignment-1.thelook_ecommerce
📋 Found 8 tables in bigquery-public-data.thelook_ecommerce
📦 Copying bigquery-public-data.thelook_ecommerce.distribution_centers → tw-467-assignment-1.thelook_ecommerce.distribution_centers ...
✅ Copied: tw-467-assignment-1.thelook_ecommerce.distribution_centers
📦 Copying bigquery-public-data.thelook_ecommerce.events → tw-467-assignment-1.thelook_ecommerce.events ...
✅ Copied: tw-467-assignment-1.thelook_ecommerce.events
📦 Copying bigquery-public-data.thelook_ecommerce.inventory_items → tw-467-assignment-1.thelook_ecommerce.inventory_items ...
✅ Copied: tw-467-assignment-1.thelook_ecommerce.inventory_items
📦 Copying bigquery-public-data.thelook_ecommerce.order_items → tw-467-assignment-1.thelook_ecommerce.order_items ...
✅ Copied: tw-467-assignment-1.thelook_ecommerce.order_items
📦 Copying bigquery-public-data.thelook_ecommerce.orders → tw-467-assignment-1.thelook_ecom

BadRequest: 400 Table bigquery-public-data.thelook_ecommerce.thelook_ecommerce-table does not have a schema.; reason: invalid, message: Table bigquery-public-data.thelook_ecommerce.thelook_ecommerce-table does not have a schema.

Location: US
Job ID: 42f8c1db-a6d0-4979-bc9e-2405029c9776


# KPI 1: Customer Retention rate

This KPI calculates wether or not the ecommerce company is retaining their customers on a monthly basis. This will help tell us how many of the customers are loyal and recurring.

In [None]:
%%bigquery
WITH customer_activity AS (
  SELECT
    user_id,
    FORMAT_DATE('%Y-%m', DATE(created_at)) AS month
  FROM `tw-467-assignment-1.thelook_ecommerce.orders`
  WHERE user_id IS NOT NULL AND created_at IS NOT NULL
  GROUP BY user_id, month
),


previous_month AS (
  SELECT
    user_id,
    FORMAT_DATE('%Y-%m', DATE_ADD(DATE(CONCAT(month, '-01')), INTERVAL 1 MONTH)) AS next_month
  FROM customer_activity
),


retention AS (
  SELECT
    c.month,
    COUNT(DISTINCT c.user_id) AS active_customers,
    COUNT(DISTINCT p.user_id) AS retained_customers
  FROM customer_activity c
  LEFT JOIN previous_month p
  ON c.user_id = p.user_id AND c.month = p.next_month
  GROUP BY c.month
)

SELECT
  month,
  ROUND(SAFE_DIVIDE(retained_customers, active_customers) * 100, 2) AS retention_rate
FROM retention
ORDER BY month;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,month,retention_rate
0,2019-01,0.00
1,2019-02,0.00
2,2019-03,0.00
3,2019-04,0.00
4,2019-05,4.05
...,...,...
77,2025-06,6.71
78,2025-07,7.45
79,2025-08,8.01
80,2025-09,9.76


# KPI 2: Revenue per Active Customer

We also will calculate the MoM Revenue per Active Customer for this KPI. This will help us see how much each active customers revenue increases month over month and see if there is a large increase in any specific month in revenue. We found this using the Lag function

In [None]:
%%bigquery
WITH monthly_metrics AS (
  SELECT
    FORMAT_DATE('%Y-%m', DATE(created_at)) AS month,
    COUNT(DISTINCT user_id) AS active_customers,
    SUM(sale_price) AS total_revenue
  FROM `tw-467-assignment-1.thelook_ecommerce.order_items`
  WHERE user_id IS NOT NULL AND created_at IS NOT NULL
  GROUP BY month
),
rpc AS (
  SELECT
    month,
    ROUND(SAFE_DIVIDE(total_revenue, active_customers), 2) AS revenue_per_customer,
    LAG(ROUND(SAFE_DIVIDE(total_revenue, active_customers), 2)) OVER (ORDER BY month) AS prev_rpc,
    ROUND(
      SAFE_DIVIDE(
        (ROUND(SAFE_DIVIDE(total_revenue, active_customers), 2) -
         LAG(ROUND(SAFE_DIVIDE(total_revenue, active_customers), 2)) OVER (ORDER BY month)),
         LAG(ROUND(SAFE_DIVIDE(total_revenue, active_customers), 2)) OVER (ORDER BY month)
      ) * 100, 2
    ) AS mom_growth_pct
  FROM monthly_metrics
)
SELECT *
FROM rpc
ORDER BY month;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,month,revenue_per_customer,prev_rpc,mom_growth_pct
0,2019-01,86.24,,
1,2019-02,86.84,86.24,0.70
2,2019-03,88.47,86.84,1.88
3,2019-04,85.18,88.47,-3.72
4,2019-05,95.86,85.18,12.54
...,...,...,...,...
77,2025-06,89.98,86.35,4.20
78,2025-07,88.11,89.98,-2.08
79,2025-08,91.39,88.11,3.72
80,2025-09,93.12,91.39,1.89


# KPI 3: Monthly Active Customers

This KPI calculates the total amount of active customers per month. This will give more insight into the busy time of year and when the company should increse or decrease stock.

In [None]:
%%bigquery --project tw-467-assignment-1
-- KPI: Monthly Active Customers (unique purchasers per month)
SELECT
  FORMAT_DATE('%Y-%m', DATE(created_at)) AS month,
  COUNT(DISTINCT user_id) AS monthly_active_customers
FROM `tw-467-assignment-1.thelook_ecommerce.orders`
WHERE user_id IS NOT NULL
  AND created_at IS NOT NULL
GROUP BY month
ORDER BY month;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,month,monthly_active_customers
0,2019-01,12
1,2019-02,19
2,2019-03,52
3,2019-04,59
4,2019-05,74
...,...,...
77,2025-06,4322
78,2025-07,4886
79,2025-08,5406
80,2025-09,6003


# Product Analysis: Jeans purchased by Female customers aged 24-35

*Prompt: Create a SQL query that finds insights on a specific product based on a specific demographic. Added in specifics after for more direct insights*

This block of code provides some insight into orders that inlcuded Jeans, were purchased by Females aged 24-35. As we can see there are 366 total orders that fit this criteria, with 357 uique customers, the total revenue of these orders is just over 36k. The average order value was about 98 dollars and the revenue per customer was about 101 dollars. This knowledge could help the company to know to push advertising jeans to women in that demographic as if they purchase them they are likely to make other purchases on top of it

In [None]:
%%bigquery --project tw-467-assignment-1
WITH base AS (
  SELECT
    o.order_id,
    o.user_id,
    DATE(o.created_at) AS order_date,
    u.country,
    u.state,
    oi.sale_price,
    p.retail_price,
    SAFE_DIVIDE(p.retail_price - oi.sale_price, p.retail_price) AS discount_pct
  FROM `tw-467-assignment-1.thelook_ecommerce.order_items` AS oi
  JOIN `tw-467-assignment-1.thelook_ecommerce.orders` AS o
    ON oi.order_id = o.order_id
  JOIN `tw-467-assignment-1.thelook_ecommerce.products` AS p
    ON oi.product_id = p.id
  JOIN `tw-467-assignment-1.thelook_ecommerce.users` AS u
    ON o.user_id = u.id
  WHERE p.category = 'Jeans'             -- ✅ valid category
    AND u.gender = 'F'
    AND u.age BETWEEN 25 AND 34
    AND o.created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 365 DAY)
)
SELECT
  COUNT(DISTINCT order_id) AS total_orders,
  COUNT(DISTINCT user_id) AS unique_customers,
  ROUND(SUM(sale_price), 2) AS total_revenue,
  ROUND(SUM(sale_price) / COUNT(DISTINCT order_id), 2) AS avg_order_value,
  ROUND(SUM(sale_price) / COUNT(DISTINCT user_id), 2) AS revenue_per_customer
FROM base;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_orders,unique_customers,total_revenue,avg_order_value,revenue_per_customer
0,472,455,48148.33,102.01,105.82


The following checks the original demographic of 24-35 compared to other demographics to see if the age 24-35 should be where we are focusing for Jeans. It appears that 35-44 and 55+ both have higher revenue as well as unique customers.

In [None]:
%%bigquery
SELECT
  CASE
    WHEN u.age BETWEEN 18 AND 24 THEN '18–24'
    WHEN u.age BETWEEN 25 AND 34 THEN '25–34'
    WHEN u.age BETWEEN 35 AND 44 THEN '35–44'
    WHEN u.age BETWEEN 45 AND 54 THEN '45–54'
    ELSE '55+'
  END AS age_band,
  COUNT(DISTINCT o.order_id) AS orders,
  COUNT(DISTINCT u.id) AS unique_customers,
  ROUND(SUM(oi.sale_price), 2) AS total_revenue,
  ROUND(SUM(oi.sale_price)/COUNT(DISTINCT o.order_id), 2) AS avg_order_value
FROM `tw-467-assignment-1.thelook_ecommerce.order_items` AS oi
JOIN `tw-467-assignment-1.thelook_ecommerce.orders` AS o
  ON oi.order_id = o.order_id
JOIN `tw-467-assignment-1.thelook_ecommerce.products` AS p
  ON oi.product_id = p.id
JOIN `tw-467-assignment-1.thelook_ecommerce.users` AS u
  ON o.user_id = u.id
WHERE p.category = 'Jeans'
  AND o.created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 365 DAY)
GROUP BY age_band
ORDER BY total_revenue DESC;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,age_band,orders,unique_customers,total_revenue,avg_order_value
0,55+,2033,1960,204291.76,100.49
1,25–34,922,895,94034.05,101.99
2,45–54,930,900,92389.63,99.34
3,35–44,880,844,89708.62,101.94
4,18–24,668,648,69036.21,103.35


# Customer Analysis: Top 10 Most Popular Customer Regions

*Prompt: Create a SQL query that finds insights on which regions contribute the most to revenue*

This code sorts the most popular regions based on number of customers. This knowledge could help the company know where to most efficiently advertise.

In [None]:
%%bigquery
WITH region_summary AS (
  SELECT
    u.country,
    u.state,
    COUNT(DISTINCT o.user_id) AS unique_customers,
    ROUND(SUM(oi.sale_price), 2) AS total_revenue,
    ROUND(AVG(oi.sale_price), 2) AS avg_sale_price
  FROM `tw-467-assignment-1.thelook_ecommerce.orders` AS o
  JOIN `tw-467-assignment-1.thelook_ecommerce.users` AS u
    ON o.user_id = u.id
  JOIN `tw-467-assignment-1.thelook_ecommerce.order_items` AS oi
    ON o.order_id = oi.order_id
  WHERE u.country IS NOT NULL
  GROUP BY u.country, u.state
)
SELECT
  country,
  state,
  unique_customers,
  total_revenue,
  avg_sale_price
FROM region_summary
ORDER BY unique_customers DESC
LIMIT 10;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,country,state,unique_customers,total_revenue,avg_sale_price
0,China,Guangdong,4312,583984.35,58.98
1,United Kingdom,England,3259,438640.77,59.7
2,United States,California,2930,388670.13,58.59
3,United States,Texas,2025,273729.48,58.3
4,China,Shanghai,1872,253414.99,58.78
5,Brasil,São Paulo,1720,232113.65,59.93
6,China,Beijing,1676,225781.0,59.28
7,China,Zhejiang,1663,209332.77,58.29
8,China,Hebei,1575,218237.68,59.45
9,China,Jiangsu,1509,195689.37,58.82


the following validation instead sorts the regions by average sale price. This could be another strong way to know where to advertise as these are where the highest value customers are.

*Prompt: In responding to the previous query, create a new one that finds another way to analyze a different metric for sales based on region*

In [None]:
%%bigquery
WITH region_summary AS (
  SELECT
    u.country,
    u.state,
    COUNT(DISTINCT o.user_id) AS unique_customers,
    ROUND(SUM(oi.sale_price), 2) AS total_revenue,
    ROUND(AVG(oi.sale_price), 2) AS avg_sale_price
  FROM `tw-467-assignment-1.thelook_ecommerce.orders` AS o
  JOIN `tw-467-assignment-1.thelook_ecommerce.users` AS u
    ON o.user_id = u.id
  JOIN `tw-467-assignment-1.thelook_ecommerce.order_items` AS oi
    ON o.order_id = oi.order_id
  WHERE u.country IS NOT NULL
  GROUP BY u.country, u.state
)
SELECT
  country,
  state,
  unique_customers,
  total_revenue,
  avg_sale_price
FROM region_summary
ORDER BY avg_sale_price DESC
LIMIT 10;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,country,state,unique_customers,total_revenue,avg_sale_price
0,Japan,Mie,1,358.0,358.0
1,Japan,Kochi,1,336.95,112.32
2,United States,Vermont,1,109.5,109.5
3,Austria,Vorarlberg,5,755.07,94.38
4,Japan,Kagawa,3,369.2,92.3
5,Japan,Ibaraki,14,2940.55,91.89
6,Japan,Akita,1,521.77,86.96
7,Japan,Iwate,7,2073.16,82.93
8,Australia,Australian Capital Territory,33,7436.78,79.97
9,Japan,Gifu,5,714.1,79.34


In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

query = """
SELECT
  u.country,
  ROUND(SUM(oi.sale_price), 2) AS total_revenue,
  COUNT(DISTINCT o.user_id) AS unique_customers,
  COUNT(DISTINCT o.order_id) AS orders
FROM `bigquery-public-data.thelook_ecommerce.order_items` AS oi
JOIN `bigquery-public-data.thelook_ecommerce.orders` AS o
  ON oi.order_id = o.order_id
JOIN `bigquery-public-data.thelook_ecommerce.products` AS p
  ON oi.product_id = p.id
JOIN `bigquery-public-data.thelook_ecommerce.users` AS u
  ON o.user_id = u.id
WHERE p.category = 'Jeans'
  AND o.created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 365 DAY)
  AND u.country IS NOT NULL
GROUP BY u.country
ORDER BY total_revenue DESC
LIMIT 10
"""

df_regions = client.query(query).to_dataframe()
df_regions

Unnamed: 0,country,total_revenue,unique_customers,orders
0,China,183473.03,1737,1807
1,United States,132144.34,1236,1278
2,Brasil,81391.29,783,801
3,South Korea,30163.25,274,282
4,United Kingdom,24570.06,231,236
5,France,22536.41,231,240
6,Germany,22451.51,211,226
7,Spain,22352.94,221,229
8,Japan,11577.9,123,127
9,Australia,10630.51,117,118


In [None]:
fig_bar = px.bar(
    df_regions,
    x="country",
    y="total_revenue",
    color="total_revenue",
    hover_data=["unique_customers", "orders"],
    text_auto=".2s",
    title="Top 10 Countries by Jeans Revenue (Last 12 Months)"
)

fig_bar.update_layout(
    xaxis_title="Country",
    yaxis_title="Total Revenue ($)",
    template="plotly_dark",
    title_font=dict(size=22)
)
fig_bar.show()