-
Notifications
You must be signed in to change notification settings - Fork 0
/
biqueries.sql
299 lines (249 loc) · 8.67 KB
/
biqueries.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
-- Test Database: PostGres
SELECT last_name, count(1) AS pops FROM actor
GROUP BY last_name
ORDER BY pops desc;
SELECT * FROM actor LIMIT 2;
-- -- Use EXTRACT function
SELECT p.payment_id, p.customer_id, p.amount, p.payment_date
FROM payment p
WHERE EXTRACT(month FROM p.payment_date) = 2
AND p.amount > 2
ORDER BY p.payment_id DESC, p.amount ASC;
-- -- Trim date column using ::
SELECT p.payment_date::date, COUNT(*)
FROM payment p
GROUP BY 1 --column 1 of results
ORDER BY 2 DESC;
--SELECT p.customer_id, SUM(p.amount)
SELECT p.customer_id, ARRAY_AGG(p.payment_date) --get result in array, ready for data scientists
FROM payment p
GROUP BY 1
ORDER BY 2 DESC
-- Generate Sample data
-- LEFT JOIN example
SELECT gs::date, gs2::date
FROM generate_series('2018-04-01', current_date::date, INTERVAL '1 Day' ) gs
LEFT JOIN generate_series('2018-04-10', current_date::date, INTERVAL '1 Day' ) gs2
ON gs::date = gs2::date
SELECT gs::date, COUNT(*)
FROM generate_series('2007-02-01', '2007-02-28', INTERVAL '1 Day' ) gs
LEFT JOIN payment p
ON p.payment_date::date = gs::date
GROUP BY 1
HAVING COUNT(*) = 1;
-- Has all inventory even been rented
SELECT f.film_id, f.title,
i.store_id,i.inventory_id,
COUNT(distinct r.rental_id) as rentals
FROM film f
LEFT JOIN inventory i ON i.film_id = f.film_id
LEFT JOIN rental r ON r.inventory_id = i.inventory_id
GROUP BY 1, 2, 3, 4
ORDER BY 3 NULLS FIRST;
-- Finding a customer's first rental and various attributes about it
-- SELF JOIN
SELECT r.customer_id, MIN(r.rental_id) AS first_order_id, (
SELECT r2.rental_date
FROM rental r2
WHERE r2.rental_id = MIN(r.rental_id)
)::date first_order_date
FROM rental r
GROUP BY 1
ORDER BY 1;
-- How many Customers purchased from multiple stores
SELECT t.customer_id, COUNT(*) FROM
(SELECT DISTINCT r.customer_id, s.store_id
FROM rental r
LEFT JOIN staff s ON s.staff_id = r.staff_id
ORDER BY 1) t
GROUP BY 1;
--Common Table Express (CTE) Examples
WITH base_table AS (
SELECT DISTINCT r.customer_id, s.store_id
FROM rental r
LEFT JOIN staff s ON s.staff_id = r.staff_id
ORDER BY 1
)
SELECT bt.customer_id, COUNT(*) FROM base_table bt
GROUP BY 1
ORDER BY 1;
-- -- JOIN gotchas, sometimes, if using LEFT JOINS and NULL matters, put the filer on the JOIN itself
SELECT zebra::date, 'zebra', p.*
FROM generate_series('2007-02-01', '2007-02-28', INTERVAL '1 Day' ) zebra
LEFT JOIN payment p ON p.payment_date::date = zebra::date and p.staff_id = 2
ORDER BY 3 NULLS FIRST;
WITH base_table1 AS (
SELECT zebra::date, 'zebra', p.*
FROM generate_series('2007-02-01', '2007-02-28', INTERVAL '1 Day' ) zebra
LEFT JOIN payment p ON p.payment_date::date = zebra::date and p.staff_id = 2
ORDER BY 3 NULLS FIRST
)
-- Chaining multiple conditions where OR is involved
-- from rental_id > 1400 and payment hour is between 8am and noon OR 2pm to 3pm
SELECT *
FROM base_table1 bt
WHERE bt.rental_id > 1400
AND EXTRACT(HOUR FROM bt.payment_date) IN (8,9,10,11,12,14)
ORDER BY 6; --rental_id is in column 6 bro!
-- WHERE vs HAVING
-- Return Customers whose first order was on a weekend and worth over 5 and who's spent at least 100 total
-- Note: Sunday =0, Saturday = 6
-- CLV: Customer Lifetime Value
SELECT p.*, EXTRACT(dow FROM p.payment_date) AS day_week,
(
SELECT SUM(p3.amount)
FROM payment p3
WHERE p3.customer_id = p.customer_id
) as CLV
FROM payment p
WHERE p.payment_id = (
SELECT MIN(p2.payment_id)
FROM payment p2
WHERE p2.customer_id = p.customer_id
)
AND EXTRACT(dow FROM p.payment_date) IN (0,6)
AND p.amount > 5
GROUP BY 1
HAVING
(
SELECT SUM(p3.amount)
FROM payment p3
WHERE p3.customer_id = p.customer_id
) > 100;
-- BigQuery Analytics
-- Qwiklabs Sessions
-- Find Duplicate values
-- NUGGETS:
-- In your own datasets, even if you have a unique key, it is still beneficial to
-- confirm the uniqueness of the rows with COUNT, GROUP BY, and HAVING before you
-- begin your analysis.
#standardSQL
SELECT COUNT(*) as num_duplicate_rows, * FROM
`data-to-insights.ecommerce.all_sessions_raw`
GROUP BY fullVisitorId,
channelGrouping,
time,
country,
city,
totalTransactionRevenue,
transactions,
timeOnSite,
pageviews,
sessionQualityDim,
date,
visitId,
type,
productRefundAmount,
productQuantity,
productPrice, productRevenue, productSKU, v2ProductName, v2ProductCategory,
productVariant, currencyCode, itemQuantity, itemRevenue, transactionRevenue,
transactionId, pageTitle, searchKeyword, pagePathLevel1, eCommerceAction_type,
eCommerceAction_step, eCommerceAction_option
HAVING num_duplicate_rows > 1;
-- Confirm that no duplicates exist
#standardSQL
# schema: https://support.google.com/analytics/answer/3437719?hl=en
SELECT
fullVisitorId, # the unique visitor ID
visitId, # a visitor can have multiple visits
date, # session date stored as string YYYYMMDD
time, # time of the individual site hit (can be 0 to many per visitor session)
v2ProductName, # not unique since a product can have variants like Color
productSKU, # unique for each product
type, # a visitor can visit Pages and/or can trigger Events (even at the same time)
eCommerceAction_type, # maps to ‘add to cart', ‘completed checkout'
eCommerceAction_step,
eCommerceAction_option,
transactionRevenue, # revenue of the order
transactionId, # unique identifier for revenue bearing transaction
COUNT(*) as row_count
FROM
`data-to-insights.ecommerce.all_sessions`
GROUP BY 1,2,3 ,4, 5, 6, 7, 8, 9, 10,11,12
HAVING row_count > 1 # find duplicates
-- Note: In SQL, you can GROUP BY or ORDER BY the index of the column like using "GROUP BY 1" instead of "GROUP BY fullVisitorId"
-- determines the total views by counting product_views and the number of unique visitors by counting fullVisitorID
#standardSQL
SELECT
COUNT(*) AS product_views,
COUNT(DISTINCT fullVisitorId) AS unique_visitors
FROM `data-to-insights.ecommerce.all_sessions`;
-- Now write a query that shows total unique visitors(fullVisitorID) by the referring site (channelGrouping):
#standardSQL
SELECT
COUNT(DISTINCT fullVisitorId) AS unique_visitors,
channelGrouping
FROM `data-to-insights.ecommerce.all_sessions`
GROUP BY channelGrouping
ORDER BY channelGrouping DESC;
-- list all the unique product names (v2ProductName) alphabetically:
#standardSQL
SELECT
(v2ProductName) AS ProductName
FROM `data-to-insights.ecommerce.all_sessions`
GROUP BY ProductName
ORDER BY ProductName
-- list the five products with the most views (product_views) from all visitors (include people who have viewed the same product more than once)
#standardSQL
SELECT
COUNT(*) AS product_views,
(v2ProductName) AS ProductName
FROM `data-to-insights.ecommerce.all_sessions`
WHERE type = 'PAGE'
GROUP BY v2ProductName
ORDER BY product_views DESC
LIMIT 5;
-- Now refine the query to no longer double-count product views for visitors who have viewed a product many times. Each distinct product view should only count once per visitor.
WITH unique_product_views_by_person AS (
-- find each unique product viewed by each visitor
SELECT
fullVisitorId,
(v2ProductName) AS ProductName
FROM `data-to-insights.ecommerce.all_sessions`
WHERE type = 'PAGE'
GROUP BY fullVisitorId, v2ProductName )
-- aggregate the top viewed products and sort them
SELECT
COUNT(*) AS unique_view_count,
ProductName
FROM unique_product_views_by_person
GROUP BY ProductName
ORDER BY unique_view_count DESC
LIMIT 5
-- expand your previous query to include the total number of distinct products ordered and the total number of total units ordered (productQuantity)
#standardSQL
SELECT
COUNT(*) AS product_views,
COUNT(productQuantity) AS orders,
SUM(productQuantity) AS quantity_product_ordered,
v2ProductName
FROM `data-to-insights.ecommerce.all_sessions`
WHERE type = 'PAGE'
GROUP BY v2ProductName
ORDER BY product_views DESC
LIMIT 5;
#standardSQL
SELECT
COUNT(*) AS product_views,
COUNT(productQuantity) AS orders,
SUM(productQuantity) AS quantity_product_ordered,
SUM(productQuantity) / COUNT(productQuantity) AS avg_per_order,
(v2ProductName) AS ProductName
FROM `data-to-insights.ecommerce.all_sessions`
WHERE type = 'PAGE'
GROUP BY v2ProductName
ORDER BY product_views DESC
LIMIT 5;
--Expand the query to include the average amount of product per order (total number of units ordered/total number of orders, or SUM(productQuantity)/COUNT(productQuantity)).
#standardSQL
SELECT
COUNT(*) AS product_views,
COUNT(productQuantity) AS orders,
SUM(productQuantity) AS quantity_product_ordered,
SUM(productQuantity) / COUNT(productQuantity) AS avg_per_order,
(v2ProductName) AS ProductName
FROM `data-to-insights.ecommerce.all_sessions`
WHERE type = 'PAGE'
GROUP BY v2ProductName
ORDER BY product_views DESC
LIMIT 5;