In [4]:
# BigQuery Setup
# Importing Libraries and Credentials
import pandas as pd
import numpy as np
import seaborn as sns
from google.cloud import bigquery
from google.oauth2 import service_account
# ignore warnings
from warnings import filterwarnings
filterwarnings("ignore")


%load_ext google.cloud.bigquery

credentials = service_account.Credentials.from_service_account_file('/Users/ssamilozkan/Desktop/BigQuery/config.json')

project_id = 'dbt-bigquery-setup-369911'
client = bigquery.Client(credentials= credentials, project=project_id)


In [6]:
%%bigquery
-- Peek at ten totrevenue values from the irs_990_2015 table.
SELECT
    totrevenue
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,totrevenue
0,9475129863
1,9021585970
2,9890722789
3,1094833976
4,2186337569
5,2086259022
6,1745011054
7,1711501686
8,507813618
9,1229151613


`FORMAT("%'d", totrevenue)`

Function = Performs an Action

Parameters = Inputs you provide

`SELECT FORMAT ("%'d", 1000)`
-> returns '1,000'

**Beware of stylistic formatting in SQL:**

    - Your output is now treated like a string. This makes math operations on this calculated field more difficult.
    
    - It's best to save stylistic elements for your visualization tool.

In [19]:
%%bigquery
-- Ten largest revenues, rendered with thousands separators for readability.
-- Caveat: the formatted value is a string, so the sort uses the raw
-- totrevenue column rather than the formatted alias.
SELECT FORMAT("%'d", totrevenue) AS revenue
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
ORDER BY
    totrevenue DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue
0,45409123226
1,20796549014
2,11091388129
3,10098163008
4,9890722789
5,9475129863
6,9021585970
7,8655129029
8,7523260077
9,6740015230


**Aliases do not exist yet when filtering in WHERE**

In [41]:
%%bigquery
SELECT 
    (totrevenue - totfuncexpns) AS income
FROM 
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE income > 0 # Intentional error: the SELECT alias `income` does not exist yet when WHERE runs, so BigQuery reports "Unrecognized name: income"
ORDER BY income DESC
LIMIT 10
# This cell is kept failing on purpose as a demonstration.
# A working filter would repeat the expression: WHERE (totrevenue - totfuncexpns) > 0

Executing query with job ID: 31d0f574-fde3-420e-9c03-086a9336b882
Query executing: 0.68s


ERROR:
 400 Unrecognized name: income at [5:7]

Location: US
Job ID: 31d0f574-fde3-420e-9c03-086a9336b882



**Add new fields in SELECT clause to return more data**

In [44]:
%%bigquery
# The %%bigquery cell magic has to be the very first line of the cell, so the
# explanatory notes live below it as SQL comments.
# EIN (employer identification number) is a unique identifier for that charity
# is_school is a flag field indicating whether that charity is a school
SELECT 
    totrevenue AS revenue,
    ein,
    operateschools170cd AS is_school
FROM 
    `bigquery-public-data.irs_990.irs_990_2015`
ORDER BY revenue DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue,ein,is_school
0,45409123226,941340523,N
1,20796549014,941105628,N
2,11091388129,900656139,N
3,10098163008,208295721,N
4,9890722789,900424876,N
5,9475129863,390123480,N
6,9021585970,390123480,N
7,8655129029,941196203,N
8,7523260077,912153073,N
9,6740015230,42103580,Y


### Filters, aggregates, and duplicates


In [49]:
%%bigquery
-- Same listing as before, restricted to rows flagged as schools.
-- The filter names the source column operateschools170cd because the
-- alias is_school is not available inside WHERE.
SELECT
    totrevenue AS revenue,
    ein,
    operateschools170cd AS is_school
FROM `bigquery-public-data.irs_990.irs_990_2015`
WHERE operateschools170cd = 'Y'
ORDER BY
    revenue DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue,ein,is_school
0,6740015230,42103580,Y
1,6000839000,231352685,Y
2,5717023246,941156365,Y
3,5569004000,520595110,Y
4,5133788413,135562308,Y
5,4623485966,951642394,Y
6,4560196033,416011702,Y
7,4477633568,60646973,Y
8,4471027733,135598093,Y
9,4368738915,150532082,Y


**Perform calculations over values with aggregation**

In [50]:
%%bigquery
-- One summary row over the whole table: revenue sum and mean, the count of
-- ein values versus distinct ein values, and the maximum of noemplyeesw3cnt.
SELECT
    SUM(totrevenue)      AS total_2015_revenue,
    AVG(totrevenue)      AS avg_revenue,
    COUNT(ein)           AS nonprofits,
    COUNT(DISTINCT ein)  AS nonprofits_distinct,
    MAX(noemplyeesw3cnt) AS num_employees
FROM `bigquery-public-data.irs_990.irs_990_2015`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_2015_revenue,avg_revenue,nonprofits,nonprofits_distinct,num_employees
0,2344355088288,7952843.0,294782,275077,787050


**Embed functions inside of other functions**

In [52]:
%%bigquery
-- Same summary row, with a function nested inside another function:
-- the average is wrapped in ROUND(..., 2) to keep two decimal places.
SELECT
    SUM(totrevenue)            AS total_2015_revenue,
    ROUND(AVG(totrevenue), 2)  AS avg_revenue,
    COUNT(ein)                 AS nonprofits,
    COUNT(DISTINCT ein)        AS nonprofits_distinct,
    MAX(noemplyeesw3cnt)       AS num_employees
FROM `bigquery-public-data.irs_990.irs_990_2015`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_2015_revenue,avg_revenue,nonprofits,nonprofits_distinct,num_employees
0,2344355088288,7952843.42,294782,275077,787050


In [56]:
%%bigquery
-- Count the rows recorded for each EIN, most frequent first.
-- Many EINs turn out to have more than one record in the 2015 table, which
-- is unusual; the next query counts how often that happens.
SELECT
    ein,                      -- grouping key (not aggregated)
    COUNT(ein) AS ein_count   -- rows per EIN (aggregated)
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
GROUP BY
    ein
ORDER BY
    ein_count DESC

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,ein_count
0,431859076,7
1,362235151,7
2,208367574,7
3,830345294,7
4,841604402,7
...,...,...
275072,222629185,1
275073,880275767,1
275074,150539118,1
275075,770057903,1


**Filter aggregation with HAVING clause**

- HAVING is very very useful when we're filtering aggregations.

In [58]:
%%bigquery
-- Keep only the EINs that occur more than once. HAVING filters on the
-- aggregated ein_count, which WHERE cannot do.
SELECT
    ein,                      -- grouping key (not aggregated)
    COUNT(ein) AS ein_count   -- rows per EIN (aggregated)
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
GROUP BY
    ein
HAVING
    ein_count > 1
ORDER BY
    ein_count DESC


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,ein_count
0,431859076,7
1,362235151,7
2,208367574,7
3,830345294,7
4,841604402,7
...,...,...
17989,860593601,2
17990,203502737,2
17991,942324340,2
17992,942662962,2


In [62]:
# Percentage of distinct EINs that have more than one row in the 2015 table.
eins_with_repeats = 17994  # presumably the row count of the HAVING ein_count > 1 query — confirm
distinct_eins = 275077     # nonprofits_distinct from the aggregate query
print(eins_with_repeats / distinct_eins * 100)

6.541441123758076


**Explore further by filtering on one nonprofit**

- We have seven paper filings for one EIN in the 2015 table. A filing made in 2015 normally covers the 2014 tax period, since you file your taxes the year after the actual tax period.
- So in the 2015 calendar year, this EIN filed for tax periods 2008 through 2013 in addition to 2014.
- This could be caused by human error, dirty data, or an organization submitting filings for more than one tax period at once.

In [64]:
%%bigquery
-- Pull every column for one EIN to inspect its repeated records.
SELECT
    *
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE
    ein = '262152334'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,elf,tax_pd,subseccd,s501c3or4947a1cd,schdbind,politicalactvtscd,lbbyingactvtscd,subjto6033cd,dnradvisedfundscd,...,exceeds1pct509,subtotpub509,pubsupplesub509,samepubsuppsubtot509,grsinc509,unreltxincls511tx509,subtotsuppinc509,netincunrelatd509,othrinc509,totsupp509
0,262152334,P,201012,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
1,262152334,P,200812,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
2,262152334,P,201412,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
3,262152334,P,201312,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
4,262152334,P,201212,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
5,262152334,P,200912,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
6,262152334,P,201112,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0


- So if we just wanted 2014, we can invoke a date filtering function

In [68]:
%%bigquery
-- tax_pd stores the period as YYYYMM. Cast it to a string, parse it into a
-- DATE, and keep only the rows whose parsed year is 2014.
SELECT
    ein,
    tax_pd,
    PARSE_DATE('%Y%m', CAST(tax_pd AS STRING)) AS tax_period
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE EXTRACT(YEAR FROM PARSE_DATE('%Y%m', CAST(tax_pd AS STRING))) = 2014
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,tax_pd,tax_period
0,390123480,201412,2014-12-01
1,900424876,201412,2014-12-01
2,520891669,201405,2014-05-01
3,382227794,201406,2014-06-01
4,361493430,201412,2014-12-01
5,910219435,201412,2014-12-01
6,470339250,201412,2014-12-01
7,940362025,201412,2014-12-01
8,366066772,201412,2014-12-01
9,946069237,201412,2014-12-01


**Handle NULL values with extreme care**

In [70]:
%%bigquery
-- Address fields for rows that have no state value.
-- NULL must be tested with IS NULL; "= NULL" never matches.
SELECT ein, street, city, state, zip
FROM
    `bigquery-public-data.irs_990.irs_990_ein`
WHERE state IS NULL
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,street,city,state,zip
0,352651768,NAIROBI,KENYA,,00000-0000
1,980031007,STOCKHOLM 11526,SWEDEN,,00000-0000
2,237069967,TORONTO ON M3J 1P3,CANADA,,00000-0000
3,364867900,BEIJING,CHINA,,00000-0000
4,986064893,BIKENIBEU TARAWA,KIRIBATI,,00000-0000
5,60706038,ST GEORGES GE 01,BERMUDA,,00000-0000
6,61497455,SAVIESE VALAIS CH 1965,SWITZERLAND,,00000-0000
7,980537324,WOLFVILLE NOVA SCOTIA B4P 2R6,CANADA,,00000-0000
8,981426715,OXFORD OX1-1HU,UNITED KINGDOM,,00000-0000
9,237099181,OPERA MI,ITALY,,00000-0000


**Parsing String Values with String Functions**

- `CONCAT("12345","67890")`  -> 1234567890
- `ENDS_WITH("Apple","e")` -> true
- `LOWER("Apple")` -> apple
-  `REGEXP_CONTAINS("Lunchbox",r"^.*box$")` -> true




**Wildcard filters with LIKE**

In [71]:
%%bigquery
-- Names containing "help" anywhere; LOWER() makes the match case-insensitive.
SELECT ein, name
FROM
    `bigquery-public-data.irs_990.irs_990_ein`
WHERE LOWER(name) LIKE '%help%'
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,name
0,203297489,MORGAN AUTO GROUP HELPING HAND FUND INC
1,205763648,JOSH FARLER HELPING HANDS FOUNDATION
2,264779838,HEAVENS HELPING HANDS MINISTRY INC
3,134045651,FRIENDS HELP FRIENDS INC
4,812832750,FAMILY HELP INSTITUTE INC
5,823676844,LOUS HELPING HAND FOUNDATION
6,462519710,BRAY HELPING BRAY INC
7,465025689,SN PHELPS RESEARCH INSTITUTE INC
8,453909386,DARKE COUNTY PREGNANCY HELP CENTER INC
9,830253203,TURNING POINT LINCOLN COUNTYS SELF- HELP CENTER


In [72]:
%%bigquery
-- Names that start with "help": the wildcard is only on the right-hand side.
SELECT ein, name
FROM
    `bigquery-public-data.irs_990.irs_990_ein`
WHERE LOWER(name) LIKE 'help%'
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,name
0,871738298,HELPING HANDS FAMILY OUTREACH INC
1,952758212,HELP FOR BRAIN INJURED CHILDREN INC
2,270477340,HELPNEST INC
3,461652118,HELP A DIABETIC CHILD INC
4,742367192,HELPING HANDS OF JACKSON COUNTY INC
5,461634728,HELPING HIS HANDS DISASTER RESPONSE INC
6,464291756,HELPING KIDS ROUND FIRST
7,472964918,HELP-SIDE FOUNDATION
8,870683277,HELP INTERNATIONAL
9,137409533,HELPING HAND FOUNDATION TR


**Introducing JOINs and UNIONs**

In [76]:
%%bigquery
-- First look at the weather-stations table: five rows, all columns.
SELECT *
FROM
    `bigquery-public-data.noaa_gsod.stations`
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,usaf,wban,name,country,state,call,lat,lon,elev,begin,end
0,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
1,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
2,7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
3,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323
4,8307,99999,WXPOD 8318,AF,,,0.0,0.0,8318.0,20100421,20100421


In [None]:
# The usaf column alone is not unique in the stations table (see the counts below), so you have to create your own primary key

In [74]:
%%bigquery
-- Compare the total number of usaf values with the number of distinct ones
-- to see whether usaf could serve as a unique key.
SELECT
    COUNT(usaf)          AS total_count,
    COUNT(DISTINCT usaf) AS distinct_count
FROM
    `bigquery-public-data.noaa_gsod.stations`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_count,distinct_count
0,29590,26189


**Union for vertically merging your data**

- `UNION DISTINCT` -> removes duplicates
- `UNION ALL` -> keeps every record, including duplicates

In [81]:
%%bigquery
-- Stack the 1929 and 1930 tables vertically; UNION DISTINCT drops rows that
-- are exact duplicates across the combined result.
SELECT stn, wban, temp, year
FROM `bigquery-public-data.noaa_gsod.gsod1929`

UNION DISTINCT

SELECT stn, wban, temp, year
FROM `bigquery-public-data.noaa_gsod.gsod1930`


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stn,wban,temp,year
0,037770,99999,50.7,1930
1,038560,99999,53.2,1930
2,038560,99999,58.5,1930
3,039730,99999,40.2,1930
4,039730,99999,57.0,1930
...,...,...,...,...
4526,990061,99999,65.7,1929
4527,990061,99999,69.1,1929
4528,990061,99999,65.1,1929
4529,990061,99999,62.3,1929


**Use table wildcards for easy merges** 

- You have quite a few tables that you need to bring together. This is the syntax for the union, select all the fields we want from one of these particular tables. Union distinct, just write it out and then you just do another table and union distinct, do another table and union distinct, do another table. And if you have more than 10 tables, your fingers are going to get tired of typing all these tables over time. So I don't really want to type a 100 unions and it's going to make my code just extremely long vertically to read. 

```
SELECT 
    stn,wban,temp, year
FROM `bigquery-public-data.noaa_gsod.gsod1929`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1930`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1931`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1932`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1933`
...
..
.
```

- wildcard is your solution ```SELECT * FROM `bigquery-public-data.noaa_gsod.gsod*` ```


In [84]:
%%bigquery
-- The trailing * makes the FROM clause match every table whose name starts
-- with "gsod", replacing the long chain of UNIONs.
SELECT
    stn,
    wban,
    temp,
    year
FROM
    `bigquery-public-data.noaa_gsod.gsod*`
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stn,wban,temp,year
0,150520,99999,18.0,1967
1,150520,99999,52.0,1967
2,150520,99999,58.0,1967
3,150520,99999,8.5,1967
4,151700,99999,16.7,1967
5,151700,99999,59.0,1967
6,151700,99999,66.7,1967
7,170980,99999,33.5,1967
8,170980,99999,29.8,1967
9,170980,99999,52.2,1967


**Filtering with table wildcard `*` and `_TABLE_SUFFIX`**

- What happens if you wanted to filter out for just a subset of the tables?
- For instance, you just wanted to find and match together all the temperatures from 1950 or after 1950
- Here's another reserved keyword that's specific to BigQuery.


In [87]:
%%bigquery
-- Restrict the wildcard to gsod tables whose suffix sorts after '1950'.
-- The comparison is on strings and is strict, so the 1950 table itself
-- is not included.
SELECT
    stn,
    wban,
    temp,
    year
FROM
    `bigquery-public-data.noaa_gsod.gsod*`
WHERE
    _TABLE_SUFFIX > '1950'
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stn,wban,temp,year
0,75550,99999,29.6,2008
1,164650,99999,52.7,2008
2,161780,99999,50.7,2008
3,897620,99999,-24.0,2008
4,895650,99999,33.3,2008
5,897620,99999,-19.8,2008
6,897620,99999,-14.3,2008
7,75550,99999,53.4,2008
8,75550,99999,70.9,2008
9,66090,99999,20.5,2008


- Use a table wildcard (`*`) instead of writing many UNIONs
- Use `_TABLE_SUFFIX` to filter which tables the wildcard includes
- Use `_TABLE_SUFFIX` in your SELECT statement with CONCAT()

**Avoid UNION pitfalls like brittle schemas**
- Duplicate records among tables (Use UNION DISTINCT instead of UNION ALL)
- Changing schemas and field names over time
- Mismatched count of columns in your UNION

**Linking data across multiple tables** 
- Join and merging datasets
- `INNER JOIN` -> returns only the rows that have matching values in both tables
- `LEFT JOIN` -> returns all rows from the left table, and the matched rows from the right table
- `RIGHT JOIN` -> returns all rows from the right table, and the matched rows from the left table
- `OUTER JOIN` -> returns all rows from all tables and unmatched rows are displayed as NULL

In [5]:
%%bigquery
-- Attach station details to temperature readings. The match uses both the
-- station number and the wban id; only US stations with a state are kept,
-- from gsod tables whose suffix sorts after '2015'.
SELECT
    a.stn, a.wban, a.temp, a.year,
    b.name, b.state, b.country
FROM `bigquery-public-data.noaa_gsod.gsod*` AS a
INNER JOIN `bigquery-public-data.noaa_gsod.stations` AS b
    ON  a.stn  = b.usaf
    AND a.wban = b.wban
WHERE b.state IS NOT NULL
  AND b.country = 'US'
  AND _TABLE_SUFFIX > '2015'
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stn,wban,temp,year,name,state,country
0,700635,26465,37.9,2022,GALBRAITH LAKE AIRPORT,AK,US
1,700860,27401,39.3,2022,BARTER ISLAND AIRPORT,AK,US
2,701196,102,54.1,2022,BOB BARKER MEMORIAL AIRPORT,AK,US
3,701333,26643,-18.2,2022,DEERING AIRPORT,AK,US
4,701748,99999,50.0,2022,PROSPECT CREEK AIRPORT,AK,US
5,701748,99999,46.4,2022,PROSPECT CREEK AIRPORT,AK,US
6,702040,26703,31.7,2022,GAMBELL AIRPORT,AK,US
7,702040,26703,28.8,2022,GAMBELL AIRPORT,AK,US
8,702040,26703,32.5,2022,GAMBELL AIRPORT,AK,US
9,702040,26703,28.9,2022,GAMBELL AIRPORT,AK,US


**Avoiding Pitfalls when Merging Datasets**

- Doing many-to-many JOIN could result in more rows than either of your initial tables
- This is a primary reason for exceeding your resource cap in BigQuery
- Know your dataset and relationships between your tables before joining
- Understand your data relationships before joining: 1:1, N:1, 1:N, N:N
- Use CONCAT() to create a composite key field if no unique field exists, or join on more than one field
- Ensure your key fields are distinct (deduplicate)

In [8]:
%%bigquery
-- Three sample rows from the 2015 filings table, identifying columns only.
SELECT
    ein,
    elf,
    tax_pd
FROM `bigquery-public-data.irs_990.irs_990_2015`
LIMIT 3

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,elf,tax_pd
0,390123480,E,201412
1,390123480,E,201312
2,900424876,E,201412


In [10]:
%%bigquery
-- Three sample rows from the EIN lookup table.
SELECT
    ein,
    name,
    ico
FROM `bigquery-public-data.irs_990.irs_990_ein`
LIMIT 3

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,name,ico
0,660454913,NANA BABY CHILDRENS HOME,
1,660588071,GOLDEN HOOK FISHING CLUB INC,% GREG J FERGUSON
2,900670575,OKINAWA YOUTH FOOTBALL,% MCCS EXECUTIVE BRANCH


### Task 4. Identify a key field in your ecommerce dataset

Examine the products and fields further. You want to become familiar with the products on the website and the fields you could use to potentially join on to other datasets.

#### Examine the records
In this section, you find how many product names and product SKUs are on your website and whether either one of those fields is unique.

Find how many product names and product SKUs are on the website. 
   
- Look at the pagination results in the web UI for the total number of records returned, which in this case is 2,273 products and SKUs.
- But...do the results mean that there are 2,273 unique product SKUs?

In [12]:
%%bigquery
-- Every distinct (SKU, product name) combination seen on the website.
SELECT DISTINCT
    productSKU,
    v2ProductName
FROM
    `data-to-insights.ecommerce.all_sessions_raw`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,productSKU,v2ProductName
0,9180781,Suitcase Organizer Cubes
1,9180793,26 oz Double Wall Insulated Bottle
2,9180838,Metal Texture Roller Pen
3,9180844,Gunmetal Roller Ball Pen
4,9180905,Google Men's Long Sleeve Raglan Ocean Blue
...,...,...
2268,GGOEGAAL059016,Google Men's Performance Badge Tee Navy
2269,GGOEYAEB030015,YouTube Women's Racer Back Tank Black
2270,10 14152,Rowan Pullover Hood
2271,GGOEGAEJ028516,Google Women's Muscle Tee


In [13]:
%%bigquery
-- Distinct SKUs only, to compare against the number of (SKU, name) pairs.
SELECT DISTINCT
    productSKU
FROM
    `data-to-insights.ecommerce.all_sessions_raw`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,productSKU
0,9180750
1,9180793
2,9180833
3,9180838
4,9180844
...,...
1904,GGOEAAQB036117
1905,GGOEGAAC075815
1906,GGOEGAUC057712
1907,10 14157


Hmmm...you have 1,909 distinct SKUs which is less than the 2,273 number for total number of products on the website. The first results probably contain products with duplicate SKUs.

- Take an even closer look at the records. Determine which products have more than one SKU and which SKUs have more than one product.

- Let's determine if some product names have more than one SKU. Notice you use the STRING_AGG() function to aggregate all the product SKUs that are associated with one product name.

In [14]:
%%bigquery
-- For each product name, count its distinct SKUs and list up to five of
-- them, keeping only the names that map to more than one SKU.
-- GROUP BY already yields exactly one row per v2ProductName, so the
-- redundant SELECT DISTINCT has been dropped; the result set is unchanged.
SELECT
    COUNT(DISTINCT productSKU) AS SKU_count,
    STRING_AGG(DISTINCT productSKU LIMIT 5) AS SKU,
    v2ProductName
FROM `data-to-insights.ecommerce.all_sessions_raw`
WHERE productSKU IS NOT NULL
GROUP BY v2ProductName
HAVING SKU_count > 1
ORDER BY SKU_count DESC
# product name is not unique (expected for variants)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,SKU_count,SKU,v2ProductName
0,12,"9184707,9184710,9184711,GGOEWALJ083416,9184709",Waze Women's Typography Short Sleeve Tee
1,10,"9182739,GGOEGADJ056813,GGOEGADJ056815,GGOEGADJ...",Google Men's Watershed Full Zip Hoodie Grey
2,10,"GGOEGAAX0282,GGOEAAEJ028214,9182759,9182179,91...",Android Women's Short Sleeve Badge Tee Dark He...
3,10,"9180829,GGOEGHGC019799,GGOEGHGH019699,GGOEGHGT...",Google Sunglasses
4,10,"GGOEGAPL058514,GGOEGAPL058513,GGOEGAPL058516,G...",Google Women's Insulated Thermal Vest Navy
...,...,...,...
488,2,"9180752,GGOEADHJ015599",Android 24 oz Button Lid Sport Water Bottle Smoke
489,2,"GGOEGHPL003113,9181574",Google Stretch Fit Hat S/M Navy
490,2,"9180751,GGOEADHH015499",Android 24 oz Button Lid Sport Water Bottle Green
491,2,"GGOEGAAX0579,9182773",Google Women's Badge Performance Tee Charcoal


Do some product names have more than one SKU? Look at the query results to confirm.

Answer: Yes

It may also be true that one product name be associated with more than one SKU. This can happen due to variation. For example, one product name (e.g. T-Shirt) can have multiple product variants like color, size, etc. You would expect one product to have many SKUs.



In [15]:
%%bigquery
-- For each SKU, count its distinct product names and list up to five of
-- them, keeping only the SKUs that map to more than one name.
-- GROUP BY already yields exactly one row per productSKU, so the
-- redundant SELECT DISTINCT has been dropped; the result set is unchanged.
SELECT
    COUNT(DISTINCT v2ProductName) AS product_count,
    STRING_AGG(DISTINCT v2ProductName LIMIT 5) AS product_name,
    productSKU
FROM `data-to-insights.ecommerce.all_sessions_raw`
WHERE v2ProductName IS NOT NULL
GROUP BY productSKU
HAVING product_count > 1
ORDER BY product_count DESC
# SKU is not unique (indicates data quality issues)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,product_count,product_name,productSKU
0,3,"7&quot; Dog Frisbee,Google 7-inch Dog Flying D...",GGOEGAAX0098
1,3,"Waterpoof Gear Bag,Waterproof Gear Bag,Google ...",GGOEGBMC056599
2,3,"Set of 3 Nested Travel Cases,BRIGHTtravels Set...",GGOEGCLB020832
3,3,"Micro Wireless Earbud,Micro Wireless Earbuds,A...",GGOEGEVA022399
4,3,"Nest® Learning Thermostat 3rd Gen-USA,Nest® Le...",GGOENEBJ079499
...,...,...,...
342,2,Google Women's Short Sleeve Performance Tee Pe...,9182769
343,2,Google Men's Short Sleeve Performance Badge Te...,9182747
344,2,"Google Men's Performance Badge Tee Navy,Google...",9182749
345,2,"YouTube Men's Short Sleeve Hero Tee White,You...",GGOEYAAQ031717


It looks like there are quite a few SKUs that have more than one product name. Several of the product names appear to be closely related with a few misspellings (e.g. Waterpoof Gear Bag vs Waterproof Gear Bag).

You see why this could be an issue in the next section.

A SKU is designed to uniquely identify one product and will be the basis of your join condition when you join against other tables. Having a non-unique key can cause serious data issues.

Write a query to identify all the product names for the SKU 'GGOEGPJC019099'.


In [16]:
%%bigquery
-- List every product name the website records under one specific SKU.
SELECT DISTINCT
    v2ProductName,
    productSKU
FROM
    `data-to-insights.ecommerce.all_sessions_raw`
WHERE
    productSKU = 'GGOEGPJC019099'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,v2ProductName,productSKU
0,"7"" Dog Frisbee",GGOEGPJC019099
1,7&quot; Dog Frisbee,GGOEGPJC019099
2,Google 7-inch Dog Flying Disc Blue,GGOEGPJC019099


From the query results, it looks like there are three different names for the same product. In this example, there is a special character in one name and a slightly different name for another:

See the impact of joining on a dataset with multiple products for a single SKU. First, explore the product inventory dataset (the products table) to see if this SKU is unique there.

In [19]:
%%bigquery
#standardSQL
-- Look the same SKU up in the products table (the one that carries
-- inventory) to see how many rows it has there.
SELECT
    *
FROM
    `data-to-insights.ecommerce.products`
WHERE
    SKU = 'GGOEGPJC019099'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,SKU,name,orderedQuantity,stockLevel,restockingLeadTime,sentimentScore,sentimentMagnitude
0,GGOEGPJC019099,"7"" Dog Frisbee",133,154,11,0.8,0.3


**Join pitfall: Unintentional many-to-one SKU relationship**

Next, join the inventory dataset against your website product names and SKUs so you can have the inventory stock level associated with each product for sale on the website.

In [20]:
%%bigquery
-- Join website names to inventory for one SKU. Each website name variant
-- pairs with the single inventory row, so stockLevel repeats per variant.
SELECT DISTINCT
    website.v2ProductName,
    website.productSKU,
    inventory.stockLevel
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
INNER JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU
WHERE website.productSKU = 'GGOEGPJC019099'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,v2ProductName,productSKU,stockLevel
0,Google 7-inch Dog Flying Disc Blue,GGOEGPJC019099,154
1,"7"" Dog Frisbee",GGOEGPJC019099,154
2,7&quot; Dog Frisbee,GGOEGPJC019099,154


What happens when you join the website table and the product inventory table on SKU? Do you now have inventory stock levels for the product?

Answer: Yes but the stockLevel is showing three times (one for each record)!

Next, run a query that shows the total stock level for each item in inventory.



In [21]:
%%bigquery
#standardSQL
-- Demonstrates the over-counting pitfall: the joined rows repeat the same
-- stockLevel once per product-name variant, and SUM adds all of them up.
WITH website_inventory AS (
    SELECT DISTINCT
        website.v2ProductName,
        website.productSKU,
        inventory.stockLevel
    FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
    INNER JOIN `data-to-insights.ecommerce.products` AS inventory
        ON website.productSKU = inventory.SKU
    WHERE website.productSKU = 'GGOEGPJC019099'
)
SELECT
    productSKU,
    SUM(stockLevel) AS total_inventory
FROM website_inventory
GROUP BY productSKU

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,productSKU,total_inventory
0,GGOEGPJC019099,462


It is 154 x 3 = 462 or triple counting. This is called an unintentional cross join



**Join pitfall solution: use distinct SKUs before joining**
What are the options to solve your triple counting dilemma? First, you need to only select distinct SKUs from the website before joining on other datasets.

Write a query to return the count of distinct productSKU from data-to-insights.ecommerce.all_sessions_raw.


In [22]:
%%bigquery
#standardSQL
-- Number of distinct SKUs that appear in the website data.
SELECT
    COUNT(DISTINCT website.productSKU) AS distinct_sku_count
FROM
    `data-to-insights.ecommerce.all_sessions_raw` AS website

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,distinct_sku_count
0,1909


**Join pitfall: Losing data records after a join**

Now you're ready to join against your product inventory dataset again.

In [23]:
%%bigquery
#standardSQL
-- Distinct website SKUs that have a matching row in the products table.
-- A plain JOIN is an inner join, so unmatched SKUs are dropped.
SELECT DISTINCT
    website.productSKU
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
INNER JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,productSKU
0,9180793
1,9180833
2,9180838
3,9180840
4,9180844
...,...
1085,GGOEGAEJ028916
1086,GGOEGAUC057717
1087,GGOEAAWT061754
1088,GGOEGALJ072916


How many records were returned? All 1,909 distinct SKUs?

Answer: No, just 1,090 records

You lost 819 SKUs after joining the datasets, investigate by adding more specificity in your fields.

In [24]:
%%bigquery
#standardSQL
-- Show the key from both sides of the join next to each other.
-- With an inner join every returned row has both IDs, so this alone does
-- not reveal which SKUs went missing.
SELECT DISTINCT
    website.productSKU AS website_SKU,
    inventory.SKU      AS inventory_SKU
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
INNER JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,website_SKU,inventory_SKU
0,9180793,9180793
1,9180824,9180824
2,9181019,9181019
3,9182502,9182502
4,9182569,9182569
...,...,...
1085,GGOEYAEJ029017,GGOEYAEJ029017
1086,9184683,9184683
1087,GGOEAAEB028316,GGOEAAEB028316
1088,GGOEAAWT061754,GGOEAAWT061754


It appears the SKUs are present in both of those datasets after the join.

Join pitfall solution: Selecting the correct join type and filtering for NULL
The default JOIN type is an INNER JOIN which returns records only if there is a match on both the left and the right tables that are joined.

Rewrite the previous query to use a different join type to include all records from the website table, regardless of whether there is a match on a product inventory SKU record. Join type options: INNER JOIN, LEFT JOIN, RIGHT JOIN, FULL JOIN, CROSS JOIN


In [25]:
%%bigquery
#standardSQL
-- Switch to a LEFT JOIN so every website SKU is kept; SKUs without an
-- inventory match come back with inventory_SKU empty.
SELECT DISTINCT
    website.productSKU AS website_SKU,
    inventory.SKU      AS inventory_SKU
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
LEFT JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,website_SKU,inventory_SKU
0,9180793,9180793
1,9180824,9180824
2,9181019,9181019
3,9182569,9182569
4,9182593,9182593
...,...,...
1904,9182181,
1905,10 84023,
1906,10 14159,
1907,GGOEGAXC074829,


Write a query to filter on NULL values from the inventory table.

In [26]:
%%bigquery
#standardSQL
-- Website SKUs with no counterpart in the products table: keep all website
-- rows via LEFT JOIN, then retain only those where the inventory key is NULL.
SELECT DISTINCT
    website.productSKU AS website_SKU,
    inventory.SKU      AS inventory_SKU
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
LEFT JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU
WHERE
    inventory.SKU IS NULL

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,website_SKU,inventory_SKU
0,9182668,
1,9182838,
2,9183211,
3,GGOEGCGB022199,
4,GGOEGBJR018199,
...,...,...
814,GGOEGAXC074855,
815,GGOEGAPJ058017,
816,GGOEGAXC074829,
817,10 75145,


to confirm using one of the specific SKUs from the website dataset:



In [28]:
%%bigquery
#standardSQL
-- Spot-check one SKU directly against the products table.
-- The query returns zero rows.
SELECT
    *
FROM
    `data-to-insights.ecommerce.products`
WHERE
    SKU = 'GGOEGATJ060517'

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,SKU,name,orderedQuantity,stockLevel,restockingLeadTime,sentimentScore,sentimentMagnitude


Why might the product inventory dataset be missing SKUs?

Answer: Unfortunately, there is no easy answer. It is most likely a business-related question:

- Some SKUs could be digital products that you don't store in inventory
- Old products you sold in past website orders are no longer offered in current inventory
- Legitimate missing data from inventory and should be tracked

Are there any products that are in the product inventory dataset but missing from the website?

Write a query using a different join type to investigate.

In [29]:
%%bigquery
#standardSQL
# reverse the join
# find records in inventory but not in website:
# RIGHT JOIN keeps every row of the products (inventory) table, and the
# NULL filter retains only those with no matching website SKU
SELECT DISTINCT
    website.productSKU AS website_SKU,
    inventory.SKU AS inventory_SKU
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
RIGHT JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU
WHERE website.productSKU IS NULL

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,website_SKU,inventory_SKU
0,,GGOBJGOWUSG69402
1,,GGADFBSBKS42347


Answer: Yes. There are two product SKUs missing from the website dataset

Next, add more fields from the product inventory dataset for more details.

In [30]:
%%bigquery
#standardSQL
-- Same inventory-only rows as the previous query, now with every column of
-- the products table so the products can be identified.
SELECT DISTINCT
    website.productSKU AS website_SKU,
    inventory.*
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
RIGHT JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU
WHERE
    website.productSKU IS NULL

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,website_SKU,SKU,name,orderedQuantity,stockLevel,restockingLeadTime,sentimentScore,sentimentMagnitude
0,,GGOBJGOWUSG69402,USB wired soundbar - in store only,10,15,2,1.0,1.0
1,,GGADFBSBKS42347,PC gaming speakers,0,100,1,,


- One new product (no orders, no sentimentScore) and one product that is "in store only"
- Another is a new product with 0 orders

Why would the new product not show up on your website dataset?

- The website dataset consists of past order transactions by customers. Brand-new products that have never been sold won't show up in web analytics until they're viewed or purchased.

**Note:** You typically will not see RIGHT JOINs in production queries. You would simply just do a LEFT JOIN and switch the ordering of the tables.

What if you wanted one query that listed all products missing from either the website or inventory?

In [31]:
%%bigquery
#standardSQL
-- FULL JOIN keeps unmatched rows from both tables; the filter then retains
-- only the SKUs that are missing on one side or the other.
SELECT DISTINCT
    website.productSKU AS website_SKU,
    inventory.SKU      AS inventory_SKU
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
FULL JOIN `data-to-insights.ecommerce.products` AS inventory
    ON website.productSKU = inventory.SKU
WHERE
    website.productSKU IS NULL
    OR inventory.SKU IS NULL

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,website_SKU,inventory_SKU
0,9183074,
1,GGOEGATB060418,
2,GGOEYAAB034615,
3,9183167,
4,GGOEAAEB031216,
...,...,...
816,GGOEGAEJ030717,
817,9180810,
818,9182770,
819,10 52246,


You have your 819 + 2 = 821 product SKUs

LEFT JOIN + RIGHT JOIN = FULL JOIN, which returns all records from both tables regardless of matching join keys. You then keep only the rows where there is a mismatch on either side.

**Join pitfall: Unintentional Cross Join** 

Not knowing the relationship between data table keys (1:1, 1:N, N:N) can return unexpected results and also significantly reduce query performance.

The last join type is the CROSS JOIN.

Create a new table with a site-wide discount percent that you want to apply across products in the Clearance category.

Replacing the table named dataset-***.




In [32]:
%%bigquery
#standardSQL
# (Re)create the promotion table with a single row holding the discount.
CREATE OR REPLACE TABLE ecommerce.site_wide_promotion AS
SELECT
  0.05 AS discount;

Query is running:   0%|          |

In the left pane, site_wide_promotion is now listed in the Resource section under qwiklabs-gcp-xxx > ecommerce.

Copy and paste the below query to find out how many products are in clearance:



In [33]:
%%bigquery
# Pair each Clearance product row with every row of the promotion table.
SELECT DISTINCT
  website.productSKU,
  website.v2ProductCategory,
  promotion.discount
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
CROSS JOIN ecommerce.site_wide_promotion AS promotion
WHERE website.v2ProductCategory LIKE '%Clearance%'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,productSKU,v2ProductCategory,discount
0,GGOEGAAX0168,Home/Clearance Sale/,0.05
1,GGOEAHPA004110,Home/Clearance Sale/,0.05
2,GGOEGOAB021499,Home/Clearance Sale/,0.05
3,GGOEGOAJ021599,Home/Clearance Sale/,0.05
4,GGOEYOCR077399,Home/Clearance Sale/,0.05
...,...,...,...
77,GGOEGAAX0566,Home/Clearance Sale/,0.05
78,GGOEGAAX0595,Home/Clearance Sale/,0.05
79,GGOEGAAX0570,Home/Clearance Sale/,0.05
80,GGOEGAAX0606,Home/Clearance Sale/,0.05


**Note:** In the syntax there is no join condition (e.g. ON or USING) for a CROSS JOIN. Every row of the first dataset is simply paired with every row of the second, so the single .05 discount is applied across all items.
Let's see the impact of unintentionally adding more than one record in the discount table.

Copy and paste the below query to insert two more records into the promotion table:



In [34]:
%%bigquery
#standardSQL
# Append two more discount rows to the promotion table.
INSERT INTO ecommerce.site_wide_promotion (discount)
VALUES
  (0.04),
  (0.03);
       

Query is running:   0%|          |

Next let's view the data values in the promotion table.

In [35]:
%%bigquery
#standardSQL
# Show every discount value currently stored in the promotion table.
SELECT
  discount
FROM ecommerce.site_wide_promotion

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,discount
0,0.03
1,0.04
2,0.05


What happens when you apply the discount again across all 82 clearance products?

In [36]:
%%bigquery
#standardSQL
# Re-run the Clearance cross join now that the promotion table
# holds more than one row.
SELECT DISTINCT
  website.productSKU,
  website.v2ProductCategory,
  promotion.discount
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
CROSS JOIN ecommerce.site_wide_promotion AS promotion
WHERE website.v2ProductCategory LIKE '%Clearance%'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,productSKU,v2ProductCategory,discount
0,GGOEGAAX0571,Home/Clearance Sale/,0.05
1,GGOEAOCB077499,Home/Clearance Sale/,0.04
2,GGOEGCBB074399,Home/Clearance Sale/,0.03
3,GGOEGESQ016799,Home/Clearance Sale/,0.03
4,GGOEGOCB017499,Home/Clearance Sale/,0.03
...,...,...,...
241,GGOEGAAX0351,Home/Clearance Sale/,0.05
242,GGOEGAAX0331,Home/Clearance Sale/,0.03
243,GGOEGAAX0327,Home/Clearance Sale/,0.05
244,GGOEGOAB012999,Home/Clearance Sale/,0.03


How many products are returned?

Answer: Instead of 82, you now have 246 returned which is more records than your original table started with.

Let's investigate the underlying cause by examining one product SKU.

In [37]:
%%bigquery
#standardSQL
# Narrow the cross join to one SKU to see how many rows it produces.
SELECT DISTINCT
  website.productSKU,
  website.v2ProductCategory,
  promotion.discount
FROM `data-to-insights.ecommerce.all_sessions_raw` AS website
CROSS JOIN ecommerce.site_wide_promotion AS promotion
WHERE website.productSKU = 'GGOEGOLC013299'
  AND website.v2ProductCategory LIKE '%Clearance%'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,productSKU,v2ProductCategory,discount
0,GGOEGOLC013299,Home/Clearance Sale/,0.03
1,GGOEGOLC013299,Home/Clearance Sale/,0.05
2,GGOEGOLC013299,Home/Clearance Sale/,0.04


What was the impact of the CROSS JOIN?

Answer:

Since there are 3 discount codes to cross join on, you are multiplying the original dataset by 3.

Note: This behavior isn't limited to cross joins. With a normal join you can unintentionally cross join when the data relationships are many-to-many, which can easily result in returning millions or even billions of records.
The solution is to know your data relationships before you join, and not to assume keys are unique.

**Task 7. Deduplicating rows**

At the start of the lab you wrote a query that showed multiple product names for a single SKU. Deduplicating records like this is a common skill for data analysts. Examine one way you can select only one product per SKU.

First, start with the query to show all product names per SKU.



In [38]:
%%bigquery
# Recall the earlier query that showed multiple product names for each SKU.
# GROUP BY productSKU already yields exactly one row per SKU, so a
# SELECT DISTINCT on top of the aggregation is redundant and is omitted.
SELECT
  COUNT(DISTINCT v2ProductName) AS product_count,
  # Show at most 5 of the distinct names recorded for the SKU.
  STRING_AGG(DISTINCT v2ProductName LIMIT 5) AS product_name,
  productSKU
FROM `data-to-insights.ecommerce.all_sessions_raw`
WHERE v2ProductName IS NOT NULL
GROUP BY productSKU
# Keep only SKUs that map to more than one product name.
HAVING product_count > 1
ORDER BY product_count DESC

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,product_count,product_name,productSKU
0,3,"Softsided Travel Pouch Set,Set of 3 Nested Tra...",GGOEGCLB020832
1,3,"7"" Dog Frisbee,7&quot; Dog Frisbee,Google 7-in...",GGOEGAAX0098
2,3,Nest® Learning Thermostat 3rd Gen-USA - Stainl...,GGOENEBJ079499
3,3,"7"" Dog Frisbee,7&quot; Dog Frisbee,Google 7-in...",GGOEGPJR018999
4,3,YouTube Women's Short Sleeve Hero Tee Charcoal...,GGOEYAEJ029015
...,...,...,...
342,2,"Google Collapsible Pet Bowl,Collapsible Pet Bowl",9182909
343,2,Google Women's Short Sleeve Badge Tee Ice Gray...,9182647
344,2,Google Women's Short Sleeve Performance Tee Bl...,GGOEGALB059916
345,2,"Collapsible Pet Bowl,Google Collapsible Pet Bowl",9180872


Since most of the product names are extremely similar (and you want to map a single SKU to a single product), write a query to only choose one of the product_names. You will be using this StackOverflow post by Felipe Hoffa as inspiration.



In [39]:
%%bigquery
# Keep a single (name, SKU) row for each SKU.
WITH product_query AS (
  SELECT DISTINCT
    v2ProductName,
    productSKU
  FROM `data-to-insights.ecommerce.all_sessions_raw`
  WHERE v2ProductName IS NOT NULL
)
SELECT
  picked.*
FROM (
  # Collect each SKU's rows into an array capped at one element,
  # then unwrap that single element back into a row.
  SELECT
    ARRAY_AGG(candidate LIMIT 1)[OFFSET(0)] AS picked
  FROM product_query AS candidate
  GROUP BY productSKU  # the field being deduplicated
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,v2ProductName,productSKU
0,26 oz Double Wall Insulated Bottle,9180793
1,Rubber Grip Ballpoint Pen 4 Pack,9180833
2,Metal Texture Roller Pen,9180838
3,Maze Pen,9180842
4,Gunmetal Roller Ball Pen,9180844
...,...,...
1904,Android Women's Fleece Hoodie,GGOEAAQB036117
1905,Google Adult Tee Fruit Games Cherries,GGOEGAAC075815
1906,Google Women's Insulated Thermal Vest Navy,GGOEGAPL058516
1907,Eco-Jersey Chrome Zip Up Hoodie,10 14159
