In [1]:
# BigQuery Setup
# Importing Libraries and Credentials
import pandas as pd
import numpy as np
import seaborn as sns
from google.cloud import bigquery
from google.oauth2 import service_account
# ignore warnings
from warnings import filterwarnings
filterwarnings("ignore")


%load_ext google.cloud.bigquery

credentials = service_account.Credentials.from_service_account_file('/Users/ssamilozkan/Desktop/BigQuery/config.json')

project_id = 'dbt-bigquery-setup-369911'
client = bigquery.Client(credentials= credentials, project=project_id)


In [6]:
%%bigquery
SELECT
    totrevenue
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
LIMIT 10
# Quick look at ten raw revenue values from the 2015 filings table.
# There is no ORDER BY, so which ten rows come back is not guaranteed.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,totrevenue
0,9475129863
1,9021585970
2,9890722789
3,1094833976
4,2186337569
5,2086259022
6,1745011054
7,1711501686
8,507813618
9,1229151613


`FORMAT("%'d", totrevenue)`

Function = Performs an Action

Parameters = Inputs you provide

`SELECT FORMAT ("%'d", 1000)`
-> returns '1,000'

**Beware of stylistic formatting in SQL:**

    - Your output is now treated like a string. This makes math operations on this calculated field more difficult.
    
    - It's best to save stylistic elements for your visualization tool.

In [19]:
%%bigquery
SELECT
    FORMAT("%'d", totrevenue) AS revenue
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
ORDER BY
    totrevenue DESC
LIMIT 10
# Thousands separators make the large revenue figures much easier to read,
# but the formatted value is a string, which brings a couple of caveats.
# The sort uses the numeric totrevenue column rather than the formatted alias.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue
0,45409123226
1,20796549014
2,11091388129
3,10098163008
4,9890722789
5,9475129863
6,9021585970
7,8655129029
8,7523260077
9,6740015230


**Aliases do not exist yet when filtering in WHERE**

In [41]:
%%bigquery
SELECT 
    (totrevenue - totfuncexpns) AS income
FROM 
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE income > 0 # The income alias does not exist yet in WHERE, so this line errors
ORDER BY income DESC
LIMIT 10
# This cell fails on purpose: it demonstrates the Unrecognized name error
# raised when a SELECT alias is used in WHERE.
# To filter on the calculation, repeat the expression instead:
#   WHERE (totrevenue - totfuncexpns) > 0
# ORDER BY may use an alias, as the later cells that sort by revenue show.

Executing query with job ID: 31d0f574-fde3-420e-9c03-086a9336b882
Query executing: 0.68s


ERROR:
 400 Unrecognized name: income at [5:7]

Location: US
Job ID: 31d0f574-fde3-420e-9c03-086a9336b882



**Add new fields in SELECT clause to return more data**

In [44]:
%%bigquery
SELECT 
    totrevenue AS revenue,
    ein, # EIN (employer identification number): a unique identifier for the charity
    operateschools170cd AS is_school # flag field indicating whether the charity is a school
FROM 
    `bigquery-public-data.irs_990.irs_990_2015`
ORDER BY revenue DESC
LIMIT 10
# The field notes live inside the query because the %%bigquery cell magic
# has to be the first line of the cell.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue,ein,is_school
0,45409123226,941340523,N
1,20796549014,941105628,N
2,11091388129,900656139,N
3,10098163008,208295721,N
4,9890722789,900424876,N
5,9475129863,390123480,N
6,9021585970,390123480,N
7,8655129029,941196203,N
8,7523260077,912153073,N
9,6740015230,42103580,Y


### Filters, aggregates, and duplicates


In [49]:
%%bigquery
SELECT
    totrevenue AS revenue,
    ein,
    operateschools170cd AS is_school
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE operateschools170cd = 'Y'
ORDER BY
    revenue DESC
LIMIT 10
# Top ten school-operating nonprofits by revenue.
# Why is the filter not written as is_school = 'Y'?
# Because a SELECT alias does not exist yet when WHERE is evaluated,
# so the filter has to name the underlying column.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue,ein,is_school
0,6740015230,42103580,Y
1,6000839000,231352685,Y
2,5717023246,941156365,Y
3,5569004000,520595110,Y
4,5133788413,135562308,Y
5,4623485966,951642394,Y
6,4560196033,416011702,Y
7,4477633568,60646973,Y
8,4471027733,135598093,Y
9,4368738915,150532082,Y


**Perform calculations over values with aggregation**

In [50]:
%%bigquery
SELECT
    SUM(totrevenue)      AS total_2015_revenue,  # revenue summed over every row
    AVG(totrevenue)      AS avg_revenue,         # mean revenue per row
    COUNT(ein)           AS nonprofits,          # filings (rows with an ein)
    COUNT(DISTINCT ein)  AS nonprofits_distinct, # unique organisations
    MAX(noemplyeesw3cnt) AS num_employees        # largest employee count on any filing
FROM `bigquery-public-data.irs_990.irs_990_2015`
# No GROUP BY: the aggregates collapse the whole table into a single summary row.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_2015_revenue,avg_revenue,nonprofits,nonprofits_distinct,num_employees
0,2344355088288,7952843.0,294782,275077,787050


**Embed functions inside of other functions**

In [52]:
%%bigquery
SELECT
    SUM(totrevenue)           AS total_2015_revenue,
    ROUND(AVG(totrevenue), 2) AS avg_revenue,  # nested call: AVG runs first, ROUND trims it to 2 decimals
    COUNT(ein)                AS nonprofits,
    COUNT(DISTINCT ein)       AS nonprofits_distinct,
    MAX(noemplyeesw3cnt)      AS num_employees
FROM `bigquery-public-data.irs_990.irs_990_2015`
# Same summary as the previous cell, with the average rounded for readability.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_2015_revenue,avg_revenue,nonprofits,nonprofits_distinct,num_employees
0,2344355088288,7952843.42,294782,275077,787050


In [56]:
%%bigquery
SELECT
    ein,                     # grouping column, not aggregated
    COUNT(ein) AS ein_count  # aggregated: number of rows per ein
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
GROUP BY
    ein
ORDER BY
    ein_count DESC
# Many charities have more than one record in the 2015 filings table, which is highly unusual.
# The next step counts how often this happens in total.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,ein_count
0,431859076,7
1,362235151,7
2,208367574,7
3,830345294,7
4,841604402,7
...,...,...
275072,222629185,1
275073,880275767,1
275074,150539118,1
275075,770057903,1


**Filter aggregation with HAVING clause**

- HAVING is very very useful when we're filtering aggregations.

In [58]:
%%bigquery
SELECT
    ein,                     # grouping column, not aggregated
    COUNT(ein) AS ein_count  # aggregated: number of rows per ein
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
GROUP BY
    ein
HAVING
    ein_count > 1
ORDER BY
    ein_count DESC
# HAVING filters on the aggregated count, keeping only EINs that appear more than once.


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,ein_count
0,431859076,7
1,362235151,7
2,208367574,7
3,830345294,7
4,841604402,7
...,...,...
17989,860593601,2
17990,203502737,2
17991,942324340,2
17992,942662962,2


In [62]:
# Share of distinct EINs that appear more than once in the 2015 table.
# distinct_eins is the nonprofits_distinct value from the aggregate query above.
# eins_with_multiple_filings is presumably the row count of the HAVING query
# above -- TODO confirm against the full result.
eins_with_multiple_filings = 17994
distinct_eins = 275077
multiple_filing_pct = eins_with_multiple_filings / distinct_eins * 100
print(multiple_filing_pct)

6.541441123758076


**Explore further by filtering on one nonprofit**

- We have seven paper filings for one EIN in the 2015 table. The latest covers tax period 2014, since you file your taxes the year after the actual tax period.
- So in the 2015 calendar year, this EIN also has filings for the 2008 through 2014 tax periods.
- This could be due to human error, dirty data, or the organization submitting filings for more than one tax period.

In [64]:
%%bigquery
SELECT
    *
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE
    ein = '262152334'
# Pull every column for a single EIN to inspect its filings row by row.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,elf,tax_pd,subseccd,s501c3or4947a1cd,schdbind,politicalactvtscd,lbbyingactvtscd,subjto6033cd,dnradvisedfundscd,...,exceeds1pct509,subtotpub509,pubsupplesub509,samepubsuppsubtot509,grsinc509,unreltxincls511tx509,subtotsuppinc509,netincunrelatd509,othrinc509,totsupp509
0,262152334,P,201012,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
1,262152334,P,200812,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
2,262152334,P,201412,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
3,262152334,P,201312,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
4,262152334,P,201212,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
5,262152334,P,200912,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
6,262152334,P,201112,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0


- So if we just wanted 2014, we can invoke a date filtering function

In [68]:
%%bigquery
SELECT
    ein,
    tax_pd,
    PARSE_DATE('%Y%m', CAST(tax_pd AS STRING)) AS tax_period
FROM
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE EXTRACT(YEAR FROM PARSE_DATE('%Y%m', CAST(tax_pd AS STRING))) = 2014
LIMIT 10
# tax_pd is cast to a string and parsed as year plus month to get a real DATE.
# The parse expression is repeated in WHERE because the tax_period alias
# is not available there; EXTRACT then keeps only the 2014 tax periods.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,tax_pd,tax_period
0,390123480,201412,2014-12-01
1,900424876,201412,2014-12-01
2,520891669,201405,2014-05-01
3,382227794,201406,2014-06-01
4,361493430,201412,2014-12-01
5,910219435,201412,2014-12-01
6,470339250,201412,2014-12-01
7,940362025,201412,2014-12-01
8,366066772,201412,2014-12-01
9,946069237,201412,2014-12-01


**Handle NULL values with extreme care**

In [70]:
%%bigquery
SELECT
    ein, street, city, state, zip
FROM
    `bigquery-public-data.irs_990.irs_990_ein`
WHERE state IS NULL
LIMIT 10
# Address fields for organisations whose state is missing.
# NULL is tested with IS NULL rather than with an equals comparison.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,street,city,state,zip
0,352651768,NAIROBI,KENYA,,00000-0000
1,980031007,STOCKHOLM 11526,SWEDEN,,00000-0000
2,237069967,TORONTO ON M3J 1P3,CANADA,,00000-0000
3,364867900,BEIJING,CHINA,,00000-0000
4,986064893,BIKENIBEU TARAWA,KIRIBATI,,00000-0000
5,60706038,ST GEORGES GE 01,BERMUDA,,00000-0000
6,61497455,SAVIESE VALAIS CH 1965,SWITZERLAND,,00000-0000
7,980537324,WOLFVILLE NOVA SCOTIA B4P 2R6,CANADA,,00000-0000
8,981426715,OXFORD OX1-1HU,UNITED KINGDOM,,00000-0000
9,237099181,OPERA MI,ITALY,,00000-0000


**Parsing String Values with String Functions**

- `CONCAT("12345","67890")`  -> "1234567890"
- `ENDS_WITH("Apple","e")` -> true
- `LOWER("Apple")` -> "apple"
- `REGEXP_CONTAINS("Lunchbox", r"^.*box$")` -> true




**Wildcard filters with LIKE**

In [71]:
%%bigquery
SELECT
    ein, name
FROM
    `bigquery-public-data.irs_990.irs_990_ein`
WHERE LOWER(name) LIKE '%help%'
LIMIT 10
# Lower-casing the name makes the match case-insensitive.
# A wildcard on both sides finds help anywhere in the name,
# including inside longer words.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,name
0,203297489,MORGAN AUTO GROUP HELPING HAND FUND INC
1,205763648,JOSH FARLER HELPING HANDS FOUNDATION
2,264779838,HEAVENS HELPING HANDS MINISTRY INC
3,134045651,FRIENDS HELP FRIENDS INC
4,812832750,FAMILY HELP INSTITUTE INC
5,823676844,LOUS HELPING HAND FOUNDATION
6,462519710,BRAY HELPING BRAY INC
7,465025689,SN PHELPS RESEARCH INSTITUTE INC
8,453909386,DARKE COUNTY PREGNANCY HELP CENTER INC
9,830253203,TURNING POINT LINCOLN COUNTYS SELF- HELP CENTER


In [72]:
%%bigquery
SELECT
    ein, name
FROM
    `bigquery-public-data.irs_990.irs_990_ein`
WHERE LOWER(name) LIKE 'help%'
LIMIT 10
# With the wildcard only at the end, the name has to start with help.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,name
0,871738298,HELPING HANDS FAMILY OUTREACH INC
1,952758212,HELP FOR BRAIN INJURED CHILDREN INC
2,270477340,HELPNEST INC
3,461652118,HELP A DIABETIC CHILD INC
4,742367192,HELPING HANDS OF JACKSON COUNTY INC
5,461634728,HELPING HIS HANDS DISASTER RESPONSE INC
6,464291756,HELPING KIDS ROUND FIRST
7,472964918,HELP-SIDE FOUNDATION
8,870683277,HELP INTERNATIONAL
9,137409533,HELPING HAND FOUNDATION TR


**Introducing JOINs and UNIONs**

In [76]:
%%bigquery
SELECT *
FROM `bigquery-public-data.noaa_gsod.stations`
LIMIT 5
# First look at the weather stations table: every column, five rows.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,usaf,wban,name,country,state,call,lat,lon,elev,begin,end
0,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
1,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
2,7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
3,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323
4,8307,99999,WXPOD 8318,AF,,,0.0,0.0,8318.0,20100421,20100421


In [None]:
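# usaf on its own is not unique in the stations table (compare the total and
# distinct counts in the next cell), so you have to create your own primary key.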

In [74]:
%%bigquery
SELECT
    COUNT(usaf)          AS total_count,    # every row that has a usaf value
    COUNT(DISTINCT usaf) AS distinct_count  # unique usaf values
FROM
    `bigquery-public-data.noaa_gsod.stations`
# If the two counts differ, usaf cannot serve as a unique key by itself.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_count,distinct_count
0,29590,26189


**Union for vertically merging your data**

- `UNION DISTINCT` -> removes duplicates
- `UNION ALL` -> keeps every record, including duplicates

In [81]:
%%bigquery
SELECT stn, wban, temp, year
FROM `bigquery-public-data.noaa_gsod.gsod1929`
UNION DISTINCT
SELECT stn, wban, temp, year
FROM `bigquery-public-data.noaa_gsod.gsod1930`
# Stacks the 1929 and 1930 readings vertically; UNION DISTINCT drops duplicate rows.
# Both SELECT lists name the same columns in the same order.


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stn,wban,temp,year
0,037770,99999,50.7,1930
1,038560,99999,53.2,1930
2,038560,99999,58.5,1930
3,039730,99999,40.2,1930
4,039730,99999,57.0,1930
...,...,...,...,...
4526,990061,99999,65.7,1929
4527,990061,99999,69.1,1929
4528,990061,99999,65.1,1929
4529,990061,99999,62.3,1929


**Use table wildcards for easy merges** 

- Suppose you have quite a few tables that you need to bring together. With the UNION syntax you select all the fields you want from one table, write `UNION DISTINCT`, select from the next table, write `UNION DISTINCT` again, and so on for every table. With more than 10 tables your fingers are going to get tired of typing them all. Nobody wants to type 100 unions, and the query becomes extremely long vertically and hard to read.

```
SELECT 
    stn,wban,temp, year
FROM `bigquery-public-data.noaa_gsod.gsod1929`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1930`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1931`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1932`
    UNION DISTINCT
SELECT stn,wban,temp, year 
FROM `bigquery-public-data.noaa_gsod.gsod1933`
...
..
.
```

- wildcard is your solution ```SELECT * FROM `bigquery-public-data.noaa_gsod.gsod*` ```


In [84]:
%%bigquery
SELECT
    stn,
    wban,
    temp,
    year
FROM
    `bigquery-public-data.noaa_gsod.gsod*`
LIMIT 10
# The trailing * is a table wildcard: it reads every table whose name starts
# with gsod, replacing the long chain of UNIONs.

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stn,wban,temp,year
0,150520,99999,18.0,1967
1,150520,99999,52.0,1967
2,150520,99999,58.0,1967
3,150520,99999,8.5,1967
4,151700,99999,16.7,1967
5,151700,99999,59.0,1967
6,151700,99999,66.7,1967
7,170980,99999,33.5,1967
8,170980,99999,29.8,1967
9,170980,99999,52.2,1967


**Filtering with the table wildcard `*` and `_TABLE_SUFFIX`**

- What happens if you want to filter down to just a subset of the tables?
- For instance, you just want to find and match together all the temperatures from the tables after 1950.
- `_TABLE_SUFFIX` is another reserved keyword that's specific to BigQuery.


In [87]:
%%bigquery
SELECT
    stn,
    wban,
    temp,
    year
FROM
    `bigquery-public-data.noaa_gsod.gsod*`
WHERE
    _TABLE_SUFFIX > '1950'  -- only gsod tables after 1950 (string comparison on the suffix)
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stn,wban,temp,year
0,75550,99999,29.6,2008
1,164650,99999,52.7,2008
2,161780,99999,50.7,2008
3,897620,99999,-24.0,2008
4,895650,99999,33.3,2008
5,897620,99999,-19.8,2008
6,897620,99999,-14.3,2008
7,75550,99999,53.4,2008
8,75550,99999,70.9,2008
9,66090,99999,20.5,2008


- Use a table wildcard (`*`) instead of writing many UNIONs
- Use `_TABLE_SUFFIX` to filter which tables the wildcard includes
- Use `_TABLE_SUFFIX` in your SELECT statement with CONCAT()

**Avoid UNION pitfalls like brittle schemas**
- Duplicate records among tables (Use UNION DISTINCT instead of UNION ALL)
- Changing schemas and field names over time
- Mismatched count of columns in your UNION