In [1]:
# BigQuery Setup
# Importing Libraries and Credentials
import os

import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import bigquery
from google.oauth2 import service_account
# Ignore every warning for the rest of the session so that library warnings
# do not clutter the cell outputs. Be aware this also hides deprecation notices.
from warnings import filterwarnings
filterwarnings("ignore")


# Load the BigQuery IPython extension; the %%bigquery cell magic used by the
# query cells in this notebook comes from it.
%load_ext google.cloud.bigquery

# Location of the service-account key file. An environment variable takes
# precedence so the notebook is not tied to one machine's home directory;
# the original path stays as the fallback so the existing setup keeps working.
credentials_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/ssamilozkan/Desktop/BigQuery/config.json',
)
credentials = service_account.Credentials.from_service_account_file(credentials_path)

# Default project for the client; overridable the same way as the key file.
project_id = os.environ.get('GOOGLE_CLOUD_PROJECT', 'dbt-bigquery-setup-369911')

# NOTE(review): `client` is not referenced by the %%bigquery cells visible in
# this notebook; the magic presumably resolves its own credentials - confirm.
client = bigquery.Client(credentials=credentials, project=project_id)


In [6]:
%%bigquery
# Peek at ten raw total-revenue values from the 2015 IRS 990 table.
# There is no ORDER BY, so which ten rows come back is not guaranteed.
SELECT
    filings.totrevenue
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,totrevenue
0,9475129863
1,9021585970
2,9890722789
3,1094833976
4,2186337569
5,2086259022
6,1745011054
7,1711501686
8,507813618
9,1229151613


`FORMAT("%'d", totrevenue)`

Function = Performs an Action

Parameters = Inputs you provide

`SELECT FORMAT ("%'d", 1000)`
-> returns '1,000'

**Beware of stylistic formatting in SQL:**

    - Your output is now treated as a string, which makes math operations on this calculated field more difficult.
    
    - It's best to save stylistic elements for your visualization tool.

In [19]:
%%bigquery
# Ten largest revenues, shown with thousands separators.
# The numbers are much easier to read with commas, but there is a caveat:
# the formatted value is a string, so the sort below is done on the numeric
# source column rather than on the formatted alias.
SELECT
    FORMAT("%'d", filings.totrevenue) AS revenue
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings
ORDER BY
    filings.totrevenue DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue
0,45409123226
1,20796549014
2,11091388129
3,10098163008
4,9890722789
5,9475129863
6,9021585970
7,8655129029
8,7523260077
9,6740015230


**Aliases do not exist yet when filtering in WHERE**

In [41]:
%%bigquery
SELECT 
    (totrevenue - totfuncexpns) AS income
FROM 
    `bigquery-public-data.irs_990.irs_990_2015`
WHERE income > 0 # Intentional error: the alias `income` does not exist yet when WHERE is evaluated
ORDER BY income DESC
LIMIT 10
# This cell is expected to fail with "Unrecognized name: income".
# To filter on the calculation, repeat the expression in WHERE instead:
#   WHERE (totrevenue - totfuncexpns) > 0
# ORDER BY is allowed to use the alias, so that line is not the problem.

Executing query with job ID: 31d0f574-fde3-420e-9c03-086a9336b882
Query executing: 0.68s


ERROR:
 400 Unrecognized name: income at [5:7]

Location: US
Job ID: 31d0f574-fde3-420e-9c03-086a9336b882



**Add new fields in SELECT clause to return more data**

In [44]:
%%bigquery
# The %%bigquery cell magic has to be the first line of the cell, so these
# notes live inside the query text as SQL comments.
# EIN (employer identification number) is a unique identifier for that charity.
# is_school is a flag field indicating whether that charity is a school.
SELECT 
    totrevenue AS revenue,
    ein,
    operateschools170cd AS is_school
FROM 
    `bigquery-public-data.irs_990.irs_990_2015`
ORDER BY revenue DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue,ein,is_school
0,45409123226,941340523,N
1,20796549014,941105628,N
2,11091388129,900656139,N
3,10098163008,208295721,N
4,9890722789,900424876,N
5,9475129863,390123480,N
6,9021585970,390123480,N
7,8655129029,941196203,N
8,7523260077,912153073,N
9,6740015230,42103580,Y


### Filters, aggregates, and duplicates


In [49]:
%%bigquery
# Ten highest-revenue filings whose school flag is 'Y'.
# Why is the filter not written as is_school = 'Y'? Because WHERE cannot see
# aliases defined in SELECT, so it has to name the source column. ORDER BY can
# see them, which is why it sorts on the revenue alias.
SELECT
    filings.totrevenue          AS revenue,
    filings.ein,
    filings.operateschools170cd AS is_school
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings
WHERE
    filings.operateschools170cd = 'Y'
ORDER BY
    revenue DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,revenue,ein,is_school
0,6740015230,42103580,Y
1,6000839000,231352685,Y
2,5717023246,941156365,Y
3,5569004000,520595110,Y
4,5133788413,135562308,Y
5,4623485966,951642394,Y
6,4560196033,416011702,Y
7,4477633568,60646973,Y
8,4471027733,135598093,Y
9,4368738915,150532082,Y


**Perform calculations over values with aggregation**

In [50]:
%%bigquery
# One summary row for the whole 2015 table.
# COUNT(ein) counts rows that have an ein, COUNT(DISTINCT ein) counts each ein once.
# num_employees is the largest single noemplyeesw3cnt value, not a total.
SELECT
    SUM(filings.totrevenue)     AS total_2015_revenue,
    AVG(filings.totrevenue)     AS avg_revenue,
    COUNT(filings.ein)          AS nonprofits,
    COUNT(DISTINCT filings.ein) AS nonprofits_distinct,
    MAX(filings.noemplyeesw3cnt) AS num_employees
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_2015_revenue,avg_revenue,nonprofits,nonprofits_distinct,num_employees
0,2344355088288,7952843.0,294782,275077,787050


**Embed functions inside of other functions**

In [52]:
%%bigquery
# Same summary row as before, but the average is wrapped in ROUND(..., 2):
# the inner AVG is evaluated first and its result is rounded to two decimals.
SELECT
    SUM(filings.totrevenue)           AS total_2015_revenue,
    ROUND(AVG(filings.totrevenue), 2) AS avg_revenue,
    COUNT(filings.ein)                AS nonprofits,
    COUNT(DISTINCT filings.ein)       AS nonprofits_distinct,
    MAX(filings.noemplyeesw3cnt)      AS num_employees
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_2015_revenue,avg_revenue,nonprofits,nonprofits_distinct,num_employees
0,2344355088288,7952843.42,294782,275077,787050


In [56]:
%%bigquery
# Count the rows recorded for each EIN, most frequent first.
# Many charities have more than one record in the 2015 table, which is highly
# unusual. The next query counts how often that happens in total.
SELECT
    filings.ein,                     # grouping key, not aggregated
    COUNT(filings.ein) AS ein_count  # aggregated per group
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings
GROUP BY
    filings.ein
ORDER BY
    ein_count DESC

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,ein_count
0,431859076,7
1,362235151,7
2,208367574,7
3,830345294,7
4,841604402,7
...,...,...
275072,222629185,1
275073,880275767,1
275074,150539118,1
275075,770057903,1


**Filter aggregation with HAVING clause**

- HAVING is very very useful when we're filtering aggregations.

In [58]:
%%bigquery
# Keep only the EINs that occur more than once.
# HAVING filters on the aggregated count after grouping has happened.
SELECT
    filings.ein,                     # grouping key, not aggregated
    COUNT(filings.ein) AS ein_count  # aggregated per group
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings
GROUP BY
    filings.ein
HAVING
    ein_count > 1
ORDER BY
    ein_count DESC


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,ein_count
0,431859076,7
1,362235151,7
2,208367574,7
3,830345294,7
4,841604402,7
...,...,...
17989,860593601,2
17990,203502737,2
17991,942324340,2
17992,942662962,2


In [62]:
# Percentage of distinct EINs that appear more than once in the 2015 table.
# 275077 is the nonprofits_distinct value from the aggregate query; 17994 is
# presumably the row count returned by the HAVING query - confirm.
eins_with_duplicates = 17994
distinct_eins = 275077
print(eins_with_duplicates / distinct_eins * 100)

6.541441123758076


**Explore further by filtering on one nonprofit**

- We have seven paper filings for one EIN in the 2015 table. That table normally holds filings for tax period 2014, because you file your taxes the year after the actual tax period.
- Yet in the 2015 calendar year this EIN also has filings for tax periods 2008 through 2014.
- This could be caused by human error, dirty data, or the organization submitting filings for more than one tax period at once.

In [64]:
%%bigquery
# Pull every column for a single EIN so its rows can be compared side by side.
SELECT
    filings.*
FROM
    `bigquery-public-data.irs_990.irs_990_2015` AS filings
WHERE
    filings.ein = '262152334'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,elf,tax_pd,subseccd,s501c3or4947a1cd,schdbind,politicalactvtscd,lbbyingactvtscd,subjto6033cd,dnradvisedfundscd,...,exceeds1pct509,subtotpub509,pubsupplesub509,samepubsuppsubtot509,grsinc509,unreltxincls511tx509,subtotsuppinc509,netincunrelatd509,othrinc509,totsupp509
0,262152334,P,201012,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
1,262152334,P,200812,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
2,262152334,P,201412,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
3,262152334,P,201312,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
4,262152334,P,201212,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
5,262152334,P,200912,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
6,262152334,P,201112,3,N,N,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
