In [1]:
# BigQuery Setup
# Importing Libraries and Credentials
import pandas as pd
import numpy as np
import seaborn as sns
from google.cloud import bigquery
from google.oauth2 import service_account
# ignore warnings
from warnings import filterwarnings
filterwarnings("ignore")


%load_ext google.cloud.bigquery

credentials = service_account.Credentials.from_service_account_file('/Users/ssamilozkan/Desktop/BigQuery/config.json')

project_id = 'dbt-bigquery-setup-369911'
client = bigquery.Client(credentials= credentials, project=project_id)

## Advanced functions (Statistical, analytic, user-defined)

In [12]:
%%bigquery
SELECT 
    STDDEV(noemplyeesw3cnt) AS std_dev_employee_count, 
    CORR(totprgmrevnue,totfuncexpns) AS corr_revenue_expenses
FROM `bigquery-public-data.irs_990.irs_990_2015`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,std_dev_employee_count,corr_revenue_expenses
0,1579.800536,0.97618


Try Approximate Agrregate Functions when 'close enough' will do

In [13]:
%%bigquery
SELECT 
  APPROX_COUNT_DISTINCT(ein) AS approx_ein_count,
  COUNT(ein) AS exact_ein_count
FROM `bigquery-public-data.irs_990.irs_990_2015`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,approx_ein_count,exact_ein_count
0,274482,294782


 But imagine you're in this scenario where you work at Google and you have to count all of the logs or all the user logins over a multi-year period of time. You're talking about petabytes of data and you don't have all the time in the world. You want to actually sacrifice a little bit of accuracy for speed in which you get your query back. Especially when you're talking about counting all the logs that are processed or counting all the ads that are served. Very popular to do that approximate count distinct. Let's look at an example here. 

In [15]:
%%bigquery
SELECT 
 CONCAT('20', _TABLE_SUFFIX) AS year,
 APPROX_COUNT_DISTINCT(actor.login) AS approx_cnt
FROM `githubarchive.year.20*`
GROUP BY year
ORDER BY year

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,year,approx_cnt
0,2011,540440
1,2012,1188211
2,2013,2208240
3,2014,3117587
4,2015,4440679
5,2016,6643627
6,2017,8392186
7,2018,9511968
8,2019,12027368
9,2020,15000288


In [25]:
%%bigquery
WITH github_year_sketches AS (
    SELECT 
        CONCAT('20', _TABLE_SUFFIX) AS year,
        APPROX_COUNT_DISTINCT(actor.login) AS approx_cnt,
        HLL_COUNT.INIT(actor.login) AS sketch #HyperLogLog Estimation
    FROM `githubarchive.year.20*`
    GROUP BY year
    ORDER BY year
)

SELECT HLL_COUNT.MERGE(sketch) AS approx_unique_users 
FROM `github_year_sketches`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,approx_unique_users
0,42910843


## Analytic Functions, WITH clause, and RANK()

- RANK() Function for aggregating over groups of rows

```
SELECT 
    firstname, 
    depatment,
    startdate,
    RANK() OVER (PARTITION BY department ORDER BY startdate) AS rank
FROM Employees
```

## BigQuery User-Defined Functions (UDFs)

- **CREATE FUNCTION** 

Create a new function. A function can contain zero or more named_paramethers

- **RETURNS[data type]** 

Specifies the data type of the return value of the function

- **LANGUAGE** 

Specifies the language of the function
- **AS[extarnal_code]** 

Specifies the code of the function runs

```

In [43]:
%%bigquery
CREATE TEMP FUNCTION AddFourAndDivide(x INT64, y INT64)
RETURNS FLOAT64
AS (
  (x + 4) / y
);

SELECT
  val, AddFourAndDivide(val, 2)
FROM
  UNNEST([2,3,5,8]) AS val;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,val,f0_
0,2,3.0
1,3,3.5
2,5,4.5
3,8,6.0


In [44]:
%%bigquery
CREATE TEMP FUNCTION addFourAndDivideAny(x ANY TYPE, y ANY TYPE)
AS (
  (x + 4) / y
);

SELECT
  addFourAndDivideAny(3, 4) AS integer_input,
  addFourAndDivideAny(1.59, 3.14) AS floating_point_input;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,integer_input,floating_point_input
0,1.75,1.780255


```
%%bigquery
CREATE FUNCTION d2i_demo.nlp.compromise_people(str STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS """
    return nlp(str).people().out('topk').map(x => x.normal);
"""
OPTIONS(library="gs://d2i-demo/nlp/compromise.js");

SELECT
    name, 
    COUNT(*)AS cnt
FROM (SELECT d2i_demo.nlp.compromise_people(title) AS names
      FROM `d2i-demo.reddit_posts`
      WHERE subreddit = 'movies'), UNNEST(names) AS name
WHERE name LIKE '% %' 
GROUP BY 1 
ORDER BY 2 DESC
LIMIT 10 

## Sub-query and CTE design

- WITH is simply a named subquery or Common Table Expression (CTE)
- Acts as temporary table
- Breaks up complex queries 
- Chain together multiple subqueries in a single WITH
- You can reference other subqueries in future subqueries


In [37]:
%%bigquery
WITH 
    irs_990_2015_ein AS (
        SELECT *
        FROM `bigquery-public-data.irs_990.irs_990_2015`
        JOIN `bigquery-public-data.irs_990.irs_990_ein` USING (ein)
    ),

    duplicates AS (
        SELECT ein AS ein, COUNT(ein) AS ein_count
        FROM irs_990_2015_ein
        GROUP BY ein
        HAVING ein_count > 1
    )

SELECT 
    irs_990.ein AS ein,
    irs_990.name AS name,
FROM 
    irs_990_2015_ein AS irs_990
LEFT JOIN 
    duplicates
ON irs_990.ein = duplicates.ein
WHERE duplicates.ein IS NULL
LIMIT 10


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ein,name
0,134202729,WORLDCLASS SCHOOLS OF LEON COUNTY INC
1,161558160,CHINESE BIOLOGICAL INVESTIGATORS SOCIETY INC
2,200733852,LA ACADEMIA DE ESTRELLAS
3,200813566,DAVIS PHINNEY FOUNDATION
4,201033832,CALVARY FAITH CHURCH
5,201858456,HOPE ONLINE LEARNING ACADEMY CO-OP
6,201891947,LANTERI CENTER FOR IGNATIAN SPIRITUALITY
7,202509473,STELLA SCHOLA PTO
8,203557068,USA PICKLEBALL ASSOCIATION
9,205196010,DREAM IN GREEN INC


## Exercises

In [42]:
%%bigquery
WITH summary AS (
# count of filings, revenue, expenses since 2013
SELECT
  CONCAT("20",_TABLE_SUFFIX) AS year_filed,
  COUNT(ein) AS nonprofit_count,
  AVG(totrevenue) AS avg_revenue,
  AVG(totfuncexpns) AS avg_expenses
FROM `bigquery-public-data.irs_990.irs_990_20*`
WHERE _TABLE_SUFFIX >= '13'
GROUP BY year_filed
ORDER BY year_filed DESC
)

SELECT 
*, ROUND(avg_revenue - avg_expenses, 2) AS avg_income  
FROM summary
ORDER BY avg_income DESC


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,year_filed,nonprofit_count,avg_revenue,avg_expenses,avg_income
0,2015,294782,7952843.0,7411629.0,541214.61
1,2014,299405,7515041.0,6979379.0,535662.61
2,2016,307483,7938932.0,7446167.0,492765.15
3,2017,300910,8316694.0,7931376.0,385318.4
4,2013,289603,7419203.0,7045596.0,373606.74


In [45]:
%%bigquery
SELECT
    ['apple', 'banana', 'cherry', 'elderberry'] AS fruits,
    'Jacob' AS customer,

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fruits,customer
0,"[apple, banana, cherry, elderberry]",Jacob


In [51]:
%%bigquery
SELECT
items,
customer_name
FROM 
UNNEST(['apple', 'banana', 'cherry', 'elderberry']) AS items
    CROSS JOIN
    (SELECT 'Jacob' AS customer_name)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,items,customer_name
0,apple,Jacob
1,banana,Jacob
2,cherry,Jacob
3,elderberry,Jacob
