### Select, From & Where

In [1]:
# Point GOOGLE_APPLICATION_CREDENTIALS at a local copy of the Google Cloud
# service-account JSON key file.
import os

# Left disabled, with the key file name redacted. Presumably only needed when
# the environment does not already supply credentials - TODO confirm.
#os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ='******.json'

In [2]:
# Import google cloud client library
from google.cloud import bigquery
import pandas as pd
 
# Instantiate the BigQuery client object used by the query cells below.
# No arguments are passed, so the project and credentials presumably come from
# the environment (e.g. GOOGLE_APPLICATION_CREDENTIALS) - TODO confirm.
client = bigquery.Client()

In [3]:
# Pull the "city" value of every row whose "country" is 'US'
query = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

# Submit the query to BigQuery (API request)
query_data = client.query(query)

# Materialise the job's result as a pandas DataFrame
df = query_data.to_dataframe()

# Show the first five rows
df.head(n=5)

Unnamed: 0,city
0,BROWN
1,BROWN
2,BROWN
3,BROWN
4,Houston


In [4]:
# Which five cities occur most often, i.e. have the most measurements?

df["city"].value_counts().head(n=5)

Phoenix-Mesa-Scottsdale                     88
Houston                                     82
Los Angeles-Long Beach-Santa Ana            68
Riverside-San Bernardino-Ontario            60
New York-Northern New Jersey-Long Island    60
Name: city, dtype: int64

In [5]:
# Fetch the score and title of every row whose type column is "job"
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        """

# Configure a dry run so the query size is estimated without running it
dry_run_config = bigquery.QueryJobConfig()
dry_run_config.dry_run = True

# API request - submit the dry-run job to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

print(f"This query will process {dry_run_query_job.total_bytes_processed} bytes.")

This query will process 446879689 bytes.


In [6]:
# Cap the billable size at 1 MB; the query only runs if it stays under the cap
# NOTE(review): the dry run for this query reported 446879689 bytes, far above
# 1 MB, yet rows were returned - presumably served from cached results; confirm.
ONE_MB = 10**6
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = ONE_MB

# Submit the query under that cap
safe_query_job = client.query(query, job_config=safe_config)

# API request - attempt the query and show the result as a pandas DataFrame
safe_query_job.to_dataframe()

Unnamed: 0,score,title
0,1.0,Y Combinator Is Hiring a Batch Director
1,1.0,BillForward (YC S14) Is Hiring Developers and ...
2,1.0,70M Jobs (YC S17) Seeking Head of Operations
3,1.0,Disqus (SF) is seeking an inhouse technical re...
4,1.0,SoundFocus (YC S13) is hiring an Industrial De...
...,...,...
13569,5.0,Mixpanel - Looking for Python / Javascript hac...
13570,5.0,Mertado (YC W10) Looking for awesome Engineers...
13571,5.0,Love travel? Awesome at UI/UX design? Adioso (...
13572,6.0,Airbnb - Fraud Management Engineer


In [7]:


# Cap the billable size at 1 GB; the query only runs if it stays under the cap
ONE_GB = 10**9
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = ONE_GB

# Submit the query under that cap
safe_query_job = client.query(query, job_config=safe_config)

# API request - attempt the query and keep the result as a pandas DataFrame
job_post_scores = safe_query_job.to_dataframe()

# Average score across the job posts
job_post_scores["score"].mean()



1.8527758527758529

### **EXERCISE**

#### **1] Introduction to the Dataset**

In [8]:
from google.cloud import bigquery

# Create a 'Client' object
client = bigquery.Client()

# Construct a reference to the 'openaq' dataset in the 'bigquery-public-data'
# project. The DatasetReference is built directly because Client.dataset() is
# deprecated in google-cloud-bigquery; the resulting reference is equivalent.
dataset_ref = bigquery.DatasetReference('bigquery-public-data', 'openaq')

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the 'global_air_quality' table
table_ref = dataset_ref.table('global_air_quality')

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the 'global_air_quality' table
client.list_rows(table, max_results=5).to_dataframe()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours
0,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,co,910.0,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
1,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,no2,131.87,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
2,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,o3,15.57,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
3,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,pm25,45.62,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
4,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,so2,4.49,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25


#### 2] Units of Measurement

* Which countries have reported pollution levels in units of "ppm"? In the code cell below, set first_query to an SQL query that pulls the appropriate entries from the country column.

In [12]:
# Distinct countries that reported measurements in units of "ppm"
first_query = """
        SELECT DISTINCT country 
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE unit = 'ppm'
        """

# Protect the quota: the query is cancelled if it would use more than the
# 10 GB limit set here
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10**10
first_query_job = client.query(first_query, job_config=safe_config)

# API request - execute the query and collect the rows in a pandas DataFrame
first_results = first_query_job.to_dataframe()

# Show the first few rows of the result
first_results.head(n=5)

Unnamed: 0,country
0,US
1,CL
2,AU
3,BM
4,MX


#### 3] High Air Quality
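* Which pollution levels were reported to be exactly 0? In the code cell below, set `zero_pollution_query` to an SQL query that selects all columns of the rows where the `value` column is 0.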

In [14]:
# Select every column of the rows whose reported value is exactly 0
zero_pollution_query = """
                        SELECT * 
                        FROM `bigquery-public-data.openaq.global_air_quality`
                        WHERE value = 0
                       """

# Configure the job with a 10**10-byte billing cap and submit the query
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10**10
query_job = client.query(zero_pollution_query, job_config=safe_config)

# API request - execute the query and collect the rows in a pandas DataFrame
zero_pollution_results = query_job.to_dataframe()

zero_pollution_results.head(n=5)

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours
0,Victoria Memorial - WBSPCB,Kolkata,IN,pm25,0.0,2017-10-16 20:45:00+00:00,µg/m³,CPCB,22.572645,88.36389,0.25
1,"Rabindra Bharati University, Kolkata - WBSPCB",Kolkata,IN,so2,0.0,2017-10-28 14:30:00+00:00,µg/m³,CPCB,22.627874,88.3804,0.25
2,Zamość ul. Hrubieszowska 69A,Zamość,PL,no2,0.0,2020-05-19 05:00:00+00:00,µg/m³,GIOS,50.71663,23.290247,
3,"Końskie, MOBILNA",Końskie,PL,pm10,0.0,2018-12-21 13:00:00+00:00,µg/m³,GIOS,51.189526,20.408892,
4,"Końskie, MOBILNA",Końskie,PL,pm25,0.0,2018-12-21 13:00:00+00:00,µg/m³,GIOS,51.189526,20.408892,
