# Joins with BigQuery

## Inner Joins

Inner Joins return data where there are matching records in both tables.  

[Join documentation on BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#join_types)

[Kaggle Inner Joins](https://www.kaggle.com/dansbecker/joining-data)  
[Kaggle Outer Joins](https://www.kaggle.com/alexisbcook/joins-and-unions)

Inner and outer joins Venn diagram: [Real Python: pandas merge, join, and concat](https://realpython.com/pandas-merge-join-and-concat/) (about halfway down the page).

In [None]:
# BigQuery client library, plus the Colab helper used to sign in from the notebook
from google.cloud import bigquery
from google.colab import auth
# Authenticate the Colab user before the BigQuery client is created below.
# NOTE(review): presumably this opens an interactive sign-in prompt — confirm in the Colab docs
auth.authenticate_user()

In [None]:
# Project ID handed to the BigQuery client (named as the billing project)
billing_project_id = "cool-monolith-286222"

# One shared client object; the cells below issue all their requests through it
client = bigquery.Client(
    project=billing_project_id,
)

In [None]:
# Construct a reference to the public "github_repos" dataset.
# `Client.dataset()` is deprecated in google-cloud-bigquery, so the reference
# is built directly with `bigquery.DatasetReference(project, dataset_id)`.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "github_repos")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "licenses" table
licenses_ref = dataset_ref.table("licenses")

# API request - fetch the table
licenses_table = client.get_table(licenses_ref)

# Preview the first five rows of the "licenses" table as a DataFrame
client.list_rows(licenses_table, max_results=5).to_dataframe()

Unnamed: 0,repo_name,license
0,autarch/Dist-Zilla-Plugin-Test-TidyAll,artistic-2.0
1,thundergnat/Prime-Factor,artistic-2.0
2,kusha-b-k/Turabian_Engin_Fan,artistic-2.0
3,onlinepremiumoutlet/onlinepremiumoutlet.github.io,artistic-2.0
4,huangyuanlove/LiaoBa_Service,artistic-2.0


In [None]:
# Point at "sample_files" inside the dataset referenced by `dataset_ref`
files_ref = dataset_ref.table("sample_files")

# Fetch the table object with an API request
files_table = client.get_table(files_ref)

# Show the first five rows of "sample_files" as a DataFrame
client.list_rows(
    files_table,
    max_results=5,
).to_dataframe()


Unnamed: 0,repo_name,ref,path,mode,id,symlink_target
0,EOL/eol,refs/heads/master,generate/vendor/railties,40960,0338c33fb3fda57db9e812ac7de969317cad4959,/usr/share/rails-ruby1.8/railties
1,np/ling,refs/heads/master,tests/success/merger_seq_inferred.t/merger_seq...,40960,dd4bb3d5ecabe5044d3fa5a36e0a9bf7ca878209,../../../fixtures/all/merger_seq_inferred.ll
2,np/ling,refs/heads/master,fixtures/sequence/lettype.ll,40960,8fdf536def2633116d65b92b3b9257bcf06e3e45,../all/lettype.ll
3,np/ling,refs/heads/master,fixtures/failure/wrong_order_seq3.ll,40960,c2509ae1196c4bb79d7e60a3d679488ca4a753e9,../all/wrong_order_seq3.ll
4,np/ling,refs/heads/master,issues/sequence/keep.t,40960,5721de3488fb32745dfc11ec482e5dd0331fecaf,../keep.t


In [None]:
# Byte-size constants (decimal units) used as maximum_bytes_billed caps
ONE_MB = 10**6
TWO_GB = 2 * 10**3 * ONE_MB
SIX_GB = 6 * 10**3 * ONE_MB


In [None]:
# Inner join: each sampled file is paired with the license of its repository,
# matched on repo_name; only the first 20 joined rows are requested.
query = """
        SELECT files.repo_name, licenses.license, files.path
        FROM `bigquery-public-data.github_repos.sample_files` AS files
        INNER JOIN `bigquery-public-data.github_repos.licenses` AS licenses
          ON files.repo_name = licenses.repo_name
        LIMIT 20
        """

# Cap on bytes billed for this query.
# NOTE(review): a TWO_GB cap was left commented out here; presumably it was
# too low for this join — confirm before lowering the limit again.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=SIX_GB)

# Run the query, then materialise the result as a DataFrame and display it
inner_join_job = client.query(query, job_config=safe_config)
licenses = inner_join_job.to_dataframe()
licenses

Unnamed: 0,repo_name,license,path
0,Piicksarn/cdnjs,mit,ajax/libs/ace/1.1.7/ext-static_highlight.js
1,Piicksarn/cdnjs,mit,ajax/libs/material-design-icons/1.0.0/device/d...
2,Piicksarn/cdnjs,mit,ajax/libs/mathjax/2.6.1-rc.1/fonts/HTML-CSS/Te...
3,Piicksarn/cdnjs,mit,ajax/libs/yui/3.8.0pr1/resize-proxy/resize-pro...
4,Piicksarn/cdnjs,mit,ajax/libs/material-design-icons/2.1.3/device/i...
5,Piicksarn/cdnjs,mit,ajax/libs/material-design-icons/2.1.3/content/...
6,Piicksarn/cdnjs,mit,ajax/libs/twemoji/1.2.1/36x36/1f4e6.png
7,Piicksarn/cdnjs,mit,ajax/libs/material-design-icons/2.1/image/svg/...
8,Piicksarn/cdnjs,mit,ajax/libs/angular-i18n/1.2.26/angular-locale_a...
9,Piicksarn/cdnjs,mit,ajax/libs/fullPage.js/2.5.0/jquery.fullPage.js


## Outer Joins


In [None]:
# Construct a reference to the "hacker_news" dataset.
# `Client.dataset()` is deprecated in google-cloud-bigquery, so the reference
# is built directly with `bigquery.DatasetReference(project, dataset_id)`.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Print the ID of every table in the dataset
tables = client.list_tables(dataset)
for table in tables:
    print(table.table_id)



full


In [None]:

# Reference the "full" table inside the dataset referenced by `dataset_ref`
table_ref = dataset_ref.table("full")

# API request for the table object
table = client.get_table(table_ref)

# Keep the first five rows as a DataFrame and display them
preview_rows = client.list_rows(table, max_results=5)
full = preview_rows.to_dataframe()
full

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,,,,,,NaT,story,9904684,,,,
1,,,,,,,1437161812.0,2015-07-17 19:36:52+00:00,story,9904796,,,,True
2,,,,True,,,1437163755.0,2015-07-17 20:09:15+00:00,story,9904999,,,,True
3,,,,,,,,NaT,story,9905071,,,,
4,,,,,,,1437165877.0,2015-07-17 20:44:37+00:00,story,9905201,,,,True


In [None]:
# First two rows of the preview DataFrame
full.head(2)

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,,,,,,NaT,story,9904684,,,,
1,,,,,,,1437161812.0,2015-07-17 19:36:52+00:00,story,9904796,,,,True


In [None]:
# Select the rows of the "full" table whose timestamp falls on 2012-01-01.
# Fixes relative to the failing version of this cell:
#   * `by` is a reserved keyword in BigQuery SQL, so the column name is
#     back-quoted (unquoted, it raised "Syntax error ... got keyword BY").
#   * the date filter uses the `timestamp` column; the preview of "full"
#     shows no `time_ts` column.
# NOTE(review): `parent AS story_id` is kept as written; confirm whether `id`
# was intended for story rows.
query = """
  SELECT
    parent AS story_id,
    title,
    url,
    `by`,
    time
  FROM
    `bigquery-public-data.hacker_news.full` AS stories
  WHERE
    EXTRACT(DATE FROM stories.timestamp) = '2012-01-01'
"""

# Cap on bytes billed for this query
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=TWO_GB)
comment_counts = client.query(query, job_config=safe_config).to_dataframe()
comment_counts

BadRequest: 400 Syntax error: Expected end of input but got keyword BY at [7:5]; reason: invalidQuery, location: query, message: Syntax error: Expected end of input but got keyword BY at [7:5]

Location: US
Job ID: 98ab5dea-8d74-48b8-889f-29739d08ad3e


In [None]:
# (rows, columns) of the preview DataFrame `full`
full.shape

In [None]:
# Column labels of the preview DataFrame `full`
full.columns


In [None]:
# Reference the "stories" table inside the dataset referenced by `dataset_ref`.
# NOTE(review): the table listing printed earlier in this notebook shows only
# `full`; confirm that a "stories" table still exists in this dataset.
table_ref = dataset_ref.table("stories")

# API request for the table object
table = client.get_table(table_ref)

# Keep the first five rows as a DataFrame and display them
stories_rows = client.list_rows(table, max_results=5)
stories = stories_rows.to_dataframe()
stories

In [None]:
# (rows, columns) of the preview DataFrame `stories`
stories.shape


In [None]:
# Column labels of the preview DataFrame `stories`
stories.columns


In [None]:
# Left join: every story dated 2012-01-01 is kept, paired with each comment
# whose parent is that story (comment columns are NULL when there is none).
# NOTE(review): the table listing printed earlier in this notebook shows only
# `full`; confirm that the "stories" and "comments" tables are still available.
query = """
        SELECT stories.id AS story_id, stories.by, stories.title, comments.ranking, comments.id
        FROM `bigquery-public-data.hacker_news.stories` AS stories
        LEFT JOIN `bigquery-public-data.hacker_news.comments` AS comments
        ON stories.id = comments.parent
        WHERE EXTRACT(DATE FROM stories.time_ts) = '2012-01-01'
        """

# Cap on bytes billed for this query
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=TWO_GB)

# Run the query, then materialise the result as a DataFrame and display it
left_join_job = client.query(query, job_config=safe_config)
comment_counts = left_join_job.to_dataframe()
comment_counts

In [None]:
# Share of rows in `comment_counts` whose "ranking" value is not null
1 - comment_counts["ranking"].isna().mean()


## Normalize table

In [None]:
# Same left join of stories to their comments, with the SQL laid out one
# clause per line.
# NOTE(review): the table listing printed earlier in this notebook shows only
# `full`; confirm that the "stories" and "comments" tables are still available.
query = """
        SELECT
          stories.id AS story_id,
          stories.by,
          stories.title,
          comments.ranking,
          comments.id
        FROM
          `bigquery-public-data.hacker_news.stories` AS stories
        LEFT JOIN
          `bigquery-public-data.hacker_news.comments` AS comments
        ON
          stories.id = comments.parent
        WHERE
          EXTRACT(DATE FROM stories.time_ts) = '2012-01-01'
        """

# Cap on bytes billed for this query
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=TWO_GB)

# Run the query, then materialise the result as a DataFrame and display it
formatted_join_job = client.query(query, job_config=safe_config)
comment_counts = formatted_join_job.to_dataframe()
comment_counts

In [None]:
# First two rows of the preview DataFrame
full.head(2)

In [None]:
# Rows of the "full" table whose parent is item 363 and that have a title,
# ordered by id, at most 20 of them.
# The title filter uses `IS NOT NULL`: the previous `title <> "None"` compared
# against the string pandas prints for missing values, which is presumably not
# what the table stores for a missing title.
query = """
        SELECT
          parent,
          title,
          id
        FROM
          `bigquery-public-data.hacker_news.full`
        WHERE
          parent = 363
          AND title IS NOT NULL
        ORDER BY
          id
        LIMIT 20
"""

# Cap on bytes billed for this query
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=TWO_GB)
comment_counts = client.query(query, job_config=safe_config).to_dataframe()
comment_counts