In [1]:
import pandas as pd
from google.cloud import bigquery

# 1. Copying USPatents table

In [None]:
# from google.cloud import bigquery
client = bigquery.Client()
# Defining source_table
source_dataset = client.dataset('results', project='usptobias')
source_table_ref = source_dataset.table('USPatents')
# Defining target table
target_dataset = client.dataset('results_docdb', project='usptobias')
dest_table_ref = target_dataset.table('USPatents')
# Initializing the Job
job = client.copy_table(
    source_table_ref,
    dest_table_ref,
    # Location must match that of the source and destination tables.
    location='US')  # API request

# Running the job
job.result()  # Waits for job to complete.

## 1.2. Deleting the table

In [87]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USPatents'

table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)  # API request

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results_docdb:USPatents deleted.


# 2. USOnDocdb: USPatents Join on Docdb with tls201

In [136]:
client = bigquery.Client()

## 2.1. First Query: Generating merged table

In [137]:
# Creating Job Config
job_config = bigquery.QueryJobConfig()
#job_config.dry_run = True
job_config.use_query_cache = False
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set the destination table
dataset_id = 'results_docdb'
table_id = 'USOnDocdb'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [138]:
sql = """
    SELECT DISTINCT
            t1.appln_id AS appln_id_1,
            t1.appln_nr AS appln_nr_1,
            t1.appln_auth AS appln_auth_1,
            t1.appln_filing_year AS appln_filing_year_1,
            t1.appln_kind AS appln_kind_1,
            t2.appln_id AS appln_id_2,
            t2.appln_nr AS appln_nr_2,
            t2.appln_auth AS appln_auth_2,
            t2.appln_filing_year AS appln_filing_year_2,
            t2.appln_kind AS appln_kind_2,
            t1.docdb_family_id AS docdb_family_id
    FROM
            `usptobias.results_docdb.USPatents` AS t1
        LEFT JOIN
            `usptobias.patstat.tls201` AS t2
        ON
            t1.docdb_family_id=t2.docdb_family_id
    WHERE
            t1.appln_id<>t2.appln_id
     """
# Defining the query
query_job = client.query(
    sql,
    # Location must match that of the dataset(s) referenced in the query.
    location='US',
    job_config=job_config)

#            AND t2.appln_kind = 'A '
#            AND t2.appln_filing_year BETWEEN 2001 AND 2018

In [139]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fcb3e3d60f0>

## 2.2. Getting Information of USOnInpadoc table

In [140]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USOnDocdb'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API Request

# View table properties
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_2', 'STRING', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ())]
None

There are 22,220,606 number of records in total!


In [111]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USOnDocdb'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API Request

# View table properties
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_2', 'STRING', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ())]
None

There are 20,910,144 number of records in total!


## 2.3. Second Query: Counting Number of Pairs

In [141]:
# Creating Job Config
job_config = bigquery.QueryJobConfig()
#job_config.dry_run = True
job_config.use_query_cache = False
# Set configuration.query.createDisposition
job_config.create_disposition = 'CREATE_IF_NEEDED'
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set the destination table
dataset_id = 'results_docdb'
table_id = 'USOnDocdb2'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [142]:
# Number of pairs
sql = """
    SELECT
            t1.*,
            t2.docdb_nb_pairs
    FROM
            `usptobias.results_docdb.USOnDocdb` AS t1
        LEFT JOIN
            (
            SELECT
                    docdb_family_id,
                    appln_auth_2,
                    COUNT(*) AS docdb_nb_pairs
            FROM
                    `usptobias.results_docdb.USOnDocdb`
            GROUP BY
                    docdb_family_id, appln_auth_2
            ) AS t2
        ON
            (t1.docdb_family_id = t2.docdb_family_id) AND (t1.appln_auth_2 = t2.appln_auth_2)
     """
# Defining the query
query_job = client.query(
    sql,
    # Location must match that of the dataset(s) referenced in the query.
    location='US',
    job_config=job_config)

In [143]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fcb3e338860>

## 2.4. Getting the information of "USOnInpadoc2" table

In [144]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USOnDocdb2'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API Request

# View table properties
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_2', 'STRING', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('docdb_nb_pairs', 'INTEGER', 'NULLABLE', None, ())]
None

There are 22,220,606 number of records in total!


In [115]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USOnDocdb2'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API Request

# View table properties
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_2', 'STRING', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('docdb_nb_pairs', 'INTEGER', 'NULLABLE', None, ())]
None

There are 20,910,144 number of records in total!


## 2.5. Loading 100 rows from the table

In [145]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USOnDocdb2'

dataset_ref = client.dataset(dataset_id, project="usptobias")
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API call

In [146]:
## Getting the first 100 lines of codes
# Load the first 10 rows
rows = client.list_rows(table, max_results=10000)

#pd.DataFrame(rows)
df = rows.to_dataframe()

In [147]:
df.head(7)

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_kind_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,appln_kind_2,docdb_family_id,docdb_nb_pairs
0,497186644,201815918347,US,2018,A,487001619,201131320,SI,2011,T,44258572,36
1,496643051,201815876086,US,2018,A,424503433,7001091,PT,2001,T,22818451,27
2,496644399,201815900153,US,2018,A,489171062,201790834,EA,2015,A,54347916,2
3,494656533,201815864508,US,2018,A,457673554,201600145,EA,2014,A,49167101,2
4,496656318,201815903284,US,2018,A,478488108,20175122,FI,2017,A,48672303,28
5,494035325,201815867425,US,2018,A,329259103,6202007,PE,2007,A,36687588,5
6,496659776,201815907643,US,2018,A,404884016,3741520,PT,2003,T,31721895,60


# 2.6. Deleting the table

In [48]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USOnDocdb2'

table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)  # API request

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results_docdb:USOnDocdb2 deleted.


# 3. Exctrating exact_twins

In [148]:
client = bigquery.Client()

## 3.1. Query to generate the table

In [151]:
job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set destination table
dataset_id = 'results_docdb'
table_id = 'USExactTwins'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [152]:
sql= """
    SELECT *
    FROM (
        SELECT
                *,
                (
                CASE 
                        WHEN docdb_nb_pairs=1 THEN 1
                        ELSE 0
                END
                ) AS exact_twins
        FROM
                `usptobias.results_docdb.USOnDocdb2`
         )
    WHERE
            appln_auth_2<>'US'
    """

# Defining the query job
query_job = client.query(sql, location='US', job_config=job_config)

In [153]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fcb3e2a8780>

## 3.2. Getting the table information

In [154]:
dataset_id = 'results_docdb'
table_id = 'USExactTwins'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_2', 'STRING', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('docdb_nb_pairs', 'INTEGER', 'NULLABLE', None, ()), SchemaField('exact_twins', 'INTEGER', 'NULLABLE', None, ())]
None

There are 17,047,494 number of records in total!


## 3.3. Loading the first 1000 rows of the table

In [155]:
dataset_id = 'results_docdb'
table_id = 'USExactTwins'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [156]:
df.head(7)

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_kind_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,appln_kind_2,docdb_family_id,docdb_nb_pairs,exact_twins
0,50977805,44980706,US,2006,A,57634876,200600547,RO,1997,A,25663046,196,0
1,468635376,201615150046,US,2016,A,496007363,P20180689,HR,2018,T,43033144,15,0
2,49292888,28426005,US,2005,A,42016930,913271,NO,1991,A,10668300,45,0
3,54165473,92347404,US,2004,A,328365175,38450498,PL,1998,A,26748211,78,0
4,46815206,7335702,US,2002,A,471125,200302729,AP,1998,A,25586362,15,0
5,48324768,16650602,US,2002,A,42454541,34595599,PL,1999,A,22256669,11,0
6,48441609,18060802,US,2002,A,378795381,00022012,SK,1999,A,26046070,20,0


# 3.4. Deleting the table

In [56]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USExactTwins'

table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)  # API request

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results_docdb:USExactTwins deleted.


# 4. Adding Priority filing information

# 4.1. Query

In [3]:
client = bigquery.Client()

job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set destination table
dataset_id = 'results_docdb'
table_id = 'backbone_0'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [4]:
sql= """
    SELECT
            *,
            (
            CASE 
                    WHEN appln_id_1 NOT IN(
                                    SELECT DISTINCT appln_id
                                    FROM `usptobias.patstat.tls204`) THEN 1
                    ELSE 0
            END
            ) AS is_1_pf,
            (
            CASE 
                    WHEN appln_id_2 NOT IN(
                                    SELECT DISTINCT appln_id
                                    FROM `usptobias.patstat.tls204`) THEN 1
                    ELSE 0
            END
            ) AS is_2_pf
    FROM
            `usptobias.results_docdb.USExactTwins`
    """

# Defining the query job
query_job = client.query(sql, location='US', job_config=job_config)

In [5]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fed83ba9748>

# 4.2. Getting the resulting table

In [6]:
dataset_id = 'results_docdb'
table_id = 'backbone_0'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_2', 'STRING', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('docdb_nb_pairs', 'INTEGER', 'NULLABLE', None, ()), SchemaField('exact_twins', 'INTEGER', 'NULLABLE', None, ()), SchemaField('is_1_pf', 'INTEGER', 'NULLABLE', None, ()), SchemaField('is_2_pf', 'INTEGER', 'NULLABLE', None, ())]
None

There are 17,047,494 number of records in total!


In [10]:
# Loading the first 10,000 rows
dataset_id = 'results_docdb'
table_id = 'backbone_0'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [11]:
df.head()

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_kind_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,appln_kind_2,docdb_family_id,docdb_nb_pairs,exact_twins,is_1_pf,is_2_pf
0,330388621,83467310,US,2010,A,497159087,181100283,CY,2018,T,42670422,6,0,0,0
1,54407623,97698007,US,2007,A,15585641,52497,EG,1997,A,25663046,49,0,0,0
2,50244105,36941006,US,2006,A,43705482,75399,SK,1997,A,26708180,3,0,0,0
3,49711400,33597803,US,2003,A,42399621,28759290,PL,1990,A,23706597,3,0,1,0
4,420522490,201214238569,US,2012,A,497159007,181100119,CY,2018,T,47715182,1,1,0,0


# 4.3. Exporting the table

In [15]:
# from google.cloud import bigquery
client = bigquery.Client()
bucket_name = 'patent-results'
folder_name = 'backbone'

project = 'usptobias'
dataset_id = 'results_docdb'
table_id = 'backbone_0'

destination_uri = 'gs://{}/{}/{}'.format(bucket_name, folder_name, 'backbone_0_docdb_*.csv.gz')
dataset_ref = client.dataset(dataset_id, project=project)
table_ref = dataset_ref.table(table_id)

In [16]:
job_config = bigquery.job.ExtractJobConfig()

job_config.compression = "GZIP"
job_config.destinationFormat = "CSV"

In [17]:
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    # Location must match that of the source table.
    location='US',
    job_config=job_config)  # API request
extract_job.result()  # Waits for job to complete.
7
print('Exported {}:{}.{} to {}'.format(
    project, dataset_id, table_id, destination_uri))

Exported usptobias:results_docdb.backbone_0 to gs://patent-results/backbone/backbone_0_docdb_*.csv.gz


# 5. Selecting Only "Exact Twins"

In [157]:
client = bigquery.Client()

## 5.1. Query to generate the table

In [158]:
job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False
# Set configuration.query.createDisposition
job_config.create_disposition = 'CREATE_IF_NEEDED'
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_TRUNCATE' #'WRITE_APPEND'


# Set destination table
dataset_id = 'results_docdb'
table_id = 'USOnlyTwins'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [159]:
sql= """
    SELECT
            * EXCEPT (exact_twins, docdb_nb_pairs)
    FROM
            `usptobias.results_docdb.USExactTwins`
    WHERE
            exact_twins=1
            AND appln_kind_2='A '
    """

# Defining the query job
query_job = client.query(sql, location='US', job_config=job_config)

In [160]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fcb3e27d4a8>

## 5.2. Getting the resulting table info

In [130]:
dataset_id = 'results_docdb'
table_id = 'USOnlyTwins'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_kind_2', 'STRING', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ())]
None

There are 6,421,665 number of records in total!


## 5.3. Loading the First 10,000 rows

In [131]:
dataset_id = 'results_docdb'
table_id = 'USOnlyTwins'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [132]:
df.head(7)

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_kind_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,appln_kind_2,docdb_family_id
0,496058741,201815896364,US,2018,A,496213854,2018000422,CO,2018,A,53938196
1,497502738,201815933769,US,2018,A,489782299,24242017,PE,2016,A,54199040
2,497501486,201815876721,US,2018,A,487512831,500532017,AT,2017,A,60628627
3,497183390,201815867032,US,2018,A,493142727,201730062,ES,2017,A,61192831
4,496926825,201815913737,US,2018,A,477136253,1551136,SE,2015,A,58239223
5,496060625,201815897495,US,2018,A,497230395,2018007120,CO,2018,A,59499674
6,494027368,201815864756,US,2018,A,477018110,2015096,NL,2015,A,56740433


## 5.4.Deleting the table

In [113]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'USOnlyTwins'

table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)  # API request

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results_docdb:USOnlyTwins deleted.


## 5.5. Exporting Table to GCP Bucket

In [133]:
# from google.cloud import bigquery
client = bigquery.Client()
bucket_name = 'patent-results'
folder_name = 'exactTwins'

project = 'usptobias'
dataset_id = 'results_docdb'
table_id = 'USExactTwins'

destination_uri = 'gs://{}/{}/{}'.format(bucket_name, folder_name, 'exact_twins_docdb-*.csv.gz')
dataset_ref = client.dataset(dataset_id, project=project)
table_ref = dataset_ref.table(table_id)

In [134]:
job_config = bigquery.job.ExtractJobConfig()

job_config.compression = "GZIP"
job_config.destinationFormat = "CSV"

In [135]:
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    # Location must match that of the source table.
    location='US',
    job_config=job_config)  # API request
extract_job.result()  # Waits for job to complete.
7
print('Exported {}:{}.{} to {}'.format(
    project, dataset_id, table_id, destination_uri))

Exported usptobias:results_docdb.USExactTwins to gs://patent-results/exactTwins/exact_twins_docdb-*.csv.gz


# 6.Filtering to get patents which filed in years between 2001 and 2017

## 6.1. Query

In [123]:
client = bigquery.Client()

job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False

# Set destination table
dataset_id = 'results_docdb'
table_id = 'USOnlyTwins_2017'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [126]:
sql= """
    SELECT
            *
    FROM
            `usptobias.results_docdb.USOnlyTwins`
    WHERE
            appln_filing_year_1 <= 2017
            AND appln_filing_year_2 <= 2017
    """

# Defining the query job
query_job = client.query(sql, location='US', job_config=job_config)

In [127]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7faa222c35f8>

## 6.2. Getting the resulting table information

In [128]:
dataset_id = 'results_docdb'
table_id = 'USOnlyTwins_2017'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ())]
None

There are 6,486,324 number of records in total!


## 6.3. Loading the first 10,000 rows

In [131]:
dataset_id = 'results_docdb'
table_id = 'USOnlyTwins_2017'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [132]:
df.head()

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,docdb_family_id
0,52719654,68135303,US,2003,45116794,90119884,TW,2001,24562341
1,53556099,80503801,US,2001,37204938,2001007990,JP,2001,18875655
2,54434010,98412701,US,2001,43670917,200106783,SG,2001,18928048
3,53426286,78297301,US,2001,4744216,2376551,CA,2001,26892690
4,48298906,16345702,US,2002,21354831,120717,GB,2001,9921005


# 7. Querying For Comparing

## 7.1. Copying FamilyInfo table (df_twins_1)

In [None]:
'''# from google.cloud import bigquery
client = bigquery.Client()
# Defining source_table
source_dataset = client.dataset('results', project='usptobias')
source_table_ref = source_dataset.table('familyInfo')
# Defining target table
target_dataset = client.dataset('results_docdb', project='usptobias')
dest_table_ref = target_dataset.table('familyInfo')
# Initializing the Job
job = client.copy_table(
    source_table_ref,
    dest_table_ref,
    # Location must match that of the source and destination tables.
    location='US')  # API request

# Running the job
job.result()  # Waits for job to complete.'''

## 7.2. Getting the rows that are not mutual

In [44]:
client = bigquery.Client()

job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False

# Set destination table
dataset_id = 'results_docdb'
table_id = 'not_mutual'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [45]:
sql = """
    SELECT 
            appln_id_1,
            appln_id_2,
            t1.is_in_1,
            t2.is_in_2,
            exact_twins_1,
            exact_twins_2
            
    FROM
            (
            SELECT
                    appln_id_1,
                    appln_id_2,
                    exact_twins AS exact_twins_1,
                    1 AS is_in_1
            FROM
                    `usptobias.results_docdb.familyInformation`
            ) AS t1
            
        FULL OUTER JOIN
            (
            SELECT
                    appln_id_1,
                    appln_id_2,
                    exact_twins AS exact_twins_2,
                    1 AS is_in_2
            FROM
                    `usptobias.results_docdb.USExactTwins`
            WHERE
                    appln_filing_year_1 <> 2018
                    AND appln_filing_year_2 <> 2018
            ) AS t2

        USING(appln_id_1, appln_id_2)
    WHERE
            (t1.is_in_1 IS NULL OR t2.is_in_2 IS NULL)
            
     """
# Defining the query
query_job = client.query(
    sql,
    # Location must match that of the dataset(s) referenced in the query.
    location='US',
    job_config=job_config)

In [46]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f62a448af60>

## 7.3. Getting the resulting table

In [47]:
dataset_id = 'results_docdb'
table_id = 'not_mutual'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('is_in_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('is_in_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('exact_twins_1', 'STRING', 'NULLABLE', None, ()), SchemaField('exact_twins_2', 'INTEGER', 'NULLABLE', None, ())]
None

There are 4,472,433 number of records in total!


In [48]:
dataset_id = 'results_docdb'
table_id = 'not_mutual'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [49]:
df.head(7)

Unnamed: 0,appln_id_1,appln_id_2,is_in_1,is_in_2,exact_twins_1,exact_twins_2
0,475835413,9045699,,1,,0
1,328061117,494263670,,1,,0
2,51518973,478536366,,1,,0
3,52185782,474827605,,1,,0
4,478168785,274024026,,1,,0
5,480414918,8304573,,1,,0
6,481963848,339044681,,1,,0


### 7.3.1. Deleting table

In [43]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results_docdb'
table_id = 'not_mutual_info'

table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)  # API request

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results_docdb:not_mutual_info deleted.


## 7.4. Adding the information for appln_id_1 and appln_id_2

In [50]:
client = bigquery.Client()

job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False

# Set destination table
dataset_id = 'results_docdb'
table_id = 'not_mutual_info'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [51]:
sql = """
    SELECT 
            t1.*,
            t2.appln_auth_1,
            t2.appln_kind_1,
            t2.appln_filing_year_1,
            t3.appln_auth_2,
            t3.appln_kind_2,
            t3.appln_filing_year_2,
            t2.docdb_family_id,
            t2.docdb_family_size
            
    FROM
            `usptobias.results_docdb.not_mutual` AS t1
    LEFT JOIN
            (
            SELECT
                    appln_id,
                    appln_auth AS appln_auth_1,
                    appln_kind AS appln_kind_1,
                    appln_filing_year AS appln_filing_year_1,
                    docdb_family_id,
                    docdb_family_size
            FROM
                    `usptobias.patstat.tls201`
            ) AS t2
            ON t1.appln_id_1 = t2.appln_id
    LEFT JOIN
            (
            SELECT
                    appln_id,
                    appln_auth AS appln_auth_2,
                    appln_kind AS appln_kind_2,
                    appln_filing_year AS appln_filing_year_2
            FROM
                    `usptobias.patstat.tls201`
            ) AS t3
            ON t1.appln_id_2 = t3.appln_id
     """
# Defining the query
query_job = client.query(
    sql,
    # Location must match that of the dataset(s) referenced in the query.
    location='US',
    job_config=job_config)

In [52]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f62a3fb5ef0>

## 7.4. Getting the resulting table: "not_mutual_info"

In [53]:
dataset_id = 'results_docdb'
table_id = 'not_mutual_info'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [54]:
df.head(7)

Unnamed: 0,appln_id_1,appln_id_2,is_in_1,is_in_2,exact_twins_1,exact_twins_2,appln_auth_1,appln_kind_1,appln_filing_year_1,appln_auth_2,appln_kind_2,appln_filing_year_2,docdb_family_id,docdb_family_size
0,53183756,43721146,1,,0,,US,A,2007.0,SK,A,1993.0,4230929.0,44.0
1,48020590,42184007,1,,0,,US,A,2008.0,NZ,A,1993.0,25536845.0,255.0
2,332950035,42175701,1,,0,,US,A,2010.0,NZ,A,1994.0,22245764.0,110.0
3,49164078,405869250,1,,0,,US,A,2002.0,JO,A,1994.0,3505998.0,19.0
4,54173955,42175701,1,,0,,US,A,2001.0,NZ,A,1994.0,22245764.0,110.0
5,54219757,42021837,1,,0,,US,A,2004.0,NO,A,1992.0,6399399.0,32.0
6,54191354,3408454,1,,0,,US,A,2001.0,,,,21779630.0,26.0


## 7.5. Exporting the table to GCP Bucket

In [55]:
# from google.cloud import bigquery
client = bigquery.Client()
bucket_name = 'patent-results'
folder_name = 'exactTwins'

project = 'usptobias'
dataset_id = 'results_docdb'
table_id = 'not_mutual_info'

destination_uri = 'gs://{}/{}/{}'.format(bucket_name, folder_name, 'not_mutual.csv.gz')
dataset_ref = client.dataset(dataset_id, project=project)
table_ref = dataset_ref.table(table_id)


job_config = bigquery.job.ExtractJobConfig()

job_config.compression = "GZIP"
job_config.destinationFormat = "CSV"

In [56]:
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    # Location must match that of the source table.
    location='US',
    job_config=job_config)  # API request
extract_job.result()  # Waits for job to complete.
7
print('Exported {}:{}.{} to {}'.format(
    project, dataset_id, table_id, destination_uri))

Exported usptobias:results_docdb.not_mutual_info to gs://patent-results/exactTwins/not_mutual.csv.gz
