In [1]:
import pandas as pd
from google.cloud import bigquery

# 1.Creating USPatents table

In [218]:
client = bigquery.Client()

## 1.1. Query from tls201

In [219]:
# Creating Job Config
job_config = bigquery.QueryJobConfig()
#job_config.dry_run = True
job_config.use_query_cache = False

# Set the destination table
dataset_id = 'results'
table_id = 'USPatents'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [220]:
sql = """
    SELECT
            appln_id,
            appln_nr,
            appln_auth,
            appln_kind,
            appln_filing_year,
            docdb_family_id,
            inpadoc_family_id,
            docdb_family_size
    FROM
            `usptobias.patstat.tls201`
    WHERE
            appln_auth = 'US'
            AND appln_filing_year BETWEEN 2001 AND 2018
            AND appln_kind = 'A '
     """
# Defining the query
query_job = client.query(
    sql,
    # Location must match that of the dataset(s) referenced in the query.
    location='US',
    job_config=job_config)

In [None]:
# A dry run query completes immediately.
assert query_job.state == 'DONE'
assert query_job.dry_run

In [125]:
print("This query will process {:.2f} Gb.".format(
    query_job.total_bytes_processed/1e9))

This query will process 3.09 Gb.


In [222]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f863ed7bf98>

## 1.2. Getting tables information

In [223]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results'
table_id = 'USPatents'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API Request

# View table properties
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_kind', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year', 'INTEGER', 'NULLABLE', None, ()), SchemaField('docdb_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('docdb_family_size', 'INTEGER', 'NULLABLE', None, ())]
None

There are 5,721,874 number of records in total!


## 1.3. Loading 100 rows from the table

In [224]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results'
table_id = 'USPatents'

dataset_ref = client.dataset(dataset_id, project="usptobias")
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API call

In [225]:
## Getting the first 100 lines of codes
# Load the first 10 rows
rows = client.list_rows(table, max_results=10000)

#pd.DataFrame(rows)
df = rows.to_dataframe()

df.head()

Unnamed: 0,appln_id,appln_nr,appln_auth,appln_kind,appln_filing_year,docdb_family_id,inpadoc_family_id,docdb_family_size
0,494272221,201815867977,US,A,2018,48678198,9178046,24
1,494672019,201815874000,US,A,2018,47988734,9390840,20
2,494916727,201815877768,US,A,2018,44857282,8440040,16
3,495177195,201815885038,US,A,2018,35516459,1601313,17
4,495179059,201815875308,US,A,2018,38123420,1601941,31


## Deleting a table

In [217]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results'
table_id = 'USPatents'

table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)  # API request

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results:USPatents deleted.


# 2. USOnInpadoc: USPatents Join on Inpadoc with tls201

In [226]:
client = bigquery.Client()

## 2.1. First Query: Generating merged table

In [232]:
# Creating Job Config
job_config = bigquery.QueryJobConfig()
#job_config.dry_run = True
job_config.use_query_cache = False

# Set the destination table
dataset_id = 'results'
table_id = 'USOnInpadoc'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [233]:
sql = """
    SELECT DISTINCT
            t1.appln_id AS appln_id_1,
            t1.appln_nr AS appln_nr_1,
            t1.appln_auth AS appln_auth_1,
            t1.appln_filing_year AS appln_filing_year_1,
            t2.appln_id AS appln_id_2,
            t2.appln_nr AS appln_nr_2,
            t2.appln_auth AS appln_auth_2,
            t2.appln_filing_year AS appln_filing_year_2,
            t1.inpadoc_family_id AS inpadoc_family_id
    FROM
            `usptobias.results.USPatents` AS t1
        LEFT JOIN
            `usptobias.patstat.tls201` AS t2
        ON
            t1.inpadoc_family_id=t2.inpadoc_family_id
    WHERE
            t1.appln_id<>t2.appln_id
            AND t2.appln_filing_year BETWEEN 2001 AND 2018
            AND t2.appln_kind = 'A '
     """
# Defining the query
query_job = client.query(
    sql,
    # Location must match that of the dataset(s) referenced in the query.
    location='US',
    job_config=job_config)

In [None]:
print("This query will process {:.2f} Gb.".format(
    query_job.total_bytes_processed/1e9))

In [234]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f8640227240>

## 2.2. Getting Information of USOnInpadoc table

In [235]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results'
table_id = 'USOnInpadoc'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API Request

# View table properties
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_family_id', 'INTEGER', 'NULLABLE', None, ())]
None

There are 51,294,652 number of records in total!


## 2.3. Second Query: Counting number of pairs

In [236]:
# Creating Job Config
job_config = bigquery.QueryJobConfig()
#job_config.dry_run = True
job_config.use_query_cache = False
# Set configuration.query.createDisposition
job_config.create_disposition = 'CREATE_IF_NEEDED'
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_APPEND'

# Set the destination table
dataset_id = 'results'
table_id = 'USOnInpadoc2'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [237]:
# Number of pairs
sql = """
    SELECT
            t1.*,
            t2.inpadoc_nb_pairs
    FROM
            `usptobias.results.USOnInpadoc` AS t1
        LEFT JOIN
            (
            SELECT
                    inpadoc_family_id,
                    appln_auth_2,
                    COUNT(*) AS inpadoc_nb_pairs
            FROM
                    `usptobias.results.USOnInpadoc`
            GROUP BY
                    inpadoc_family_id, appln_auth_2
            ) AS t2
        ON
            t1.inpadoc_family_id=t2.inpadoc_family_id AND t1.appln_auth_2 = t2.appln_auth_2
     """
# Defining the query
query_job = client.query(
    sql,
    # Location must match that of the dataset(s) referenced in the query.
    location='US',
    job_config=job_config)

In [238]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f863ebe3eb8>

## 2.4. Getting the information of "USOnInpadoc2" table

In [239]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results'
table_id = 'USOnInpadoc2'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API Request

# View table properties
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_nb_pairs', 'INTEGER', 'NULLABLE', None, ())]
None

There are 51,294,652 number of records in total!


## 2.5. Loading 100 rows from the table

In [240]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results'
table_id = 'USOnInpadoc2'

dataset_ref = client.dataset(dataset_id, project="usptobias")
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)  # API call

In [241]:
## Getting the first 100 lines of codes
# Load the first 10 rows
rows = client.list_rows(table, max_results=10000)

#pd.DataFrame(rows)
df = rows.to_dataframe()

In [242]:
df.head(7)

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,inpadoc_family_id,inpadoc_nb_pairs
0,496646851,201815863684,US,2018,496643370,201815863608,US,2018,10956612,42
1,497185155,201815869955,US,2018,496706232,102018200772,DE,2018,10959988,1
2,497501666,201815860260,US,2018,497502448,201815876278,US,2018,10864254,1260
3,496933695,201815866128,US,2018,496931950,201815866157,US,2018,10962664,56
4,496063457,201815895416,US,2018,497509975,201815895377,US,2018,8440433,6
5,496659152,201815896116,US,2018,497509217,201815924461,US,2018,10895907,1482
6,494671554,201815875812,US,2018,494674983,201815875771,US,2018,10715184,12


## 2.5. Deleting the table

In [231]:
# from google.cloud import bigquery
# client = bigquery.Client()
dataset_id = 'results'
table_id = 'USOnInpadoc2'

table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)  # API request

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results:USOnInpadoc2 deleted.


# 3. Extrating Exact_twins table

In [244]:
client = bigquery.Client()

## 3.1. Query to generate the table

In [245]:
job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False

# Set destination table
dataset_id = 'results'
table_id = 'USExactTwins'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [246]:
sql= """
    SELECT
            *,
            (
            CASE 
                    WHEN inpadoc_nb_pairs=1 THEN 1
                    ELSE 0
            END
            ) AS exact_twins
    FROM
            `usptobias.results.USOnInpadoc2`
    """

# Defining the query job
query_job = client.query(sql, location='US', job_config=job_config)

In [247]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f863f126080>

## 3.2. Getting the table information

In [248]:
dataset_id = 'results'
table_id = 'USExactTwins'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_nb_pairs', 'INTEGER', 'NULLABLE', None, ()), SchemaField('exact_twins', 'INTEGER', 'NULLABLE', None, ())]
None

There are 51,294,652 number of records in total!


## 3.3. Loading the first 1000 rows of the table

In [249]:
dataset_id = 'results'
table_id = 'USExactTwins'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [250]:
df.head(7)

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,inpadoc_family_id,inpadoc_nb_pairs,exact_twins
0,53733918,83752604,US,2004,496318637,201815889702,US,2018,6602696,44732,0
1,51456560,51148806,US,2006,497553194,2018080942,JP,2018,8839496,132,0
2,353121931,201113337217,US,2011,496036095,201815900352,US,2018,12535070,992,0
3,365749427,201213456945,US,2012,497172619,201815919067,US,2018,7580252,20592,0
4,419189941,201314099750,US,2013,495775446,2018024964,JP,2018,9599287,990,0
5,438344439,201414496152,US,2014,494235017,20185380,FI,2018,9470143,16458,0
6,450218079,201414888189,US,2014,494684204,2018000117,JP,2018,8786039,480,0


## 3.4. Deleting "USExactTwins" table

In [243]:
dataset_id = 'results'
table_id = 'USExactTwins'
# Deleting the table
table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results:USExactTwins deleted.


# 4. Selecting Only "Ecaxt Twins"

In [252]:
client = bigquery.Client()

## 4.1. Query

In [253]:
job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False

# Set destination table
dataset_id = 'results'
table_id = 'USOnlyTwins'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [254]:
sql= """
    SELECT
            * EXCEPT (exact_twins, inpadoc_nb_pairs)
    FROM
            `usptobias.results.USExactTwins`
    WHERE
            exact_twins=1
    """

# Defining the query job
query_job = client.query(sql, location='US', job_config=job_config)

In [255]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f863df20908>

## 4.2. Getting the resulting table info

In [256]:
dataset_id = 'results'
table_id = 'USOnlyTwins'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_1', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_nr_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('appln_filing_year_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_family_id', 'INTEGER', 'NULLABLE', None, ())]
None

There are 6,123,214 number of records in total!


## 4.3. Loading the 10,000 rows of table

In [257]:
dataset_id = 'results'
table_id = 'USOnlyTwins'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [258]:
df.head(7)

Unnamed: 0,appln_id_1,appln_nr_1,appln_auth_1,appln_filing_year_1,appln_id_2,appln_nr_2,appln_auth_2,appln_filing_year_2,inpadoc_family_id
0,494659254,201815876219,US,2018,496213531,2018001115,CO,2018,10310109
1,494041211,201815863068,US,2018,491943459,20180064,NO,2018,10253539
2,497506851,201815926796,US,2018,902766114,201821004104,IN,2018,10971016
3,494270721,201815869058,US,2018,496185654,2018001288,CO,2018,12787003
4,496060625,201815897495,US,2018,497230395,2018007120,CO,2018,10668083
5,495174396,201815883365,US,2018,497301585,12018550007,PH,2018,10286942
6,496058741,201815896364,US,2018,496213854,2018000422,CO,2018,10310060


## 4.4. Deleting the table

In [251]:
dataset_id = 'results'
table_id = 'USOnlyTwins'
# Deleting the table
table_ref = client.dataset(dataset_id).table(table_id)
#client.delete_table(table_ref)

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results:USOnlyTwins deleted.


## 4.5. Exporting Table to CSV on the Bucket

In [260]:
# from google.cloud import bigquery
client = bigquery.Client()
bucket_name = 'patent-results'
folder_name = 'exactTwins'

project = 'usptobias'
dataset_id = 'results'
table_id = 'USOnlyTwins'

destination_uri = 'gs://{}/{}/{}'.format(bucket_name, folder_name, 'only_exact_twins_inpadoc.csv.gz')
dataset_ref = client.dataset(dataset_id, project=project)
table_ref = dataset_ref.table(table_id)

In [261]:
job_config = bigquery.job.ExtractJobConfig()

job_config.compression = "GZIP"
job_config.destinationFormat = "CSV"

In [262]:
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    # Location must match that of the source table.
    location='US',
    job_config=job_config)  # API request
extract_job.result()  # Waits for job to complete.

print('Exported {}:{}.{} to {}'.format(
    project, dataset_id, table_id, destination_uri))

Exported usptobias:results.USOnlyTwins to gs://patent-results/exactTwins/only_exact_twins_inpadoc.csv.gz


# 5. Adding Priority filing information

In [54]:
client = bigquery.Client()

## 5.1. Query

In [59]:
job_config = bigquery.QueryJobConfig()
job_config.user_query_cache = False

# Set destination table
dataset_id = 'results'
table_id = 'familyInfo'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

In [60]:
sql= """
    SELECT
            *,
            (
            CASE 
                    WHEN appln_id_1 NOT IN(
                                    SELECT DISTINCT appln_id
                                    FROM `usptobias.patstat.tls204`) THEN 1
                    ELSE 0
            END
            ) AS is_1_pf,
            (
            CASE 
                    WHEN appln_id_2 NOT IN(
                                    SELECT DISTINCT appln_id
                                    FROM `usptobias.patstat.tls204`) THEN 1
                    ELSE 0
            END
            ) AS is_2_pf
    FROM
            `usptobias.results.USOnlyTwins`
    """

# Defining the query job
query_job = client.query(sql, location='US', job_config=job_config)

In [61]:
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x1d6e4d273c8>

# 5.2. Getting the resulting table info

In [62]:
dataset_id = 'results'
table_id = 'familyInfo'
# Getting the table's information
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Print tables information
print(table.schema)
print(table.description)
print("\nThere are {:,} number of records in total!".format(table.num_rows))

[SchemaField('appln_id_1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_id_2', 'INTEGER', 'NULLABLE', None, ()), SchemaField('appln_auth_2', 'STRING', 'NULLABLE', None, ()), SchemaField('inpadoc_family_id', 'INTEGER', 'NULLABLE', None, ()), SchemaField('inpadoc_family_size', 'INTEGER', 'NULLABLE', None, ()), SchemaField('is_1_pf', 'INTEGER', 'NULLABLE', None, ()), SchemaField('is_2_pf', 'INTEGER', 'NULLABLE', None, ())]
None

There are 29,978,803 number of records in total!


## 5.3. Loading the first 1000 rows

In [196]:
dataset_id = 'results'
table_id = 'familyInfo'
# Getting the table object
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)

# Load the first 10000 rows into RowIterator object
rows = client.list_rows(table, max_results=10000)

# Loading the RowIterator object into Pandas dataframe
df = rows.to_dataframe()

In [197]:
df.head(8)

Unnamed: 0,appln_id_1,appln_id_2,appln_auth_2,inpadoc_family_id,inpadoc_family_size,is_1_pf,is_2_pf
0,51510977,42482209,PL,1522825,1,0,0
1,57949540,416251870,MY,4923495,1,0,0
2,51997843,417445404,UA,347404,1,0,0
3,51019701,329489966,PE,351050,1,0,0
4,45795525,341866030,MY,1437853,1,0,0
5,440492173,483990226,CL,9684563,1,0,0
6,49126293,41501576,NL,1488896,1,0,0
7,273397156,54687583,UY,7303993,1,0,0


## 5.4. Deleting the table

In [58]:
dataset_id = 'results'
table_id = 'familyInfo'
# Deleting the table
table_ref = client.dataset(dataset_id).table(table_id)
client.delete_table(table_ref)

print("Table {}:{} deleted.".format(dataset_id, table_id))

Table results:familyInfo deleted.


# 6. Comparing the final results with the existing Family_Information

In [211]:
import os
import zipfile

In [204]:
data_folder = '../family_patents/data/'

In [208]:
# Checking the number of records in family_information
family_zip = zipfile.ZipFile(data_folder+'familyInformation.csv.zip')
lines = sum(1 for line in family_zip.open('familyInformation.csv'))
#lines = sum(1 for line in open(data_folder+'familyInformation.csv'))

In [209]:
print('There are {:,} records in familyInformation CSV file, in total!'.format(lines))

There are 17,218,990 records in familyInformation CSV file, in total!


In [267]:
df_twins = pd.read_csv(data_folder+'familyInformation.csv.zip', sep=';',
                 usecols=[0,1,7,12,13], compression='zip')

df_twins.head()

Unnamed: 0,docdb_family_id,appln_id_1,appln_id_2,family_size_in_auth2,exact_twins
0,8164667,448453,267441867,1,1
1,8164667,448453,267566903,1,1
2,39810287,448454,448456,12,0
3,39810287,448454,449293,12,0
4,39810287,448454,449431,12,0


In [268]:
print('There are {:,} rows in df_twins!'.format(df_twins.shape[0]))

There are 17,218,989 rows in df_twins!


In [270]:
# Number of pairs which are exact twins
print('There are {:,} exact twins in df_twins!'.format(df_twins[df_twins.exact_twins==1].shape[0]))

There are 5,638,804 exact twins in df_twins!


In [21]:
# Counting real family size
df_family = df_twins.groupby(by=['docdb_family_id'])[['appln_id_1']].count()\
                    .rename(columns={'appln_id_1':'docdb_family_size'}).reset_index()
# Adding the family size column to df_twins
df_twins = pd.merge(df_twins, df_family, how='left', on='docdb_family_id')

print("There are {:,} rows!".format(df_twins.shape[0]))
df_twins.head()

There are 5,638,804 rows!


Unnamed: 0,docdb_family_id,appln_id_1,appln_id_2,family_size_in_auth2,exact_twins,docdb_family_size
0,8164667,448453,267441867,1,1,2
1,8164667,448453,267566903,1,1,2
2,32696793,453655,16252011,1,1,2
3,32696793,453655,21448310,1,1,2
4,37430888,453656,2748891,1,1,4


In [22]:
# Counting the real "exact twins"
print("There are {:,} records of exact twins!"
      .format((df_twins[df_twins.docdb_family_size==1]).shape[0]))

There are 642,114 records of exact twins!
