# Data Cleaning

[acquisitions](#acquisitions)
 | ~~[application](#application)~~ (removed)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

## Filenames
### Raw original files (import)

In [1]:
# Raw input files in a fixed order: the acquisitions CSV followed by the
# tab-separated patent tables.
dataFiles = [
    "acquisitions.csv",
    "rawassignee.tsv",
    "cpc_current.tsv",
    "cpc_group.tsv",
    "cpc_subgroup.tsv",
    "cpc_subsection.tsv",
    "patent.tsv",
    "location.tsv",
    "location_assignee.tsv",
    "us_term_of_grant.tsv",
]

# One named variable per raw file, in the same order as dataFiles.
(
    acquisitionsData,
    assigneeData,
    cpc_currentData,
    groupData,
    subgroupData,
    subsectionData,
    patentData,
    locationData,
    location_assigneeData,
    us_term_of_grantData,
) = dataFiles

### Clean:  retains all company patent records (export)

In [2]:
# Output names for the cleaned (untrimmed) tables, one variable per table.
(
    acquisitionsClean,
    assigneeClean,
    cpc_currentClean,
    groupClean,
    subgroupClean,
    subsectionClean,
    patentClean,
    locationClean,
    location_assigneeClean,
    us_term_of_grantClean,
) = (
    "acquisitions_clean.tsv",
    "assignee_clean.tsv",
    "cpc_current_clean.tsv",
    "group_clean.tsv",
    "subgroup_clean.tsv",
    "subsection_clean.tsv",
    "patent_clean.tsv",
    "location_clean.tsv",
    "location_assignee_clean.tsv",
    "us_term_of_grant_clean.tsv",
)

### Trimmed:  retains only patents related to 7 companies and their acquisitions (export)

In [3]:
# Export targets in a fixed order.  The "trim*" files hold the tables reduced
# to the selected companies; the remaining names are the plain "*_clean" files.
exportFiles = [
    "acquisitions_clean.tsv",
    "trimAssignee_clean.tsv",
    "trimCPC_current_clean.tsv",
    "group_clean.tsv",
    "subgroup_clean.tsv",
    "subsection_clean.tsv",
    "trimPatent_clean.tsv",
    "trimLocation_clean.tsv",
    "trimLocation_assignee_clean.tsv",
    "trimUS_term_of_grant_clean.tsv",
]

# One named variable per export file, in the same order as exportFiles.
(
    acquisitionsFile,
    assigneeFile,
    cpc_currentFile,
    groupFile,
    subgroupFile,
    subsectionFile,
    patentFile,
    locationFile,
    location_assigneeFile,
    us_term_of_grantFile,
) = exportFiles

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Library to aid in string matching for identifying company names

# To install:
#     pip install fuzzywuzzy
#     pip install python-Levenshtein

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## Functions

In [45]:
def read(url):
    """Load a delimited text file into a DataFrame.

    Paths ending in ".csv" (any letter case) are parsed as comma-separated;
    every other path is parsed as tab-separated.  The first line is used as
    the header.  Malformed lines (wrong number of fields) are skipped with a
    warning instead of aborting the load.

    Parameters
    ----------
    url : str or path-like
        Location of the file, passed through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    # Test the real extension rather than a single character of the name.
    sep = "," if str(url).lower().endswith(".csv") else "\t"
    try:
        # pandas >= 1.3 spelling of "skip bad lines and warn about them".
        return pd.read_csv(url, sep=sep, header=0, on_bad_lines="warn")
    except TypeError as err:
        # Only fall back when the keyword itself is unknown (pandas < 1.3);
        # any other TypeError is a genuine problem and is re-raised.
        if "on_bad_lines" not in str(err):
            raise
        return pd.read_csv(url, sep=sep, header=0, error_bad_lines=False)

In [6]:
def year(date):
    """Return the first four characters of *date* (the year of a 'YYYY-MM-DD' string)."""
    yyyy = date[:4]
    return yyyy

In [7]:
def export(df, filename):
    """Write *df* to *filename* as UTF-8, tab-separated text without the index."""
    options = {"sep": "\t", "index": False, "encoding": "utf-8"}
    df.to_csv(filename, **options)

In [None]:
# def readAll():
#     dfs = [dfAcquisitions, dfAssignee, dfCPC_current, dfGroup, dfSubgroup, dfSubsection, 
#        dfPatent, dfLocation, dfLocation_assignee, dfUS_term_of_grant]
    
#     for i in range(len(dfs)):
#         dfs[i]=read(dataFiles[i])
# readAll()

In [None]:
# def exportAll():
#     dfs = [dfAcquisitions, trimAssignee, trimCPC_current, dfGroup, dfSubgroup, dfSubsection, 
#        trimPatent, trimLocation, trimLocation_assignee, trimUS_term_of_grant]
    
#     for i in range(len(dfs)):
#         export(dfs[i],exportFiles[i])
# exportAll()

## acquisitions
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [87]:
# read in data: load the raw acquisitions file (acquisitionsData) and
# preview its first rows
dfAcquisitions = read(acquisitionsData)
dfAcquisitions.head()

Unnamed: 0,AcquisitionID,AcquisitionMonth,AcquisitionMonthDate,AcquisitionYear,Company,Business,Country,Value (USD),Derived products,ParentCompany
0,ACQ99,November,11.0,2015,bebop,Cloud software,USA,380000000.0,Google Cloud Platform,Google
1,ACQ98,November,11.0,2015,Fly Labs,Video editing,USA,,Google Photos,Google
2,ACQ97,December,8.0,2015,Clearleap,Cloud-based video management,USA,,,IBM
3,ACQ96,December,18.0,2015,Metanautix,Big Data Analytics,USA,,,Microsoft
4,ACQ95,December,21.0,2015,"Talko, Inc.",Mobile communications,USA,,,Microsoft


In [88]:
# clean table: work on a deep copy of the raw frame, drop the listed columns
# and rename "Company" to "ChildCompany"
acquisitions = (
    dfAcquisitions.copy(deep=True)
    .drop(columns=['AcquisitionID', 'AcquisitionMonth', 'AcquisitionMonthDate',
                   'Country', 'Business', 'Value (USD)', 'Derived products'])
    .rename(columns={"Company": "ChildCompany"})
)
acquisitions.head()

Unnamed: 0,AcquisitionYear,ChildCompany,ParentCompany
0,2015,bebop,Google
1,2015,Fly Labs,Google
2,2015,Clearleap,IBM
3,2015,Metanautix,Microsoft
4,2015,"Talko, Inc.",Microsoft


In [89]:
# export to file: write the cleaned acquisitions table as tab-separated text
# to acquisitionsFile
export(acquisitions, acquisitionsFile)

## assignee
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [12]:
# read in data: load the raw assignee table (assigneeData) and preview its
# first rows
dfAssignee = read(assigneeData)
dfAssignee.head()

Unnamed: 0,uuid,patent_id,assignee_id,rawlocation_id,type,name_first,name_last,organization,sequence
0,0000p94wkezw94s8cz7dbxlvz,5856666,eaa92f175be7bfb71011f17eafb1e71f,orskbf54s58e97lkmw8na5rpx,2,,,U.S. Philips Corporation,0
1,00013vk881wap9u4mbo7lwwhp,5204210,e572ad43a89039b0d72acc4ce970a33f,mue862v5lcjdhzqqk86ei75kj,2,,,Xerox Corporation,0
2,000192sn2u10kzpikl4s7h3r0,5302149,8ce825a978eebf26ad2c13de6e370bb3,o1h9dqdv0yq7dt1b1vmrcal9h,3,,,Commonwealth Scientific & Industrial Research ...,1
3,0001ycvv6sz1ju07ss99nhxi1,9104354,6c00cb129070696ef109f6264da00318,rspbpqcajvm09r1ew9mgnpx37,3,,,Canon Kabushiki Kaisha,0
4,0001z7ws4m14aqdb3tv99u550,6584517,dabf354c29a6ebba31f54b9ed042241d,l1gyelp5jcg0hakk9smmhsdgr,2,,,Cypress Semiconductor Corp.,0


In [13]:
# clean table: deep-copy the raw frame and drop the listed columns
# (patent_id, assignee_id and organization remain)
assignee = (
    dfAssignee.copy(deep=True)
    .drop(columns=['uuid', 'type', 'rawlocation_id',
                   'name_first', 'name_last', 'sequence'])
)
assignee.head()

Unnamed: 0,patent_id,assignee_id,organization
0,5856666,eaa92f175be7bfb71011f17eafb1e71f,U.S. Philips Corporation
1,5204210,e572ad43a89039b0d72acc4ce970a33f,Xerox Corporation
2,5302149,8ce825a978eebf26ad2c13de6e370bb3,Commonwealth Scientific & Industrial Research ...
3,9104354,6c00cb129070696ef109f6264da00318,Canon Kabushiki Kaisha
4,6584517,dabf354c29a6ebba31f54b9ed042241d,Cypress Semiconductor Corp.


### Trim Assignee File

Retaining only tuples for child and parent companies in acquisitions

In [14]:
# Step 1. obtain child company list from acquisitions table
# Result: a one-column frame named 'organization' with a fresh 0..n-1 index.

childCompanies = (
    acquisitions[['ChildCompany']]
    .reset_index(drop=True)
    .rename(columns={'ChildCompany': 'organization'})
)
childCompanies.head()

Unnamed: 0,organization
0,bebop
1,Fly Labs
2,Clearleap
3,Metanautix
4,"Talko, Inc."


In [15]:
# Step 2. obtain child assignee_ids from cleaned assignee table
# Inner join on the exact organization name, then remove repeated rows.

allChild = (
    assignee
    .merge(childCompanies, on="organization", how="inner")
    .drop_duplicates(keep='first')
)
allChild.head()

Unnamed: 0,patent_id,assignee_id,organization
0,8996945,1befe0f76aca2d46fc9d4453d2db10aa,"Aspera, Inc."
1,8583977,1befe0f76aca2d46fc9d4453d2db10aa,"Aspera, Inc."
2,8214707,1befe0f76aca2d46fc9d4453d2db10aa,"Aspera, Inc."
3,8085781,1befe0f76aca2d46fc9d4453d2db10aa,"Aspera, Inc."
4,8719443,1befe0f76aca2d46fc9d4453d2db10aa,"Aspera, Inc."


In [94]:
# number of assignee rows whose organization matched a child company name
len(allChild)

215

In [16]:
# Step 3. remaining child companies based on those that have assignee_ids
# Result: one row per distinct organization found in allChild, re-indexed from 0.

matchChild = (
    allChild[['organization']]
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)
matchChild.head()

Unnamed: 0,organization
0,"Aspera, Inc."
1,"Canesta, Inc."
2,Isogon Corporation
3,"Overture Services, Inc."
4,Urchin Software Corporation


In [95]:
# number of distinct child organization names found in the assignee table
len(matchChild)

34

In [17]:
# Sanity check: distinct assignee_id values among the matched child rows
childIDs = allChild.drop_duplicates(subset='assignee_id', keep="first")['assignee_id']

In [18]:
# Report row counts.  len() is used because .size on a DataFrame counts cells
# (rows x columns) and only equals the row count while the frame has exactly
# one column.
print("child companies in acquisitions table:", len(childCompanies),
      ", in assignee table:", len(matchChild),
      "\n# distinct assignee IDs:", len(childIDs))

child companies in acquisitions table: 916 , in assignee table: 34 
# distinct assignee IDs: 34


#### String matching for parent company names with the assignee table

In [None]:
# Test out specific companies to make sure selection is correct:
# str.match anchors the pattern at the start of the organization name, and
# na=False treats missing names as non-matches.
# NOTE: every line below is a bare expression; when the cell is run as a
# whole, only the last one is displayed. The counts in the comments were
# presumably obtained by running the lines one at a time.

# Apple has 16591 patents
assignee[assignee['organization'].str.match('Apple ', na=False)]
# Twitter has 70 patents, one assignee_id
assignee[assignee['organization'].str.match('Twitter', na=False)]
# Google has 15480 patents
assignee[assignee['organization'].str.match('Google', na=False)]
# Facebook has 1901 patents under Facebook, Inc. + 16 under Facebook Inc.
assignee[assignee['organization'].str.match('Facebook', na=False)]
assignee[assignee['organization'].str.match('Facebook ', na=False)]
# Microsoft has 36174 patents
assignee[assignee['organization'].str.match('Microsoft', na=False)]
# IBM has 550 patents under IBM and 110339 under International Business Machines Corporation
assignee[assignee['organization'].str.match('International Business Machines', na=False)]
assignee[assignee['organization'].str.match('IBM', na=False)]
# Yahoo has 2245 patents
assignee[assignee['organization'].str.match('Yahoo', na=False)]

In [65]:
# Step 4: using parent company general name, find all other spelling variations
# Note, this is not performed with child companies to reduce chance of incorrect matches

# Name prefixes searched for at the start of assignee['organization'].
# The trailing space in "Apple " is intentional: it excludes unrelated names
# such as "Applera".  "Facebook " only re-finds names already matched by
# "Facebook"; the repeated rows are removed when allParent is de-duplicated.
parentCompanies = ["Google", "Microsoft", "Facebook", "Facebook ", "Apple ", "Yahoo", "Twitter", "IBM", "International Business Machines"]

# Prefixes whose canonical label differs from the prefix itself.  Every other
# prefix is labelled with its own text, minus the whitespace that exists only
# to narrow the match (so "Apple " is labelled "Apple").
parentLabels = {"Facebook ": "Facebook", "International Business Machines": "IBM"}

organizationNames = assignee['organization']
parentFrames = []
for prefix in parentCompanies:
    # distinct organization spellings that start with this prefix
    isVariant = organizationNames.str.startswith(prefix, na=False)
    parent = (
        organizationNames[isVariant]
        .drop_duplicates(keep='first')
        .reset_index(drop=True)
        .to_frame()
    )
    parent['org'] = parentLabels.get(prefix, prefix.strip())
    parentFrames.append(parent)

# Single concatenation instead of growing the frame inside the loop;
# sort=True keeps the columns in alphabetical order (org, organization).
allParent = pd.concat(parentFrames, axis=0, sort=True)

allParent.head()

Unnamed: 0,org,organization
0,Google,Google Inc.
1,Google,"Google, Inc."
2,Google,Google Technology Holdings LLC
3,Google,Google LLC
4,Google,Google Inc


In [99]:
# Remove repeated (org, organization) rows, e.g. those found by both the
# "Facebook" and "Facebook " patterns, then show the number of remaining rows.
allParent = allParent.drop_duplicates(keep="first")
len(allParent)

275

In [100]:
# display the parent label / organization spelling pairs for manual review
allParent

Unnamed: 0,org,organization
0,Google,Google Inc.
1,Google,"Google, Inc."
2,Google,Google Technology Holdings LLC
3,Google,Google LLC
4,Google,Google Inc
5,Google,Google Technology Holdings LLC.
6,Google,"Google Technology Holdings, LLC"
7,Google,Google LLP
8,Google,Google Technology Holdings Inc.
9,Google,Google inc.


In [101]:
# Step 5: obtain all parent company assignee ids
# Inner join on the exact organization spelling, then remove repeated rows.

parentIDs = (
    assignee
    .merge(allParent, on="organization", how="inner")
    .drop_duplicates(keep='first')
)
parentIDs.head()

Unnamed: 0,patent_id,assignee_id,organization,org
0,6000832,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
1,7458030,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
2,D766984,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
3,D702184,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
4,8244819,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft


In [102]:
# display the assignee rows matched to a parent company spelling
parentIDs

Unnamed: 0,patent_id,assignee_id,organization,org
0,6000832,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
1,7458030,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
2,D766984,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
3,D702184,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
4,8244819,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
5,7904883,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
6,D726203,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
7,8478837,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
8,7126606,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
9,8948213,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft


In [103]:
# distinct organization spellings and distinct assignee ids among parent rows
parentCheck = parentIDs.drop_duplicates(subset='organization', keep="first")['organization']
parentAssigneeID = parentIDs.drop_duplicates(subset='assignee_id', keep="first")['assignee_id']

In [104]:
# Compare the number of spelling variations found (allParent) with the number
# that survived the merge (parentCheck); the two should be equal.
# NOTE(review): the parent-company count 7 is hard-coded, not computed.
print("parent companies in acquisitions table:", 7,
      ", in assignee table:", len(allParent), "=", len(parentCheck), "(if not equal, check merge)",
      "\n# distinct assignee IDs:", parentAssigneeID.size)

parent companies in acquisitions table: 7 , in assignee table: 275 = 275 (if not equal, check merge) 
# distinct assignee IDs: 63


In [105]:
# Step 6: join child and parent companies into one list
# childCompanies has no 'org' column, so its rows carry a missing 'org' value
# in the combined frame.
companies = pd.concat([childCompanies, allParent], axis=0, sort=True)

In [114]:
# Step 7: trim assignee table based on company names
# (inner join on the exact organization name, repeated rows removed)
trimAssignee = (
    assignee
    .merge(companies, on="organization", how="inner")
    .drop_duplicates(keep='first')
)

In [115]:
# preview the trimmed assignee table
trimAssignee.head()

Unnamed: 0,patent_id,assignee_id,organization,org
0,6000832,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
1,7458030,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
2,D766984,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
3,D702184,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft
4,8244819,237c2b0099548ddbfa5a37f07e0687ab,Microsoft Corporation,Microsoft


In [116]:
# Step 8: make new tsv file
export(trimAssignee, assigneeFile)

In [122]:
# reduced file to aid in merging: distinct (patent_id, assignee_id) pairs,
# used as the join key for trimming the other tables
assignee_patents = trimAssignee[['patent_id', 'assignee_id']].drop_duplicates(keep='first')
assignee_patents.head()

Unnamed: 0,patent_id,assignee_id
0,6000832,237c2b0099548ddbfa5a37f07e0687ab
1,7458030,237c2b0099548ddbfa5a37f07e0687ab
2,D766984,237c2b0099548ddbfa5a37f07e0687ab
3,D702184,237c2b0099548ddbfa5a37f07e0687ab
4,8244819,237c2b0099548ddbfa5a37f07e0687ab


In [123]:
# Sanity check: distinct values per column, taken from both the trimmed
# assignee table and the reduced patent/assignee pair table
justAssignee = trimAssignee[['assignee_id']].drop_duplicates(keep='first')
justPatents = trimAssignee[['patent_id']].drop_duplicates(keep='first')
justOrganization = trimAssignee[['organization', 'org']].drop_duplicates(keep='first')
justAssigneeTrim = assignee_patents[['assignee_id']].drop_duplicates(keep='first')
justPatentTrim = assignee_patents[['patent_id']].drop_duplicates(keep='first')

In [126]:
# Summary of the assignee trimming; each "a = b" pair should show equal numbers.
# NOTE(review): childCompanies is built without de-duplication, so the figure
# labelled "distinct child companies" is a row count; the parent-company
# count 7 is hard-coded.
print("in acquisition table, distinct child companies:", len(childCompanies),
      "distinct parent companies:", 7,
      "\n# parent companies spelling variations: ", len(allParent),
      "\nall companies in acquisitions table (includes name variations):", len(companies),
      "\ncompanies with assignee_id:", len(justAssignee), "=", len(justAssigneeTrim),
      "\ndistinct patents:", len(justPatents), "=", len(justPatentTrim),
      "\norganizations:",len(justOrganization), "out of", len(companies), "retained")

in acquisition table, distinct child companies: 916 distinct parent companies: 7 
# parent companies spelling variations:  275 
all companies in acquisitions table (includes name variations): 1191 
companies with assignee_id: 97 = 97 
distinct patents: 183561 = 183561 
organizations: 309 out of 1191 retained


## cpc_current
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [127]:
# read in data: load the raw cpc_current table and preview its first rows
dfCPC = read(cpc_currentData)
dfCPC.head()

Unnamed: 0,uuid,patent_id,section_id,subsection_id,group_id,subgroup_id,category,sequence
0,pkscb392v7o62jgv5e3tdyct2,3930272,A,A47,A47D,A47D7/02,inventional,0
1,s3x865wlce64sbxds2ubgheu4,3930272,Y,Y10,Y10T,Y10T403/32451,additional,1
2,638qht0an0gg7dc1ub3yeexm2,3930273,A,A61,A61G,A61G7/0507,inventional,0
3,34qqyersco0qh8jsn9bhdxy88,3930273,A,A61,A61G,A61G7/0509,inventional,1
4,sefi1no1nlmw0jjn82s68bhmq,3930274,B,B63,B63B,B63B7/085,inventional,0


In [128]:
# clean table: deep-copy the raw frame and drop the listed columns
cpc_current = (
    dfCPC.copy(deep=True)
    .drop(columns=['uuid', 'category', 'sequence'])
)

In [129]:
# trim table: keep only classification rows whose patent_id appears in
# assignee_patents; both sides are cast to str so the join keys compare equal
dtype = {"patent_id": str}
trimCPC_current = (
    cpc_current.astype(dtype)
    .merge(assignee_patents.astype(dtype), how='inner')
    .drop(columns='assignee_id')
    .drop_duplicates(keep="first")
)
trimCPC_current.head()

Unnamed: 0,patent_id,section_id,subsection_id,group_id,subgroup_id
0,3930729,G,G01,G01B,G01B9/02097
1,3930729,G,G01,G01B,G01B2290/25
2,3930857,G,G03,G03F,G03F1/54
3,3930857,G,G03,G03F,G03F1/50
4,3930857,G,G03,G03F,G03F7/022


In [130]:
# Sanity check: distinct patent ids present in the trimmed table
justPatent = trimCPC_current.drop_duplicates(subset='patent_id', keep='first')['patent_id']

In [132]:
# row counts before and after trimming, plus the number of distinct patents kept
print("rows original:", len(dfCPC),
      " rows trimmed:", len(trimCPC_current),
      "\n# distinct patents in trim:", len(justPatent))

rows original: 35208720  rows trimmed: 842357 
# distinct patents in trim: 175504


In [133]:
# export file to tsv: write the trimmed classification table to cpc_currentFile
export(trimCPC_current, cpc_currentFile)

## group
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [105]:
# read in data: load the CPC group table and preview its first rows
dfGroup = read(groupData)
dfGroup.head()

Unnamed: 0,id,title
0,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
1,A01C,PLANTING; SOWING; FERTILISING
2,A01D,HARVESTING; MOWING
3,A01F,PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW ...
4,A01G,"HORTICULTURE; CULTIVATION OF VEGETABLES, FLOWE..."


In [106]:
# the group table is exported as loaded; no columns are dropped or rows trimmed
export(dfGroup, groupFile)

## subgroup
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [107]:
# read in data: load the CPC subgroup table and preview its first rows
dfSubgroup = read(subgroupData)
dfSubgroup.head()

Unnamed: 0,id,title
0,A01B1/00,Hand tools
1,A01B1/02,Hand tools -Spades; Shovels
2,A01B1/022,Hand tools -Spades; Shovels -Collapsible; exte...
3,A01B1/024,Hand tools -Spades; Shovels -Foot protectors a...
4,A01B1/026,Hand tools -Spades; Shovels -with auxiliary ha...


In [108]:
# the subgroup table is exported as loaded; no columns are dropped or rows trimmed
export(dfSubgroup,subgroupFile)

## subsection
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [109]:
# read in data: load the CPC subsection table and preview its first rows
dfSubsection = read(subsectionData)
dfSubsection.head()

Unnamed: 0,id,title
0,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...
1,A21,BAKING; EDIBLE DOUGHS
2,A22,BUTCHERING; MEAT TREATMENT; PROCESSING POULTRY...
3,A23,"FOODS OR FOODSTUFFS; THEIR TREATMENT, NOT COVE..."
4,A24,TOBACCO; CIGARS; CIGARETTES; SMOKERS' REQUISITES


In [110]:
# the subsection table is exported as loaded; no columns are dropped or rows trimmed
export(dfSubsection,subsectionFile)

## patent
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [134]:
# read in data: load the raw patent table and preview its first rows.
# read() skips malformed lines, which is what the "Skipping line ..."
# messages in this cell's output report.
dfPatent = read(patentData)
dfPatent.head()

b'Skipping line 4243120: expected 11 fields, saw 12\n'
b'Skipping line 4277941: expected 11 fields, saw 12\nSkipping line 4308329: expected 11 fields, saw 12\n'
b'Skipping line 4348258: expected 11 fields, saw 12\nSkipping line 4390841: expected 11 fields, saw 12\n'
b'Skipping line 4400719: expected 11 fields, saw 12\n'
  if self.run_code(code, result):


Unnamed: 0,id,type,number,country,date,abstract,title,kind,num_claims,filename,withdrawn
0,3930271,utility,3930271,US,1976-01-06,A golf glove is disclosed having an extra fin...,Golf glove,A,4.0,pftaps19760106_wk01.zip,0.0
1,3930272,utility,3930272,US,1976-01-06,A lock for a height-adjustable crib or plaype...,Crib leg lock,A,3.0,pftaps19760106_wk01.zip,0.0
2,3930273,utility,3930273,US,1976-01-06,A bed safety side rail arrangement which incl...,Bed safety side rail arrangement,A,24.0,pftaps19760106_wk01.zip,0.0
3,3930274,utility,3930274,US,1976-01-06,The assembly includes a longitudinal axis and...,Assembly for use in recreational activities,A,7.0,pftaps19760106_wk01.zip,0.0
4,3930275,utility,3930275,US,1976-01-06,A novel slipper and its method of fabrication...,Method of fabricating a slipper,A,9.0,pftaps19760106_wk01.zip,0.0


In [135]:
# clean table: deep-copy the raw frame, keep rows whose country is 'US',
# then drop the listed columns
patent = (
    dfPatent.copy(deep=True)
    .loc[lambda frame: frame['country'] == 'US']
    .drop(columns=['type', 'country', 'kind', 'filename', 'withdrawn'])
)
patent.head()

Unnamed: 0,id,number,date,abstract,title,num_claims
0,3930271,3930271,1976-01-06,A golf glove is disclosed having an extra fin...,Golf glove,4.0
1,3930272,3930272,1976-01-06,A lock for a height-adjustable crib or plaype...,Crib leg lock,3.0
2,3930273,3930273,1976-01-06,A bed safety side rail arrangement which incl...,Bed safety side rail arrangement,24.0
3,3930274,3930274,1976-01-06,The assembly includes a longitudinal axis and...,Assembly for use in recreational activities,7.0
4,3930275,3930275,1976-01-06,A novel slipper and its method of fabrication...,Method of fabricating a slipper,9.0


In [None]:
# trim table: keep only patents whose id appears in assignee_patents; both
# join keys are cast to str before the merge
dtype1 = {"id": str}
dtype2 = {"patent_id": str}
trimPatent = (
    patent.astype(dtype1)
    .merge(assignee_patents.astype(dtype2),
           left_on='id', right_on='patent_id', how='inner')
    .drop(columns=['assignee_id', 'id', 'number', 'abstract'])
    .drop_duplicates(keep='first')
)
trimPatent.head()

In [None]:
# add a 'year' column holding the first four characters of each date string
trimPatent['year'] = trimPatent['date'].map(year)
trimPatent.head()

In [140]:
# row counts of the raw patent table and of the trimmed table
print("rows original: ", len(dfPatent), 
      "rows trimmed: ", len(trimPatent))

rows original:  6657472 rows trimmed:  183453


In [112]:
# write the trimmed patent table to patentFile
export(trimPatent, patentFile)

## location_assignee
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [144]:
# original data: load the location_id / assignee_id link table and preview it
dfLocationAssignee = read(location_assigneeData)
dfLocationAssignee.head()

Unnamed: 0,location_id,assignee_id
0,wy09e6twn0s1,eaa92f175be7bfb71011f17eafb1e71f
1,406gqa22ukdm,e572ad43a89039b0d72acc4ce970a33f
2,lulvs12ykwd7,8ce825a978eebf26ad2c13de6e370bb3
3,tdk0ut5vx9ki,6c00cb129070696ef109f6264da00318
4,rfmxwk4iedfc,dabf354c29a6ebba31f54b9ed042241d


In [145]:
# trim table: keep only links whose assignee_id appears in assignee_patents;
# the join key is cast to str on both sides
dtype = {"assignee_id": str}
trimLocation_assignee = (
    dfLocationAssignee.astype(dtype)
    .merge(assignee_patents.astype(dtype), how='inner')
    .drop(columns='patent_id')
    .drop_duplicates(keep="first")
)
trimLocation_assignee.head()

Unnamed: 0,location_id,assignee_id
0,qxpuc04niivb,b12e2d8b345facb1f8bf63ce20573dd4
877,6qcwqs98tfh0,b12e2d8b345facb1f8bf63ce20573dd4
1754,e1aw0l0xbcd6,b12e2d8b345facb1f8bf63ce20573dd4
2631,23pta3jjpriv,237c2b0099548ddbfa5a37f07e0687ab
32362,656n7m0unkh8,237c2b0099548ddbfa5a37f07e0687ab


In [146]:
# Row counts before and after trimming.  len() counts rows directly, instead
# of dividing the cell count (.size) by a hard-coded number of columns.
print("rows original: ", len(dfLocationAssignee),
      "rows trimmed: ", len(trimLocation_assignee))

rows original:  558466.0 rows trimmed:  331.0


In [147]:
# sanity check: number of distinct locations and assignees kept after trimming
# (justAssignee is rebound here; it was also used in the assignee section)
justLocation = trimLocation_assignee.drop_duplicates(subset='location_id', keep='first')['location_id']
justAssignee = trimLocation_assignee.drop_duplicates(subset='assignee_id', keep='first')['assignee_id']
print("# location_id:",justLocation.size,
     "# assignee_id:",justAssignee.size)

# location_id: 199 # assignee_id: 97


In [148]:
# write the trimmed location/assignee link table to location_assigneeFile
export(trimLocation_assignee, location_assigneeFile)

## location
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [141]:
# read in data: load the raw location table and preview its first rows.
# read() skips malformed lines, which is what the "Skipping line ..."
# message in this cell's output reports.
dfLocation = read(locationData)
dfLocation.head()

b'Skipping line 7470: expected 9 fields, saw 10\nSkipping line 44098: expected 9 fields, saw 10\n'


Unnamed: 0,id,city,state,country,latitude,longitude,county,state_fips,county_fips
0,0009wn7out97,Kaiserslautern,,DE,49.4123,7.69879,,,
1,000fg4sjgke4,Kiryat Ekron,,IL,31.8575,34.8223,,,
2,000wvaottuxr,Oberostendorf,BY,DE,47.9437,10.7433,,,
3,000yomlxfffl,Spruce Pine,NC,US,35.9153,-82.0647,Grassy Creek,37.0,
4,000zaps28vbi,Bridge City Westwego,LA,US,29.9179,-90.1663,Jefferson Parish,22.0,51.0


In [142]:
# clean table: deep-copy the raw frame, keep rows whose country is 'US',
# then drop the listed columns (id, city, latitude and longitude remain)
location = (
    dfLocation.copy(deep=True)
    .loc[lambda frame: frame['country'] == 'US']
    .drop(columns=['country', 'state', 'county', 'state_fips', 'county_fips'])
)
location.head()

Unnamed: 0,id,city,latitude,longitude
3,000yomlxfffl,Spruce Pine,35.9153,-82.0647
4,000zaps28vbi,Bridge City Westwego,29.9179,-90.1663
5,002ui5ctlmse,Dahlonega,34.5261,-83.9844
7,0043o8y61p3z,Rancho Bernardo,33.0186,-117.06
17,00a4lw6chrti,Levering,45.6358,-84.787


In [149]:
# trim table: keep only locations linked to a retained assignee through
# trimLocation_assignee; both join keys are cast to str before the merge
dtype1 = {"location_id": str}
dtype2 = {"id": str}
trimLocation = (
    location.astype(dtype2)
    .merge(trimLocation_assignee.astype(dtype1),
           how='inner', left_on="id", right_on="location_id")
    .drop(columns=['assignee_id', 'location_id'])
    .drop_duplicates(keep="first")
)
trimLocation.head()

Unnamed: 0,id,city,latitude,longitude
0,00ajij86hkk9,Renton,47.4545,-122.223
1,03gsxpxq1ltp,White Plains,41.034,-73.7629
2,067ckw1czast,Secaucus,40.7895,-74.0565
3,0grbym9qhe5d,Poway,32.9628,-117.036
4,1udrm9haouwb,El Segundo,33.9192,-118.416


In [150]:
# Sanity check: distinct location ids present in the trimmed table
justId = trimLocation.drop_duplicates(subset='id', keep="first")['id']

In [151]:
# Row counts of the cleaned and the trimmed location tables.  len() counts
# rows directly, instead of dividing .size by a hard-coded column count.
print("rows original: ", len(location),
      "rows trimmed: ", len(trimLocation),
      "\n# location ids:", len(justId))

rows original:  29065.0 rows trimmed:  145.0 
# location ids: 145


In [152]:
# write the trimmed location table to locationFile
export(trimLocation,locationFile)

## us_term_of_grant
[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [153]:
# read in data: load the raw us_term_of_grant table and preview its first rows
dfGrant = read(us_term_of_grantData)
dfGrant.head()

Unnamed: 0,uuid,patent_id,lapse_of_patent,disclaimer_date,term_disclaimer,term_grant,term_extension
0,0000hq4800wcm85ghvyz34mbe,D657425,,0000-00-00,,14.0,
1,0002e24jgnd6uspokbvss6xex,D699845,,0000-00-00,,14.0,
2,0002s1xi2xuinyor7g5w5veu6,D525308,,0000-00-00,,14.0,
3,000315mx91ci258k7nwyve9pg,9193114,,0000-00-00,,,408.0
4,0003hd31mgyh3f2x7j8stmuq6,D532925,,0000-00-00,,14.0,


In [154]:
# clean table: deep-copy the raw frame and drop the listed columns
# (patent_id and disclaimer_date remain); nothing is written to disk here
grant = (
    dfGrant.copy(deep=True)
    .drop(columns=['uuid', 'lapse_of_patent', 'term_disclaimer',
                   'term_grant', 'term_extension'])
)
grant.head()

Unnamed: 0,patent_id,disclaimer_date
0,D657425,0000-00-00
1,D699845,0000-00-00
2,D525308,0000-00-00
3,9193114,0000-00-00
4,D532925,0000-00-00


In [155]:
# trim table: keep only grant rows whose patent_id appears in
# assignee_patents; the join key is cast to str on both sides
dtype = {"patent_id": str}
trimGrant = (
    grant.astype(dtype)
    .merge(assignee_patents.astype(dtype), how='inner')
    .drop(columns='assignee_id')
    .drop_duplicates(keep="first")
)
trimGrant.head()

Unnamed: 0,patent_id,disclaimer_date
0,7797754,0000-00-00
1,8490014,0000-00-00
2,8918651,0000-00-00
3,7661088,0000-00-00
4,D535663,0000-00-00


In [156]:
# Row counts before and after trimming.  len() counts rows directly, instead
# of dividing .size by hard-coded column counts (7 and 2).
print("original rows: ", len(dfGrant),
      "trimmed rows:", len(trimGrant))

original rows:  3104452.0 trimmed rows: 123381.0


In [102]:
# write the trimmed term-of-grant table to us_term_of_grantFile
export(trimGrant, us_term_of_grantFile)

In [85]:
# dfGrant[dfGrant['disclaimer_date']!='0000-00-00'].head(50)

Unnamed: 0,uuid,patent_id,lapse_of_patent,disclaimer_date,term_disclaimer,term_grant,term_extension
164,002aq634gkvojyzmxlz8g0wno,D334422,,2005-03-12,,14.0,
172,002cmto548ezdiejy8i2a28qq,5089490,,2005-07-19,,,
182,002h7kmntnf61xhsn3mcrj4al,4805394,,2002-11-21,,,
249,003gr6ne1h4286un48q7mr0b1,D387548,,2011-11-11,,14.0,
268,003omfcsvfmp4lvqyymew3ai0,4591481,,2001-02-14,,,
313,004dvl0nxz3kwgzs5qe4fazf5,5216281,,2008-08-27,,,
385,005gbradp0hvjubfcfz9tjq3k,4764874,,2005-05-03,,,
514,007gbj8mt28y1lxo5cvjygjy8,5229026,,2008-10-01,,,
606,008rm9t62n6as0bjthe31xgay,D360500,,2009-06-20,,14.0,
674,009s40sos1n7ikkqzfrq7i3zx,5468151,,2011-07-19,,,


## ~~application~~

[filenames](#Filenames) 
 | [acquisitions](#acquisitions)
 | ~~[application](#application)~~
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [5]:
# read in data: load the application table (section marked as removed in the
# navigation links) and preview its first rows
dfApplication = read("application.tsv")
dfApplication.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,patent_id,series_code,number,country,date
0,02/002761,D345393,2,2002761,US,1992-12-21
1,02/007691,5164715,2,2007691,US,1990-04-10
2,02/010248,5177974,2,2010248,US,1988-06-23
3,02/020141,5379515,2,2020141,US,1994-02-16
4,02/027172,5264790,2,2027172,US,1991-07-01


In [6]:
# clean table and print out to csv (comma-separated, written with to_csv
# directly rather than through export())
# NOTE(review): the saved output of this cell still shows an 'id' column and
# no 'number' column, which does not match the columns dropped here; confirm
# which columns were meant to be removed.
application = (
    dfApplication.copy(deep=True)
    .drop(columns=['id', 'series_code'])
)
application.to_csv('application_clean.csv', index=False, sep=',', encoding='utf-8')
application.head()

Unnamed: 0,id,patent_id,country,date
0,02/002761,D345393,US,1992-12-21
1,02/007691,5164715,US,1990-04-10
2,02/010248,5177974,US,1988-06-23
3,02/020141,5379515,US,1994-02-16
4,02/027172,5264790,US,1991-07-01
