# Data Cleaning

Navigate to:  [acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

## Summary

 - acquisitions = "acquisitions_clean.csv"
 - assignee = "trimAssignee_clean.tsv"  *
 - cpc_current = "trimCPC_current_clean.tsv"
 - group = "group_clean.tsv"  *
 - subgroup = "subgroup_clean.tsv"   *
 - subsection = "subsection_clean.tsv"  *
 - patent = "trimPatent_clean.tsv"  *
 - location = "location_clean.tsv"  *
 - location_assignee = "location_assignee_clean.csv"
 - us_term_of_grant = "us_term_of_grant_clean.csv" <br>
 <i>* .tsv because one or more columns contain commas </i>

companies = ["Google", "Microsoft", "Facebook", "Apple", "Yahoo", "Twitter", "IBM"]

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## acquisitions
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [2]:
# Load the raw acquisitions table; row 0 of the file supplies the column names.
dataAcquisitions = pd.read_csv(
    'acquisitions.csv',
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfAcquisitions = pd.DataFrame(data=dataAcquisitions)
# Preview the first five rows.
dfAcquisitions.head(n=5)

Unnamed: 0,AcquisitionID,AcquisitionMonth,AcquisitionMonthDate,AcquisitionYear,Company,Business,Country,Value (USD),Derived products,ParentCompany
0,ACQ99,November,11.0,2015,bebop,Cloud software,USA,380000000.0,Google Cloud Platform,Google
1,ACQ98,November,11.0,2015,Fly Labs,Video editing,USA,,Google Photos,Google
2,ACQ97,December,8.0,2015,Clearleap,Cloud-based video management,USA,,,IBM
3,ACQ96,December,18.0,2015,Metanautix,Big Data Analytics,USA,,,Microsoft
4,ACQ95,December,21.0,2015,"Talko, Inc.",Mobile communications,USA,,,Microsoft


In [4]:
# Build the cleaned acquisitions table from a deep copy of the raw frame:
# discard the columns that are not needed downstream and rename "Company"
# to "ChildCompany", then write the result to CSV.
acquisitions = (
    dfAcquisitions
    .copy(deep=True)
    .drop(columns=[
        'AcquisitionID',
        'AcquisitionMonth',
        'AcquisitionMonthDate',
        'Country',
        'Business',
        'Value (USD)',
        'Derived products',
    ])
    .rename(columns={"Company": "ChildCompany"})
)
acquisitions.to_csv('acquisitions_clean.csv', sep=',', index=False, encoding='utf-8')
acquisitions.head(n=5)

Unnamed: 0,AcquisitionYear,ChildCompany,ParentCompany
0,2015,bebop,Google
1,2015,Fly Labs,Google
2,2015,Clearleap,IBM
3,2015,Metanautix,Microsoft
4,2015,"Talko, Inc.",Microsoft


## application
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [5]:
# Load the raw, tab-separated application table; row 0 holds the column names.
dataApplication = pd.read_csv(
    'application.tsv',
    sep="\t",
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfApplication = pd.DataFrame(data=dataApplication)
# Preview the first five rows.
dfApplication.head(n=5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,patent_id,series_code,number,country,date
0,02/002761,D345393,2,2002761,US,1992-12-21
1,02/007691,5164715,2,2007691,US,1990-04-10
2,02/010248,5177974,2,2010248,US,1988-06-23
3,02/020141,5379515,2,2020141,US,1994-02-16
4,02/027172,5264790,2,2027172,US,1991-07-01


In [6]:
# Build the cleaned application table from a deep copy of the raw frame,
# without the series_code and number columns, and write it to CSV.
application = (
    dfApplication
    .copy(deep=True)
    .drop(columns=['series_code', 'number'])
)
application.to_csv('application_clean.csv', sep=',', index=False, encoding='utf-8')
application.head(n=5)

Unnamed: 0,id,patent_id,country,date
0,02/002761,D345393,US,1992-12-21
1,02/007691,5164715,US,1990-04-10
2,02/010248,5177974,US,1988-06-23
3,02/020141,5379515,US,1994-02-16
4,02/027172,5264790,US,1991-07-01


In [10]:
# Spot check: select the raw application row(s) whose id is "14/463940".
answer = dfApplication[dfApplication['id'].eq("14/463940")]
answer

Unnamed: 0,id,patent_id,series_code,number,country,date
5829105,14/463940,9881645,14,14463940,US,2014-08-20


## assignee
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [3]:
# Load the raw, tab-separated assignee table; row 0 holds the column names.
dataAssignee = pd.read_csv(
    'rawassignee.tsv',
    sep='\t',
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfAssignee = pd.DataFrame(data=dataAssignee)
# Preview the first five rows.
dfAssignee.head(n=5)

Unnamed: 0,uuid,patent_id,assignee_id,rawlocation_id,type,name_first,name_last,organization,sequence
0,0000p94wkezw94s8cz7dbxlvz,5856666,eaa92f175be7bfb71011f17eafb1e71f,orskbf54s58e97lkmw8na5rpx,2,,,U.S. Philips Corporation,0
1,00013vk881wap9u4mbo7lwwhp,5204210,e572ad43a89039b0d72acc4ce970a33f,mue862v5lcjdhzqqk86ei75kj,2,,,Xerox Corporation,0
2,000192sn2u10kzpikl4s7h3r0,5302149,8ce825a978eebf26ad2c13de6e370bb3,o1h9dqdv0yq7dt1b1vmrcal9h,3,,,Commonwealth Scientific & Industrial Research ...,1
3,0001ycvv6sz1ju07ss99nhxi1,9104354,6c00cb129070696ef109f6264da00318,rspbpqcajvm09r1ew9mgnpx37,3,,,Canon Kabushiki Kaisha,0
4,0001z7ws4m14aqdb3tv99u550,6584517,dabf354c29a6ebba31f54b9ed042241d,l1gyelp5jcg0hakk9smmhsdgr,2,,,Cypress Semiconductor Corp.,0


In [13]:
# Build the cleaned assignee table from a deep copy of the raw frame,
# dropping the uuid, type, personal-name and sequence columns, then write
# it out tab-separated.
assignee = (
    dfAssignee
    .copy(deep=True)
    .drop(columns=[
        'uuid',
        'type',
        'name_first',
        'name_last',
        'sequence',
    ])
)
assignee.to_csv('assignee_clean.tsv', sep='\t', index=False, encoding='utf-8')
assignee.head(n=5)

Unnamed: 0,patent_id,assignee_id,rawlocation_id,organization
0,5856666,eaa92f175be7bfb71011f17eafb1e71f,orskbf54s58e97lkmw8na5rpx,U.S. Philips Corporation
1,5204210,e572ad43a89039b0d72acc4ce970a33f,mue862v5lcjdhzqqk86ei75kj,Xerox Corporation
2,5302149,8ce825a978eebf26ad2c13de6e370bb3,o1h9dqdv0yq7dt1b1vmrcal9h,Commonwealth Scientific & Industrial Research ...
3,9104354,6c00cb129070696ef109f6264da00318,rspbpqcajvm09r1ew9mgnpx37,Canon Kabushiki Kaisha
4,6584517,dabf354c29a6ebba31f54b9ed042241d,l1gyelp5jcg0hakk9smmhsdgr,Cypress Semiconductor Corp.


### Trim Assignee File

Retaining only tuples for child and parent companies in acquisitions

In [98]:
# Step 1. obtain child company list
# Take the ChildCompany column as a one-column frame, renumber its rows from 0
# and rename the column to 'organization' (the column name used in `assignee`).
childCompanies = (
    acquisitions[['ChildCompany']]
    .reset_index(drop=True)
    .rename(columns={'ChildCompany': 'organization'})
)
childCompanies.head(n=5)

Unnamed: 0,organization
0,bebop
1,Fly Labs
2,Clearleap
3,Metanautix
4,"Talko, Inc."


In [125]:
# Step 2: using parent company general name, find all other variations
# Note, this step is not done with child companies to reduce chance of incorrect matches

# Each entry is passed to Series.str.match, which treats it as a regular
# expression anchored at the start of the organization string.
parentCompanies = ["Google", "Microsoft", "Facebook", "Apple ", "Yahoo", "Twitter ", "IBM ", "International Business Machines"]
# note space after Apple is intentional to eliminate "Applera"

# Collect one frame of distinct organization names per parent pattern and
# concatenate once at the end instead of growing a frame inside the loop.
parentVariations = []
for i in range(len(parentCompanies)):
    # find all variations on parent company names
    # (the original referenced the undefined name `parCompanies`, a NameError)
    parent = assignee[assignee['organization'].str.match(parentCompanies[i], na=False)]
    parent = parent['organization'].drop_duplicates(keep='first')
    # back to a one-column DataFrame named 'organization', rows renumbered from 0
    parent = parent.reset_index(drop=True).to_frame()
    parentVariations.append(parent)

allParent = pd.concat(parentVariations, axis=0, sort=True)

In [None]:
# Spot check a single pattern to confirm the prefix match selects the
# intended organizations (here: names starting with 'Apple ').
assignee.loc[assignee['organization'].str.match('Apple ', na=False)]

In [126]:
# Step 3: join child and parent companies into one list
# The list is used as the right-hand side of an inner merge on 'organization'.
# A repeated name would duplicate every matching assignee row, and pandas
# matches null keys against each other, so a missing company name would pull in
# every assignee row without an organization. Drop both before merging.
companies = (
    pd.concat([childCompanies, allParent], axis=0, sort=True)
    .dropna(subset=['organization'])
    .drop_duplicates(subset=['organization'], keep='first')
    .reset_index(drop=True)
)

In [None]:
# Display the combined child + parent company name list for inspection.
companies

In [127]:
# Step 4: trim assignee table based on company names
# Inner join keeps only assignee rows whose organization appears in `companies`.
trimAssignee = assignee.merge(
    companies,
    how="inner",
    on="organization",
)

In [128]:
# Preview the first rows only rather than dumping the whole trimmed table.
trimAssignee.head(10)

Unnamed: 0,patent_id,assignee_id,rawlocation_id,organization
0,6000832,237c2b0099548ddbfa5a37f07e0687ab,6uqya1s9vanjrgbu98n5msc62,Microsoft Corporation
1,7458030,237c2b0099548ddbfa5a37f07e0687ab,7ix2ctvhyeg9xwtql320uay3t,Microsoft Corporation
2,D766984,237c2b0099548ddbfa5a37f07e0687ab,ie71vuenix2erylfryrr6carv,Microsoft Corporation
3,D702184,237c2b0099548ddbfa5a37f07e0687ab,qa5ahhmnukxoceaehvelqf1gv,Microsoft Corporation
4,8244819,237c2b0099548ddbfa5a37f07e0687ab,peif0z6buri9sdtumghf10s7o,Microsoft Corporation
5,7904883,237c2b0099548ddbfa5a37f07e0687ab,mbzpgzia61xa847twygc6p4xz,Microsoft Corporation
6,D726203,237c2b0099548ddbfa5a37f07e0687ab,tgme9yc135829n5g98lrepqif,Microsoft Corporation
7,8478837,237c2b0099548ddbfa5a37f07e0687ab,diz6qfzuc1zhwtq7pp6ejbmyw,Microsoft Corporation
8,7126606,237c2b0099548ddbfa5a37f07e0687ab,pddphzp055l3l1rifh4wjgu5d,Microsoft Corporation
9,8948213,237c2b0099548ddbfa5a37f07e0687ab,q36dwr9tauioso2clgogtmwdt,Microsoft Corporation


In [129]:
# Step 5: make new tsv file
# Written tab-separated, without the index column.
trimAssignee.to_csv(
    'trimAssignee_clean.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
)

In [147]:
# A simpler structure: drop assignee_id, rawlocation_id and organization from
# the trimmed assignee table and keep one row per distinct remaining record
# (i.e. the distinct patent ids when patent_id is the only column left).
assignee_patents = (
    trimAssignee
    .drop(columns=['assignee_id', 'rawlocation_id', 'organization'])
    .drop_duplicates(keep='first')
)

In [146]:
# DataFrame.size is the number of cells (rows x columns), not the row count.
assignee_patents.size

184421

## cpc_current
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [8]:
# Load the raw, tab-separated cpc_current table; row 0 holds the column names.
dataCPC = pd.read_csv(
    'cpc_current.tsv',
    sep='\t',
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfCPC = pd.DataFrame(data=dataCPC)
# Preview the first five rows.
dfCPC.head(n=5)

Unnamed: 0,uuid,patent_id,section_id,subsection_id,group_id,subgroup_id,category,sequence
0,pkscb392v7o62jgv5e3tdyct2,3930272,A,A47,A47D,A47D7/02,inventional,0
1,s3x865wlce64sbxds2ubgheu4,3930272,Y,Y10,Y10T,Y10T403/32451,additional,1
2,638qht0an0gg7dc1ub3yeexm2,3930273,A,A61,A61G,A61G7/0507,inventional,0
3,34qqyersco0qh8jsn9bhdxy88,3930273,A,A61,A61G,A61G7/0509,inventional,1
4,sefi1no1nlmw0jjn82s68bhmq,3930274,B,B63,B63B,B63B7/085,inventional,0


In [9]:
# Build the cleaned cpc_current table from a deep copy of the raw frame,
# without the uuid, category and sequence columns, and write it to CSV.
cpc_current = (
    dfCPC
    .copy(deep=True)
    .drop(columns=['uuid', 'category', 'sequence'])
)
cpc_current.to_csv('cpc_current_clean.csv', sep=',', index=False, encoding='utf-8')
cpc_current.head(n=5)

Unnamed: 0,patent_id,section_id,subsection_id,group_id,subgroup_id
0,3930272,A,A47,A47D,A47D7/02
1,3930272,Y,Y10,Y10T,Y10T403/32451
2,3930273,A,A61,A61G,A61G7/0507
3,3930273,A,A61,A61G,A61G7/0509
4,3930274,B,B63,B63B,B63B7/085


In [153]:
# Cast patent_id to str on both sides so the join keys share one dtype, then
# inner-join on the columns the two frames have in common.
dtype = {'patent_id': str}
trimCPC_current = (
    cpc_current
    .astype(dtype)
    .merge(assignee_patents.astype(dtype), how='inner')
)

In [154]:
# DataFrame.size is the number of cells (rows x columns), not the row count.
trimCPC_current.size

5230656

In [157]:
# Remove assignee_id if the merge carried it over. assignee_patents is built
# by dropping assignee_id, so on a fresh top-to-bottom run the column is absent
# and an unconditional drop would raise KeyError; errors='ignore' handles both cases.
trimCPC_current1 = trimCPC_current.drop(columns='assignee_id', errors='ignore')

In [159]:
# Write the trimmed CPC table tab-separated, without the index column.
trimCPC_current1.to_csv('trimCPC_current_clean.tsv', sep='\t', index=False, encoding='utf-8')
# to_csv returns None when given a path, so preview the written table explicitly.
trimCPC_current1.head(10)

Unnamed: 0,patent_id,section_id,subsection_id,group_id,subgroup_id
0,3930729,G,G01,G01B,G01B9/02097
1,3930729,G,G01,G01B,G01B2290/25
2,3930857,G,G03,G03F,G03F1/54
3,3930857,G,G03,G03F,G03F1/50
4,3930857,G,G03,G03F,G03F7/022
5,3930857,G,G03,G03F,G03F7/20
6,3930857,Y,Y10,Y10S,Y10S438/948
7,3930857,Y,Y10,Y10S,Y10S438/98
8,3930870,H,H01,H01L,H01L21/30625
9,3931435,G,G03,G03F,G03F7/039


## group
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [16]:
# Load the raw, tab-separated CPC group table; row 0 holds the column names.
dataGroup = pd.read_csv(
    'cpc_group.tsv',
    sep='\t',
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfGroup = pd.DataFrame(data=dataGroup)
# Preview the first five rows.
dfGroup.head(n=5)

Unnamed: 0,id,title
0,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
1,A01C,PLANTING; SOWING; FERTILISING
2,A01D,HARVESTING; MOWING
3,A01F,PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW ...
4,A01G,"HORTICULTURE; CULTIVATION OF VEGETABLES, FLOWE..."


In [50]:
# No columns are removed here: take a deep copy of the raw group table and
# write it out tab-separated without the index.
group = dfGroup.copy(deep=True)
group.to_csv(
    'group_clean.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
)
group.head(n=5)

Unnamed: 0,id,title
0,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
1,A01C,PLANTING; SOWING; FERTILISING
2,A01D,HARVESTING; MOWING
3,A01F,PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW ...
4,A01G,"HORTICULTURE; CULTIVATION OF VEGETABLES, FLOWE..."


## subgroup
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [18]:
# Load the raw, tab-separated CPC subgroup table; row 0 holds the column names.
dataSubgroup = pd.read_csv(
    'cpc_subgroup.tsv',
    sep='\t',
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfSubgroup = pd.DataFrame(data=dataSubgroup)
# Preview the first five rows.
dfSubgroup.head(n=5)

Unnamed: 0,id,title
0,A01B1/00,Hand tools
1,A01B1/02,Hand tools -Spades; Shovels
2,A01B1/022,Hand tools -Spades; Shovels -Collapsible; exte...
3,A01B1/024,Hand tools -Spades; Shovels -Foot protectors a...
4,A01B1/026,Hand tools -Spades; Shovels -with auxiliary ha...


In [51]:
# No columns are removed here: take a deep copy of the raw subgroup table and
# write it out tab-separated without the index.
subgroup = dfSubgroup.copy(deep=True)
subgroup.to_csv(
    'subgroup_clean.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
)
subgroup.head(n=5)

Unnamed: 0,id,title
0,A01B1/00,Hand tools
1,A01B1/02,Hand tools -Spades; Shovels
2,A01B1/022,Hand tools -Spades; Shovels -Collapsible; exte...
3,A01B1/024,Hand tools -Spades; Shovels -Foot protectors a...
4,A01B1/026,Hand tools -Spades; Shovels -with auxiliary ha...


## subsection
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [23]:
# Load the raw, tab-separated CPC subsection table; row 0 holds the column names.
dataSubsection = pd.read_csv(
    'cpc_subsection.tsv',
    sep='\t',
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfSubsection = pd.DataFrame(data=dataSubsection)
# Preview the first five rows.
dfSubsection.head(n=5)

Unnamed: 0,id,title
0,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...
1,A21,BAKING; EDIBLE DOUGHS
2,A22,BUTCHERING; MEAT TREATMENT; PROCESSING POULTRY...
3,A23,"FOODS OR FOODSTUFFS; THEIR TREATMENT, NOT COVE..."
4,A24,TOBACCO; CIGARS; CIGARETTES; SMOKERS' REQUISITES


In [52]:
# No columns are removed here: take a deep copy of the raw subsection table and
# write it out tab-separated without the index.
subsection = dfSubsection.copy(deep=True)
subsection.to_csv(
    'subsection_clean.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
)

subsection.head(n=5)

Unnamed: 0,id,title
0,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...
1,A21,BAKING; EDIBLE DOUGHS
2,A22,BUTCHERING; MEAT TREATMENT; PROCESSING POULTRY...
3,A23,"FOODS OR FOODSTUFFS; THEIR TREATMENT, NOT COVE..."
4,A24,TOBACCO; CIGARS; CIGARETTES; SMOKERS' REQUISITES


## patent
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [163]:
# original data
# Some lines of patent.tsv have more fields than the header and must be skipped
# with a warning. `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 in favour of `on_bad_lines`; try the current keyword first and fall back
# to the legacy one on older pandas, which rejects the unknown keyword with a
# TypeError.
try:
    dataPatent = pd.read_csv('patent.tsv', sep="\t", header=0, on_bad_lines='warn')
except TypeError:
    dataPatent = pd.read_csv('patent.tsv', sep="\t", header=0, error_bad_lines=False)
dfPatent = pd.DataFrame(dataPatent)
dfPatent.head()

b'Skipping line 4243120: expected 11 fields, saw 12\n'
b'Skipping line 4277941: expected 11 fields, saw 12\nSkipping line 4308329: expected 11 fields, saw 12\n'
b'Skipping line 4348258: expected 11 fields, saw 12\nSkipping line 4390841: expected 11 fields, saw 12\n'
b'Skipping line 4400719: expected 11 fields, saw 12\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,type,number,country,date,abstract,title,kind,num_claims,filename,withdrawn
0,3930271,utility,3930271,US,1976-01-06,A golf glove is disclosed having an extra fin...,Golf glove,A,4.0,pftaps19760106_wk01.zip,0.0
1,3930272,utility,3930272,US,1976-01-06,A lock for a height-adjustable crib or plaype...,Crib leg lock,A,3.0,pftaps19760106_wk01.zip,0.0
2,3930273,utility,3930273,US,1976-01-06,A bed safety side rail arrangement which incl...,Bed safety side rail arrangement,A,24.0,pftaps19760106_wk01.zip,0.0
3,3930274,utility,3930274,US,1976-01-06,The assembly includes a longitudinal axis and...,Assembly for use in recreational activities,A,7.0,pftaps19760106_wk01.zip,0.0
4,3930275,utility,3930275,US,1976-01-06,A novel slipper and its method of fabrication...,Method of fabricating a slipper,A,9.0,pftaps19760106_wk01.zip,0.0


In [167]:
# Build the cleaned patent table from a deep copy of the raw frame: keep only
# rows whose country is 'US', then discard the type, country, kind, filename
# and withdrawn columns.
# The full cleaned table is not written to disk in this cell; the file output
# is the trimmed table, 'trimPatent_clean.tsv', written in a later cell.
patent = (
    dfPatent
    .copy(deep=True)
    .loc[lambda frame: frame['country'] == 'US']
    .drop(columns=['type', 'country', 'kind', 'filename', 'withdrawn'])
)

In [172]:
# Cast both join keys to str so they share one dtype, then keep only patents
# whose id appears in assignee_patents.
dtype1 = dict(id=str)
dtype2 = dict(patent_id=str)
trimPatent = pd.merge(patent.astype(dtype1), assignee_patents.astype(dtype2), left_on='id', right_on='patent_id', how='inner')
# Remove assignee_id if the merge carried it over. assignee_patents is built
# by dropping assignee_id, so on a fresh top-to-bottom run the column is absent
# and an unconditional drop would raise KeyError; errors='ignore' handles both cases.
trimPatent1 = trimPatent.drop(columns='assignee_id', errors='ignore')
trimPatent1.to_csv('trimPatent_clean.tsv', index=False, sep='\t', encoding='utf-8')

In [174]:
# Preview the first rows only rather than dumping the whole trimmed table.
trimPatent1.head(10)

Unnamed: 0,id,number,date,abstract,title,num_claims,patent_id
0,3930729,3930729,1976-01-06,An interferometer arrangement of either the t...,Interferometer apparatus incorporating a spher...,39.0,3930729
1,3930857,3930857,1976-01-06,"A resist mask, whose configuration is changed...",Resist process,7.0,3930857
2,3930870,3930870,1976-01-06,An improved process for preparing a polishing...,Silicon polishing solution preparation,16.0,3930870
3,3931435,3931435,1976-01-06,Very sensitive electron beam positive resists...,Electron beam positive resists containing acet...,5.0,3931435
4,3931531,3931531,1976-01-06,A counter circuit counts all transitions of t...,Overlapped signal transition counter,7.0,3931531
5,3931555,3931555,1976-01-06,A digital pulse source generates a frequency ...,Acceleration control system for a d-c motor,9.0,3931555
6,3931639,3931639,1976-01-06,A track or stripe tape positioner for a helic...,Transverse track location device with calibrat...,8.0,3931639
7,3931645,3931645,1976-01-06,A flexible disk file signal storage apparatus...,Flexible disk storage apparatus having disks o...,11.0,3931645
8,3931880,3931880,1976-01-13,Document handling apparatus for use in connec...,Document handling apparatus,10.0,3931880
9,3932026,3932026,1976-01-13,A display assembly in which a nematic liquid ...,Liquid crystal display assembly having dielect...,18.0,3932026


## location
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [55]:
# Load the raw, tab-separated location table; row 0 holds the column names.
dataLocation = pd.read_csv(
    'location2.tsv',
    sep="\t",
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfLocation = pd.DataFrame(data=dataLocation)
# Preview the first five rows.
dfLocation.head(n=5)

Unnamed: 0,id,city,country,latitude,longitude
0,0009wn7out97,Kaiserslautern,DE,49.4123,7.69879
1,000fg4sjgke4,Kiryat Ekron,IL,31.8575,34.8223
2,000wvaottuxr,Oberostendorf,DE,47.9437,10.7433
3,000yomlxfffl,Spruce Pine,US,35.9153,-82.0647
4,000zaps28vbi,Bridge City Westwego,US,29.9179,-90.1663


In [57]:
# Build the cleaned location table from a deep copy of the raw frame: keep only
# rows whose country is 'US', drop the now-constant country column, and write
# the result tab-separated. The original row labels are kept in memory.
location = (
    dfLocation
    .copy(deep=True)
    .loc[lambda frame: frame['country'] == 'US']
    .drop(columns=['country'])
)
location.to_csv('location_clean.tsv', sep='\t', index=False, encoding='utf-8')
location.head(n=5)

Unnamed: 0,id,city,latitude,longitude
3,000yomlxfffl,Spruce Pine,35.9153,-82.0647
4,000zaps28vbi,Bridge City Westwego,29.9179,-90.1663
5,002ui5ctlmse,Dahlonega,34.5261,-83.9844
7,0043o8y61p3z,Rancho Bernardo,33.0186,-117.06
17,00a4lw6chrti,Levering,45.6358,-84.787


## location_assignee
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [38]:
# Load the raw, tab-separated location_assignee table; row 0 holds the column names.
dataLocationAssignee = pd.read_csv(
    'location_assignee.tsv',
    sep="\t",
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfLocationAssignee = pd.DataFrame(data=dataLocationAssignee)
# Preview the first five rows.
dfLocationAssignee.head(n=5)

Unnamed: 0,location_id,assignee_id
0,wy09e6twn0s1,eaa92f175be7bfb71011f17eafb1e71f
1,406gqa22ukdm,e572ad43a89039b0d72acc4ce970a33f
2,lulvs12ykwd7,8ce825a978eebf26ad2c13de6e370bb3
3,tdk0ut5vx9ki,6c00cb129070696ef109f6264da00318
4,rfmxwk4iedfc,dabf354c29a6ebba31f54b9ed042241d


In [53]:
# No columns are removed here: take a deep copy of the raw location_assignee
# table and write it out as CSV without the index.
location_assignee = dfLocationAssignee.copy(deep=True)
location_assignee.to_csv(
    'location_assignee_clean.csv',
    sep=',',
    index=False,
    encoding='utf-8',
)
location_assignee.head(n=5)

Unnamed: 0,location_id,assignee_id
0,wy09e6twn0s1,eaa92f175be7bfb71011f17eafb1e71f
1,406gqa22ukdm,e572ad43a89039b0d72acc4ce970a33f
2,lulvs12ykwd7,8ce825a978eebf26ad2c13de6e370bb3
3,tdk0ut5vx9ki,6c00cb129070696ef109f6264da00318
4,rfmxwk4iedfc,dabf354c29a6ebba31f54b9ed042241d


## us_term_of_grant
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [patent](#patent)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [29]:
# Load the raw, tab-separated us_term_of_grant table; row 0 holds the column names.
dataExpired = pd.read_csv(
    'us_term_of_grant.tsv',
    sep="\t",
    header=0,
)
# Keep a separately named DataFrame for the cleaning cells that follow.
dfExpired = pd.DataFrame(data=dataExpired)
# Preview the first five rows.
dfExpired.head(n=5)

Unnamed: 0,uuid,patent_id,lapse_of_patent,disclaimer_date,term_disclaimer,term_grant,term_extension
0,0000hq4800wcm85ghvyz34mbe,D657425,,0000-00-00,,14.0,
1,0002e24jgnd6uspokbvss6xex,D699845,,0000-00-00,,14.0,
2,0002s1xi2xuinyor7g5w5veu6,D525308,,0000-00-00,,14.0,
3,000315mx91ci258k7nwyve9pg,9193114,,0000-00-00,,,408.0
4,0003hd31mgyh3f2x7j8stmuq6,D532925,,0000-00-00,,14.0,


In [54]:
# Build the cleaned us_term_of_grant table from a deep copy of the raw frame,
# dropping the uuid, lapse and term_* columns, then write it to CSV.
us_term_of_grant = (
    dfExpired
    .copy(deep=True)
    .drop(columns=[
        'uuid',
        'lapse_of_patent',
        'term_disclaimer',
        'term_grant',
        'term_extension',
    ])
)
us_term_of_grant.to_csv('us_term_of_grant_clean.csv', sep=',', index=False, encoding='utf-8')
us_term_of_grant.head(n=5)

Unnamed: 0,patent_id,disclaimer_date
0,D657425,0000-00-00
1,D699845,0000-00-00
2,D525308,0000-00-00
3,9193114,0000-00-00
4,D532925,0000-00-00


In [85]:
# Show raw rows whose disclaimer_date differs from '0000-00-00' (which appears
# to be the placeholder for "no date" in this table). The expression had been
# commented out although its output was kept; restored so the output is reproducible.
dfExpired[dfExpired['disclaimer_date']!='0000-00-00'].head(50)

Unnamed: 0,uuid,patent_id,lapse_of_patent,disclaimer_date,term_disclaimer,term_grant,term_extension
164,002aq634gkvojyzmxlz8g0wno,D334422,,2005-03-12,,14.0,
172,002cmto548ezdiejy8i2a28qq,5089490,,2005-07-19,,,
182,002h7kmntnf61xhsn3mcrj4al,4805394,,2002-11-21,,,
249,003gr6ne1h4286un48q7mr0b1,D387548,,2011-11-11,,14.0,
268,003omfcsvfmp4lvqyymew3ai0,4591481,,2001-02-14,,,
313,004dvl0nxz3kwgzs5qe4fazf5,5216281,,2008-08-27,,,
385,005gbradp0hvjubfcfz9tjq3k,4764874,,2005-05-03,,,
514,007gbj8mt28y1lxo5cvjygjy8,5229026,,2008-10-01,,,
606,008rm9t62n6as0bjthe31xgay,D360500,,2009-06-20,,14.0,
674,009s40sos1n7ikkqzfrq7i3zx,5468151,,2011-07-19,,,
