# Data Cleaning

Navigate to:  [acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

## Summary

 - acquisitions = "acquisitions_clean.csv"
 - application = "application_clean.csv"
 - assignee = "assignee_clean.tsv"
 - cpc_current = "cpc_current_clean.csv"
 - group = "group_clean.tsv"  *
 - subgroup = "subgroup_clean.tsv"   *
 - subsection = "subsection_clean.tsv"  *
 - location = "location_clean.tsv"  *
 - location_assignee = "location_assignee_clean.csv"
 - us_term_of_grant = "us_term_of_grant_clean.csv" <br>
 <i>* .tsv because one or more columns contain commas </i>

companies = ["Google", "Microsoft", "Facebook", "Apple", "Yahoo", "Twitter", "IBM"]

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## acquisitions
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [3]:
# Raw acquisitions table: comma-separated file whose first row holds the column names.
dataAcquisitions = pd.read_csv(filepath_or_buffer='acquisitions.csv', sep=',', header=0)
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfAcquisitions = pd.DataFrame(data=dataAcquisitions)
# Preview the first five rows.
dfAcquisitions.head(5)

Unnamed: 0,AcquisitionID,AcquisitionMonth,AcquisitionMonthDate,AcquisitionYear,Company,Business,Country,Value (USD),Derived products,ParentCompany
0,ACQ99,November,11.0,2015,bebop,Cloud software,USA,380000000.0,Google Cloud Platform,Google
1,ACQ98,November,11.0,2015,Fly Labs,Video editing,USA,,Google Photos,Google
2,ACQ97,December,8.0,2015,Clearleap,Cloud-based video management,USA,,,IBM
3,ACQ96,December,18.0,2015,Metanautix,Big Data Analytics,USA,,,Microsoft
4,ACQ95,December,21.0,2015,"Talko, Inc.",Mobile communications,USA,,,Microsoft


In [46]:
# Build the cleaned acquisitions table on an independent copy of the raw frame:
# drop the columns that are not kept, then rename "Company" to "ChildCompany"
# so it pairs with the existing "ParentCompany" column.
acquisitions = (
    dfAcquisitions
    .copy(deep=True)
    .drop(columns=[
        'AcquisitionID',
        'AcquisitionMonth',
        'AcquisitionMonthDate',
        'Country',
        'Business',
        'Value (USD)',
        'Derived products',
    ])
    .rename(columns={"Company": "ChildCompany"})
)
# Persist without the positional index.
acquisitions.to_csv('acquisitions_clean.csv', sep=',', index=False, encoding='utf-8')
acquisitions.head(5)

Unnamed: 0,AcquisitionYear,ChildCompany,ParentCompany
0,2015,bebop,Google
1,2015,Fly Labs,Google
2,2015,Clearleap,IBM
3,2015,Metanautix,Microsoft
4,2015,"Talko, Inc.",Microsoft


## application
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [5]:
# original data
# NOTE(review): the saved output of this cell starts with a truncated warning,
# presumably pandas' mixed-type DtypeWarning (patent_id holds both numeric ids
# and ids such as "D345393"). low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which is the remedy that
# warning recommends. Confirm against the full warning text.
dataApplication = pd.read_csv('application.tsv', sep="\t", header=0, low_memory=False)
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfApplication = pd.DataFrame(dataApplication)
dfApplication.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,patent_id,series_code,number,country,date
0,02/002761,D345393,2,2002761,US,1992-12-21
1,02/007691,5164715,2,2007691,US,1990-04-10
2,02/010248,5177974,2,2010248,US,1988-06-23
3,02/020141,5379515,2,2020141,US,1994-02-16
4,02/027172,5264790,2,2027172,US,1991-07-01


In [47]:
# Cleaned application table: work on an independent copy of the raw frame and
# drop the application id, series code and application number columns.
application = (
    dfApplication
    .copy(deep=True)
    .drop(columns=['id', 'series_code', 'number'])
)
# Persist as comma-separated text without the positional index.
application.to_csv('application_clean.csv', sep=',', index=False, encoding='utf-8')
application.head(5)

Unnamed: 0,patent_id,country,date
0,D345393,US,1992-12-21
1,5164715,US,1990-04-10
2,5177974,US,1988-06-23
3,5379515,US,1994-02-16
4,5264790,US,1991-07-01


## assignee
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [12]:
# Raw assignee records: tab-separated file whose first row holds the column names.
dataAssignee = pd.read_csv(filepath_or_buffer='rawassignee.tsv', header=0, sep='\t')
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfAssignee = pd.DataFrame(data=dataAssignee)
# Preview the first five rows.
dfAssignee.head(5)

Unnamed: 0,uuid,patent_id,assignee_id,rawlocation_id,type,name_first,name_last,organization,sequence
0,0000p94wkezw94s8cz7dbxlvz,5856666,eaa92f175be7bfb71011f17eafb1e71f,orskbf54s58e97lkmw8na5rpx,2,,,U.S. Philips Corporation,0
1,00013vk881wap9u4mbo7lwwhp,5204210,e572ad43a89039b0d72acc4ce970a33f,mue862v5lcjdhzqqk86ei75kj,2,,,Xerox Corporation,0
2,000192sn2u10kzpikl4s7h3r0,5302149,8ce825a978eebf26ad2c13de6e370bb3,o1h9dqdv0yq7dt1b1vmrcal9h,3,,,Commonwealth Scientific & Industrial Research ...,1
3,0001ycvv6sz1ju07ss99nhxi1,9104354,6c00cb129070696ef109f6264da00318,rspbpqcajvm09r1ew9mgnpx37,3,,,Canon Kabushiki Kaisha,0
4,0001z7ws4m14aqdb3tv99u550,6584517,dabf354c29a6ebba31f54b9ed042241d,l1gyelp5jcg0hakk9smmhsdgr,2,,,Cypress Semiconductor Corp.,0


In [58]:
# clean table and printing out to tsv
# Work on an independent copy so the raw frame stays untouched.
assignee = dfAssignee.copy(deep=True)
# NOTE(review): 'rawlocation_id' was missing from this drop list, yet the
# output saved with this cell shows only patent_id, assignee_id and
# organization. It is dropped here so that a fresh run reproduces the recorded
# table and file. Confirm that nothing downstream needs rawlocation_id.
assignee = assignee.drop(columns=['uuid',
                                  'rawlocation_id',
                                  'type',
                                  'name_first',
                                  'name_last',
                                  'sequence'])
# Written tab-separated, without the positional index.
assignee.to_csv('assignee_clean.tsv', index=False, sep='\t', encoding='utf-8')
assignee.head()

Unnamed: 0,patent_id,assignee_id,organization
0,5856666,eaa92f175be7bfb71011f17eafb1e71f,U.S. Philips Corporation
1,5204210,e572ad43a89039b0d72acc4ce970a33f,Xerox Corporation
2,5302149,8ce825a978eebf26ad2c13de6e370bb3,Commonwealth Scientific & Industrial Research ...
3,9104354,6c00cb129070696ef109f6264da00318,Canon Kabushiki Kaisha
4,6584517,dabf354c29a6ebba31f54b9ed042241d,Cypress Semiconductor Corp.


## cpc_current
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [14]:
# Raw CPC classification rows: tab-separated file whose first row holds the column names.
dataCPC = pd.read_csv(filepath_or_buffer='cpc_current.tsv', header=0, sep='\t')
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfCPC = pd.DataFrame(data=dataCPC)
# Preview the first five rows.
dfCPC.head(5)

Unnamed: 0,uuid,patent_id,section_id,subsection_id,group_id,subgroup_id,category,sequence
0,pkscb392v7o62jgv5e3tdyct2,3930272,A,A47,A47D,A47D7/02,inventional,0
1,s3x865wlce64sbxds2ubgheu4,3930272,Y,Y10,Y10T,Y10T403/32451,additional,1
2,638qht0an0gg7dc1ub3yeexm2,3930273,A,A61,A61G,A61G7/0507,inventional,0
3,34qqyersco0qh8jsn9bhdxy88,3930273,A,A61,A61G,A61G7/0509,inventional,1
4,sefi1no1nlmw0jjn82s68bhmq,3930274,B,B63,B63B,B63B7/085,inventional,0


In [49]:
# Cleaned cpc_current table: work on an independent copy of the raw frame and
# drop the row uuid, category and sequence columns, keeping the patent id and
# the four classification-level ids.
cpc_current = (
    dfCPC
    .copy(deep=True)
    .drop(columns=['uuid', 'category', 'sequence'])
)
# Persist as comma-separated text without the positional index.
cpc_current.to_csv('cpc_current_clean.csv', sep=',', index=False, encoding='utf-8')
cpc_current.head(5)

Unnamed: 0,patent_id,section_id,subsection_id,group_id,subgroup_id
0,3930272,A,A47,A47D,A47D7/02
1,3930272,Y,Y10,Y10T,Y10T403/32451
2,3930273,A,A61,A61G,A61G7/0507
3,3930273,A,A61,A61G,A61G7/0509
4,3930274,B,B63,B63B,B63B7/085


## group
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [16]:
# Raw CPC group titles: tab-separated file whose first row holds the column names.
dataGroup = pd.read_csv(filepath_or_buffer='cpc_group.tsv', header=0, sep='\t')
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfGroup = pd.DataFrame(data=dataGroup)
# Preview the first five rows.
dfGroup.head(5)

Unnamed: 0,id,title
0,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
1,A01C,PLANTING; SOWING; FERTILISING
2,A01D,HARVESTING; MOWING
3,A01F,PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW ...
4,A01G,"HORTICULTURE; CULTIVATION OF VEGETABLES, FLOWE..."


In [50]:
# No columns are removed for this table: take an independent copy of the raw
# frame and write it back out tab-separated, without the positional index.
group = dfGroup.copy(deep=True)
group.to_csv('group_clean.tsv', sep='\t', index=False, encoding='utf-8')
group.head(5)

Unnamed: 0,id,title
0,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
1,A01C,PLANTING; SOWING; FERTILISING
2,A01D,HARVESTING; MOWING
3,A01F,PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW ...
4,A01G,"HORTICULTURE; CULTIVATION OF VEGETABLES, FLOWE..."


## subgroup
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [18]:
# Raw CPC subgroup titles: tab-separated file whose first row holds the column names.
dataSubgroup = pd.read_csv(filepath_or_buffer='cpc_subgroup.tsv', header=0, sep='\t')
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfSubgroup = pd.DataFrame(data=dataSubgroup)
# Preview the first five rows.
dfSubgroup.head(5)

Unnamed: 0,id,title
0,A01B1/00,Hand tools
1,A01B1/02,Hand tools -Spades; Shovels
2,A01B1/022,Hand tools -Spades; Shovels -Collapsible; exte...
3,A01B1/024,Hand tools -Spades; Shovels -Foot protectors a...
4,A01B1/026,Hand tools -Spades; Shovels -with auxiliary ha...


In [51]:
# No columns are removed for this table: take an independent copy of the raw
# frame and write it back out tab-separated, without the positional index.
subgroup = dfSubgroup.copy(deep=True)
subgroup.to_csv('subgroup_clean.tsv', sep='\t', index=False, encoding='utf-8')
subgroup.head(5)

Unnamed: 0,id,title
0,A01B1/00,Hand tools
1,A01B1/02,Hand tools -Spades; Shovels
2,A01B1/022,Hand tools -Spades; Shovels -Collapsible; exte...
3,A01B1/024,Hand tools -Spades; Shovels -Foot protectors a...
4,A01B1/026,Hand tools -Spades; Shovels -with auxiliary ha...


## subsection
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [23]:
# Raw CPC subsection titles: tab-separated file whose first row holds the column names.
dataSubsection = pd.read_csv(filepath_or_buffer='cpc_subsection.tsv', header=0, sep='\t')
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfSubsection = pd.DataFrame(data=dataSubsection)
# Preview the first five rows.
dfSubsection.head(5)

Unnamed: 0,id,title
0,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...
1,A21,BAKING; EDIBLE DOUGHS
2,A22,BUTCHERING; MEAT TREATMENT; PROCESSING POULTRY...
3,A23,"FOODS OR FOODSTUFFS; THEIR TREATMENT, NOT COVE..."
4,A24,TOBACCO; CIGARS; CIGARETTES; SMOKERS' REQUISITES


In [52]:
# No columns are removed for this table: take an independent copy of the raw
# frame and write it back out tab-separated, without the positional index.
subsection = dfSubsection.copy(deep=True)
subsection.to_csv('subsection_clean.tsv', sep='\t', index=False, encoding='utf-8')

subsection.head(5)

Unnamed: 0,id,title
0,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...
1,A21,BAKING; EDIBLE DOUGHS
2,A22,BUTCHERING; MEAT TREATMENT; PROCESSING POULTRY...
3,A23,"FOODS OR FOODSTUFFS; THEIR TREATMENT, NOT COVE..."
4,A24,TOBACCO; CIGARS; CIGARETTES; SMOKERS' REQUISITES


## location
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [55]:
# Raw location table: tab-separated file whose first row holds the column names.
dataLocation = pd.read_csv(filepath_or_buffer='location2.tsv', header=0, sep="\t")
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfLocation = pd.DataFrame(data=dataLocation)
# Preview the first five rows.
dfLocation.head(5)

Unnamed: 0,id,city,country,latitude,longitude
0,0009wn7out97,Kaiserslautern,DE,49.4123,7.69879
1,000fg4sjgke4,Kiryat Ekron,IL,31.8575,34.8223
2,000wvaottuxr,Oberostendorf,DE,47.9437,10.7433
3,000yomlxfffl,Spruce Pine,US,35.9153,-82.0647
4,000zaps28vbi,Bridge City Westwego,US,29.9179,-90.1663


In [57]:
# Cleaned location table: starting from an independent copy of the raw frame,
# keep only rows whose country is 'US', then drop the now-constant country column.
# The surviving rows keep their original index labels; the index is not written out.
location = (
    dfLocation
    .copy(deep=True)
    .loc[lambda frame: frame['country'] == 'US']
    .drop(columns=['country'])
)
# Persist tab-separated, without the positional index.
location.to_csv('location_clean.tsv', sep='\t', index=False, encoding='utf-8')
location.head(5)

Unnamed: 0,id,city,latitude,longitude
3,000yomlxfffl,Spruce Pine,35.9153,-82.0647
4,000zaps28vbi,Bridge City Westwego,29.9179,-90.1663
5,002ui5ctlmse,Dahlonega,34.5261,-83.9844
7,0043o8y61p3z,Rancho Bernardo,33.0186,-117.06
17,00a4lw6chrti,Levering,45.6358,-84.787


## location_assignee
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [38]:
# Raw location-to-assignee link table: tab-separated file whose first row holds the column names.
dataLocationAssignee = pd.read_csv(filepath_or_buffer='location_assignee.tsv', header=0, sep="\t")
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfLocationAssignee = pd.DataFrame(data=dataLocationAssignee)
# Preview the first five rows.
dfLocationAssignee.head(5)

Unnamed: 0,location_id,assignee_id
0,wy09e6twn0s1,eaa92f175be7bfb71011f17eafb1e71f
1,406gqa22ukdm,e572ad43a89039b0d72acc4ce970a33f
2,lulvs12ykwd7,8ce825a978eebf26ad2c13de6e370bb3
3,tdk0ut5vx9ki,6c00cb129070696ef109f6264da00318
4,rfmxwk4iedfc,dabf354c29a6ebba31f54b9ed042241d


In [53]:
# No columns are removed for this table: take an independent copy of the raw
# frame and write it out comma-separated, without the positional index.
location_assignee = dfLocationAssignee.copy(deep=True)
location_assignee.to_csv('location_assignee_clean.csv', sep=',', index=False, encoding='utf-8')
location_assignee.head(5)

Unnamed: 0,location_id,assignee_id
0,wy09e6twn0s1,eaa92f175be7bfb71011f17eafb1e71f
1,406gqa22ukdm,e572ad43a89039b0d72acc4ce970a33f
2,lulvs12ykwd7,8ce825a978eebf26ad2c13de6e370bb3
3,tdk0ut5vx9ki,6c00cb129070696ef109f6264da00318
4,rfmxwk4iedfc,dabf354c29a6ebba31f54b9ed042241d


## us_term_of_grant
[acquisitions](#acquisitions)
 | [application](#application)
 | [assignee](#assignee) 
 | [cpc_current](#cpc_current)
 | [group](#group)
 | [subgroup](#subgroup)
 | [subsection](#subsection)
 | [location](#location)
 | [location_assignee](#location_assignee)
 | [us_term_of_grant](#us_term_of_grant)

In [29]:
# Raw term-of-grant table: tab-separated file whose first row holds the column names.
dataExpired = pd.read_csv(filepath_or_buffer='us_term_of_grant.tsv', header=0, sep="\t")
# read_csv already returns a DataFrame; the re-wrap only provides the df* name used below.
dfExpired = pd.DataFrame(data=dataExpired)
# Preview the first five rows.
dfExpired.head(5)

Unnamed: 0,uuid,patent_id,lapse_of_patent,disclaimer_date,term_disclaimer,term_grant,term_extension
0,0000hq4800wcm85ghvyz34mbe,D657425,,0000-00-00,,14.0,
1,0002e24jgnd6uspokbvss6xex,D699845,,0000-00-00,,14.0,
2,0002s1xi2xuinyor7g5w5veu6,D525308,,0000-00-00,,14.0,
3,000315mx91ci258k7nwyve9pg,9193114,,0000-00-00,,,408.0
4,0003hd31mgyh3f2x7j8stmuq6,D532925,,0000-00-00,,14.0,


In [54]:
# Cleaned us_term_of_grant table: work on an independent copy of the raw frame
# and drop the row uuid plus the lapse/term columns. In the raw preview only
# patent_id and disclaimer_date remain after this drop.
us_term_of_grant = (
    dfExpired
    .copy(deep=True)
    .drop(columns=[
        'uuid',
        'lapse_of_patent',
        'term_disclaimer',
        'term_grant',
        'term_extension',
    ])
)
# Persist as comma-separated text without the positional index.
us_term_of_grant.to_csv('us_term_of_grant_clean.csv', sep=',', index=False, encoding='utf-8')
us_term_of_grant.head(5)

Unnamed: 0,patent_id,disclaimer_date
0,D657425,0000-00-00
1,D699845,0000-00-00
2,D525308,0000-00-00
3,9193114,0000-00-00
4,D532925,0000-00-00


In [85]:
# Disabled exploration: lists raw rows whose disclaimer_date is a real date
# rather than the '0000-00-00' placeholder. The table saved with this cell is
# the output from when the line was active; re-running the cell as written
# produces no output.
# dfExpired[dfExpired['disclaimer_date']!='0000-00-00'].head(50)

Unnamed: 0,uuid,patent_id,lapse_of_patent,disclaimer_date,term_disclaimer,term_grant,term_extension
164,002aq634gkvojyzmxlz8g0wno,D334422,,2005-03-12,,14.0,
172,002cmto548ezdiejy8i2a28qq,5089490,,2005-07-19,,,
182,002h7kmntnf61xhsn3mcrj4al,4805394,,2002-11-21,,,
249,003gr6ne1h4286un48q7mr0b1,D387548,,2011-11-11,,14.0,
268,003omfcsvfmp4lvqyymew3ai0,4591481,,2001-02-14,,,
313,004dvl0nxz3kwgzs5qe4fazf5,5216281,,2008-08-27,,,
385,005gbradp0hvjubfcfz9tjq3k,4764874,,2005-05-03,,,
514,007gbj8mt28y1lxo5cvjygjy8,5229026,,2008-10-01,,,
606,008rm9t62n6as0bjthe31xgay,D360500,,2009-06-20,,14.0,
674,009s40sos1n7ikkqzfrq7i3zx,5468151,,2011-07-19,,,
