In [1]:
# Script to clean uspatentcitation.tsv

# Feb 7th, 2020
# I am adding a cut for citation date
# it cannot be older than 07-31-1790 (the first patent ever issued)

# Jan 20th, 2020
# There are citation_ids larger than 7 characters and smaller than 4
# Larger are usually applications, smallers tend to be errors
# I am keeping than so calculations on forward citations are accurate
# When matching by citation_id, it must be previously filtered

# as of Jan 9th, 2020, there are entries to be evaluated
# for now, error_bad_lines=False skips those entries


In [2]:
import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler

import pandas as pd
import numpy as np
import re
import datetime

In [3]:
# uspatentcitation.tsv
# uuid:         unique id
# patent_id:    patent number
# citation_id:  identifying number of patent to which select patent cites
# date:         date select patent (patent_id) cites patent (citation_id)
# name:         name of cited record
# kind:         WIPO document kind codes 
#               (http://www.uspto.gov/learning-and-resources/support-centers/electronic-business-center/kind-codes-included-uspto-patent)	2002 and After
# country:      country cited patent was granted (always US)
# category:     who cited the patent (examiner, applicant, other etc) - 2002 and After
# sequence:     order in which this reference is cited by select patent	all


In [4]:
src= r'/home/rkogeyam/PATENT_CITATION/data/uspatentcitation.tsv'
dst= '/home/rkogeyam/PATENT_CITATION/data/cleanuspatentcitation.csv'

In [5]:
df=pd.read_csv(src, sep='\t', error_bad_lines=False)

In [6]:
df=df.astype(str)

In [7]:
df.dtypes

uuid           object
patent_id      object
citation_id    object
date           object
name           object
kind           object
country        object
category       object
sequence       object
dtype: object

In [8]:
%%time
#stripping trailing white spaces - no changes
df['patent_id'] = df['patent_id'].str.strip()
df['citation_id'] = df['citation_id'].str.strip()

CPU times: user 1min 10s, sys: 96 ms, total: 1min 10s
Wall time: 1min 10s


In [9]:
%%time
# Keep this for reference!
# As of Dec 31st, 2019, I compared the clean to the raw version of citation and patent ids

# stripping non-desired characters but keeping the originals for later check - only three changes in citation_id

df['citation_id_raw']=df['citation_id']
df['patent_id_raw']=df['patent_id']

CPU times: user 1.59 s, sys: 6 ms, total: 1.6 s
Wall time: 1.59 s


In [10]:
%%time
cleaning_patent=lambda x:re.sub('([^a-zA-Z0-9]+)', "", x)
df['citation_id']=df['citation_id'].apply(cleaning_patent)

# # #this is taking a lot of time, evaluate alternatives

CPU times: user 3min 54s, sys: 18 ms, total: 3min 54s
Wall time: 3min 54s


In [11]:
%%time
df['patent_id']=df['patent_id'].apply(cleaning_patent)


CPU times: user 3min 57s, sys: 27 ms, total: 3min 57s
Wall time: 3min 57s


In [12]:
df.date.replace({'-00':'-01'}, regex=True, inplace=True)

In [13]:
%%time
df['date']=pd.to_datetime(df['date'], format="%Y-%m-%d", errors='coerce') 
# first_patent = datetime.date(1790, 7, 31)
first_patent = pd.to_datetime('1790-6-31') #I tweaked slightly because there is a citation to the patent n1 that seems correct

CPU times: user 34.6 s, sys: 1.23 s, total: 35.8 s
Wall time: 35.7 s


In [None]:
#as for Aug 11, 2020, two problems arose:
#- some name fields present EOF characters
#- some lines are missing their linebreaks
# this is the correction of these issues
clean_doublespace=lambda x:re.sub('\s\s', '\s', x)
df['name']=df['name'].apply(clean_doublespace)


In [14]:
#citation with dates previous to the inauguration of USPTO office
df[df['date']<first_patent][['patent_id','citation_id','date']]

Unnamed: 0,patent_id,citation_id,date
1277064,6878571,3746934,1773-07-01
1715634,4865278,4161717,1779-07-01
5004878,6194502,1,1790-07-01
6414336,5263595,4629077,1686-12-01
7677827,5341713,3422714,1696-01-01
9335345,5159488,4725898,1688-02-01
10981532,4976377,3855458,1774-12-01
13258508,4586322,4476632,1698-10-01
18297844,D297377,3513500,1790-05-01
18548942,5819410,5174519,1754-05-01


In [15]:
len(df['date']) - df['date'].count()

1769

In [16]:
%%time
#patents with wrong dates should not be dropped
#the best way is to search for the correct date
#i can do in a separated script
#this would improve the results - other citation dates can be bugged

#for now, because there are just a few mistakes, i just exclude the date
#excluding dates from patents with wrong dates
df['date']=df['date'].apply(lambda x: np.nan if x < first_patent else x)

CPU times: user 8min 22s, sys: 6.32 s, total: 8min 29s
Wall time: 8min 28s


In [17]:
len(df['date']) - df['date'].count()

1804

In [18]:
# print(df[df['patent_id']!=df['patent_id_raw']][['patent_id','patent_id_raw']])

In [19]:
# print(df[df['citation_id']!=df['citation_id_raw']][['citation_id', 'citation_id_raw']])

In [20]:
df[df['citation_id'].apply(lambda x: len(x)>7)]

Unnamed: 0,uuid,patent_id,citation_id,date,name,kind,country,category,sequence,citation_id_raw,patent_id_raw
1115,000l397rmcu1p9fxvegt75gd8,6834474,20020090489,2002-07-01,Dobreski et al.,A1,US,cited by other,192,2002/0090489,6834474
1124,000l6y5a355o1xp8kw12zat2q,6854492,20010025668,2001-10-01,Enge,A1,US,cited by other,5,20010025668,6854492
3978,0021zkyqbzv8dp89k0e9i5pvn,D474084,20010013500,2001-08-01,Gilley et al.,A1,US,cited by examiner,26,2001/0013500,D474084
4915,002jwbj8ch96kavoxjhkw8czo,6738820,20020031120,2002-03-01,Rakib,A1,US,cited by examiner,5,2002/0031120,6738820
4990,002ld67k6ty2it7oo6legcqh5,6862552,20020034191,2002-03-01,Shattil,A1,US,cited by examiner,0,20020034191,6862552
5011,002lpkh2rawkxxp7gm2q3ye8u,6679612,20030002180,2003-01-01,Nielsen et al.,A1,US,cited by examiner,11,2003/0002180,6679612
5183,002omicihgubu2nt4x2kpzakb,6614073,20020053696,2002-05-01,Iwamuro et al.,A1,US,cited by examiner,18,2002/0053696,6614073
6217,0037xjdpdapjlidqzlgh9gqop,6622391,20020153479,2002-10-01,Kenjo et al.,A1,US,cited by examiner,29,2002/0153479,6622391
7691,003zfsqsbkkitu3ccrb94glu8,6809520,20020030492,2002-03-01,Guo et al.,A1,US,cited by other,13,2002/0030492,6809520
10455,005dy0m3n9516z5eepmi2crm3,6822677,20010009438,2001-07-01,Kihara et al .,A1,US,cited by examiner,9,2001/0009438,6822677


In [21]:
df[df['citation_id'].apply(lambda x: len(x)<4)]

Unnamed: 0,uuid,patent_id,citation_id,date,name,kind,country,category,sequence,citation_id_raw,patent_id_raw
135629,01yygjyvi1yrylgtc4rqpoc4u,8338788,H12,1986-01-01,Bennett et al.,H,US,cited by other,12,H12,8338788
216087,034z4oev381wzmgq5eh1qfdhu,7005209,H16,1986-01-01,Kaun,H,US,cited by other,5,H16,7005209
268873,03wkd98jd0lodjm1r5kvkev2t,6402703,H93,1986-07-01,Matta et al.,H,US,cited by other,78,H93,6402703
665848,09op18mr8idwopt8bol93xihs,7425075,113,1837-01-01,Austin,A,US,cited by other,0,113,7425075
742228,0assgp0zbwyzvjmmoyvn7ozbm,8732893,997,1838-11-01,Sanford,A,US,cited by applicant,1,997,8732893
787895,0bgpq77n77f6i6u0om5hdcy1t,6687642,H81,1986-07-01,Szabo et al.,H,US,cited by examiner,0,H81,6687642
925814,0dh65l353x1fv37wm7crb7zcq,9498414,H83,1986-07-01,Poletto et al.,H,US,cited by applicant,8,H83,9498414
931821,0dkamzf60pcpz0bhks51azg85,4248123,84,NaT,,,US,,0,84,4248123
1015056,0ert25mg1c16n1op0oqcry7ck,7672342,H66,1986-05-01,White,H,US,cited by examiner,2,H66,7672342
1150418,0gqjrz52f6oaifcjk1xappi3q,7951236,1,1836-07-01,Ruggles,A,US,cited by other,0,1,7951236


In [22]:
df.citation_id.str.len().value_counts()

7     90573339
6       701086
11      122407
5        45182
4         7393
9         2426
8          693
3          615
2           63
12          40
1           29
10          22
15           1
13           1
Name: citation_id, dtype: int64

In [23]:
df.to_csv(dst)