In [1]:
# Script to clean uspatentcitation.tsv

# Jan 20th, 2020
# There are citation_ids larger than 7 characters and smaller than 4
# Larger are usually applications, smallers tend to be errors
# I am keeping than so calculations on forward citations are accurate
# When matching by citation_id, it must be previously filtered

# as of Jan 9th, 2020, there are entries to be evaluated
# for now, error_bad_lines=False skips those entries


In [2]:
import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler

import pandas as pd
import numpy as np
import re

In [3]:
# uspatentcitation.tsv
# uuid:         unique id
# patent_id:    patent number
# citation_id:  identifying number of patent to which select patent cites
# date:         date select patent (patent_id) cites patent (citation_id)
# name:         name of cited record
# kind:         WIPO document kind codes 
#               (http://www.uspto.gov/learning-and-resources/support-centers/electronic-business-center/kind-codes-included-uspto-patent)	2002 and After
# country:      country cited patent was granted (always US)
# category:     who cited the patent (examiner, applicant, other etc) - 2002 and After
# sequence:     order in which this reference is cited by select patent	all


In [4]:
src= r'/home/rkogeyam/PATENT_CITATION/data/uspatentcitation.tsv'
dst= '/home/rkogeyam/PATENT_CITATION/data/cleanuspatentcitation.csv'

In [5]:
df=pd.read_csv(src, sep='\t', error_bad_lines=False)

In [6]:
df=df.astype(str)

In [7]:
df.dtypes

uuid           object
patent_id      object
citation_id    object
date           object
name           object
kind           object
country        object
category       object
sequence       object
dtype: object

In [8]:
%%time
# Keep this for reference!
# As of Dec 31st, 2019, I compared the clean to the raw version of citation and patent ids

# stripping non-desired characters but keeping the originals for later check - only three changes in citation_id

df['citation_id_raw']=df['citation_id']
df['patent_id_raw']=df['patent_id']

cleaning_patent=lambda x:re.sub('([^a-zA-Z0-9]+)', "", x)
df['citation_id']=df['citation_id'].apply(cleaning_patent)
df['patent_id']=df['patent_id'].apply(cleaning_patent)

# # #stripping trailing white spaces - no changes
# # df['patent_id'] = df['patent_id'].str.strip()
# # df['citation_id'] = df['citation_id'].str.strip()

# # #this is taking a lot of time, evaluate alternatives



CPU times: user 8min 45s, sys: 257 ms, total: 8min 46s
Wall time: 8min 46s


In [9]:
print(df[df['patent_id']!=df['patent_id_raw']][['patent_id','patent_id_raw']])

         patent_id patent_id_raw
595502     H001852      H001852&
624790     H001821      H001821&
681435     H001947      H001947&
1610007    H001947      H001947&
1629796    H001883      H001883&
3062168    H001981      H001981&
3971041    H001947      H001947&
4458085    H001852      H001852&
4525149    H002001      H002001&
5071521    H001995      H001995&
5275761    H001852      H001852&
5322903    H001933      H001933&
5497615    H001947      H001947&
7709849    H001821      H001821&
9119648    H002001      H002001&
9465686    H001852      H001852&
10545440   H002001      H002001&
11598389   H001852      H001852&
11633905   H001852      H001852&
15333293   H001947      H001947&
16662434   H002001      H002001&
18387904   H001902      H001902&
18576053   H001947      H001947&
19283469   H001852      H001852&
19487067   H001947      H001947&
19544571   H001852      H001852&
20034969   H001835      H001835&
21713987   H002001      H002001&
22057645   H001883      H001883&
22789344  

In [10]:
print(df[df['citation_id']!=df['citation_id_raw']][['citation_id', 'citation_id_raw']])

          citation_id       citation_id_raw
30            3594826  3594826             
42            5575539  5575539             
50            5393739  5393739             
123           4633788  4633788             
129           4956694  4956694             
190           5078456  5078456             
205           5154116  5154116             
226           4012211  4012211             
510           4809159  4809159             
550           4509739  4509739             
714           4829570  4829570             
747           5087325  5087325             
794           2201196  2201196             
874           5379172  5379172             
876           5022695  5022695             
888           5216649  5216649             
934           5287384  5287384             
968           4882406  4882406             
1049          4874376  4874376             
1113          5005928  5005928             
1115      20020090489          2002/0090489
1133          5327914  5327914  

In [11]:
df[
df['citation_id'].apply(lambda x: len(x)>7)
  ]

Unnamed: 0,uuid,patent_id,citation_id,date,name,kind,country,category,sequence,citation_id_raw,patent_id_raw
1115,000l397rmcu1p9fxvegt75gd8,6834474,20020090489,2002-07-01,Dobreski et al.,A1,US,cited by other,192,2002/0090489,6834474
1124,000l6y5a355o1xp8kw12zat2q,6854492,20010025668,2001-10-01,Enge,A1,US,cited by other,5,20010025668,6854492
3978,0021zkyqbzv8dp89k0e9i5pvn,D474084,20010013500,2001-08-01,Gilley et al.,A1,US,cited by examiner,26,2001/0013500,D474084
4915,002jwbj8ch96kavoxjhkw8czo,6738820,20020031120,2002-03-01,Rakib,A1,US,cited by examiner,5,2002/0031120,6738820
4990,002ld67k6ty2it7oo6legcqh5,6862552,20020034191,2002-03-01,Shattil,A1,US,cited by examiner,0,20020034191,6862552
5011,002lpkh2rawkxxp7gm2q3ye8u,6679612,20030002180,2003-01-01,Nielsen et al.,A1,US,cited by examiner,11,2003/0002180,6679612
5183,002omicihgubu2nt4x2kpzakb,6614073,20020053696,2002-05-01,Iwamuro et al.,A1,US,cited by examiner,18,2002/0053696,6614073
6217,0037xjdpdapjlidqzlgh9gqop,6622391,20020153479,2002-10-01,Kenjo et al.,A1,US,cited by examiner,29,2002/0153479,6622391
7691,003zfsqsbkkitu3ccrb94glu8,6809520,20020030492,2002-03-01,Guo et al.,A1,US,cited by other,13,2002/0030492,6809520
10455,005dy0m3n9516z5eepmi2crm3,6822677,20010009438,2001-07-01,Kihara et al .,A1,US,cited by examiner,9,2001/0009438,6822677


In [12]:
df[
df['citation_id'].apply(lambda x: len(x)<4)
  ]

Unnamed: 0,uuid,patent_id,citation_id,date,name,kind,country,category,sequence,citation_id_raw,patent_id_raw
135629,01yygjyvi1yrylgtc4rqpoc4u,8338788,H12,1986-01-01,Bennett et al.,H,US,cited by other,12,H12,8338788
216087,034z4oev381wzmgq5eh1qfdhu,7005209,H16,1986-01-01,Kaun,H,US,cited by other,5,H16,7005209
268873,03wkd98jd0lodjm1r5kvkev2t,6402703,H93,1986-07-01,Matta et al.,H,US,cited by other,78,H93,6402703
665848,09op18mr8idwopt8bol93xihs,7425075,113,1837-01-01,Austin,A,US,cited by other,0,113,7425075
742228,0assgp0zbwyzvjmmoyvn7ozbm,8732893,997,1838-11-01,Sanford,A,US,cited by applicant,1,997,8732893
787895,0bgpq77n77f6i6u0om5hdcy1t,6687642,H81,1986-07-01,Szabo et al.,H,US,cited by examiner,0,H81,6687642
925814,0dh65l353x1fv37wm7crb7zcq,9498414,H83,1986-07-01,Poletto et al.,H,US,cited by applicant,8,H83,9498414
931821,0dkamzf60pcpz0bhks51azg85,4248123,84,0000-00-00,,,US,,0,84,4248123
1015056,0ert25mg1c16n1op0oqcry7ck,7672342,H66,1986-05-01,White,H,US,cited by examiner,2,H66,7672342
1150418,0gqjrz52f6oaifcjk1xappi3q,7951236,1,1836-07-01,Ruggles,A,US,cited by other,0,1,7951236


In [13]:
# df.to_csv(dst)