In [1]:
# Script to clean uspatentcitation.tsv

# Jan 20th, 2020
# There are citation_ids larger than 7 characters and smaller than 4
# Larger are usually applications, smallers tend to be errors
# I am keeping than so calculations on forward citations are accurate
# When matching by citation_id, it must be previously filtered

# as of Jan 9th, 2020, there are entries to be evaluated
# for now, error_bad_lines=False skips those entries


In [2]:
import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler

import pandas as pd
import numpy as np
import re

In [3]:
# patent.csv
# id:       patent this record corresponds to 
# type:     category of patent. Usually "Design", "reissue", etc.
# number:   patent number
# country:  country in which patent was granted (always US)
# date:     date when patent was granted
# abstract: abstract text of patent
# title:    title of patent
# kind:     WIPO document kind codes (http://www.uspto.gov/learning-and-resources/support-centers/electronic-business-center/kind-codes-included-uspto-patent)
# num_claims:number of claims
# filename: name of the raw data file where patent information is parsed from

In [None]:
%matplotlib inline

In [4]:
src= 'data/patent.tsv'
dst= 'data/cleanpatent.csv'

In [5]:
cols=['id', 'num_claims', 'date', 'type', 'kind']

In [6]:
df = pd.read_csv(src, sep='\t', usecols=cols, error_bad_lines=False, dtype=object)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6488267 entries, 0 to 6488266
Data columns (total 5 columns):
id            object
type          object
date          object
kind          object
num_claims    object
dtypes: object(5)
memory usage: 247.5+ MB


In [8]:
df=df.astype(object)

In [9]:
df.dtypes

id            object
type          object
date          object
kind          object
num_claims    object
dtype: object

In [10]:
%%time
# Keep this for reference!
# As of Dec 31st, 2019, I compared the clean to the raw version of citation and patent ids

# stripping non-desired characters but keeping the originals for later check - only three changes in citation_id

cleaning_patent=lambda x:re.sub('([^a-zA-Z0-9]+)', "", x)
df['id']=df['id'].apply(cleaning_patent)

CPU times: user 16.6 s, sys: 3 ms, total: 16.6 s
Wall time: 16.6 s


In [11]:
df.date.replace({'-00':'-01'}, regex=True, inplace=True)
#ideally, I would control the modification here

In [12]:
df.id.str.len().value_counts()

7      6480301
6         5995
8         1966
433          1
136          1
56           1
52           1
47           1
Name: id, dtype: int64

In [13]:
df[df['id'].apply(lambda x: len(x)>13)]

Unnamed: 0,id,type,date,kind,num_claims
2640192,eceptionunitsSequentiallyduringdatatransmission,,,,
2640193,eceptionwhetherornotatransmissionsignalhasbeen...,,,,
2640194,eceptionunitreceivingsaidsignalisusedfordatatr...,,,,
2640195,eceptionviaopticalcommunicationswithanexternal...,Device provided with an optical communications...,pg030624.zip,,
3724933,nendofmessageindicatorandusesthevarioustestres...,Unsolicited message intercepting communication...,ipg100302.xml,,


In [14]:
# drop five rows with error
df=df[df['id'].apply(lambda x: len(x)<13)]

In [15]:
df['num_claims']=pd.to_numeric(df['num_claims'], errors='coerce')

###### df['kind']=df['kind'].astype(str)

In [16]:
df[df['kind'].apply(lambda x: len(str(x))>13)]

Unnamed: 0,id,type,date,kind,num_claims
4232499,8182619,utility,2012-05-22,Generation of test cases with range constraint...,
4267896,8225415,utility,2012-07-17,"Method of storing GaN substrate, stored substr...",
4327678,8288508,utility,2012-10-16,System and method of epsilon removal of weight...,
4368026,8331281,utility,2012-12-11,Computer-implemented method of design of surfa...,
4377088,8341296,utility,2012-12-25,Methods of arranging mask patterns and associa...,
4426413,8391122,utility,2013-03-05,System and method for real-time cloth simulation,


In [17]:
df=df[df['kind'].apply(lambda x: len(str(x))<13)]

In [18]:
df.groupby('kind').count()

array(['A', 'E', 'S', 'I5', 'P', 'B1', 'B2', nan, 'S1', 'H', 'H1', 'H2',
       'P2', 'P3', 'E1', 'I4'], dtype=object)

In [19]:
df.groupby('type').count()

array(['utility', 'reissue', 'design', 'TVPP', 'plant',
       'statutory invention registration', nan, 'defensive publication'], dtype=object)

In [20]:
df.describe(include=['np.object'])

  interpolation=interpolation)


Unnamed: 0,num_claims
count,6488254.0
mean,14.14322
std,12.00423
min,0.0
25%,
50%,
75%,
max,887.0


In [21]:
df['num_claims'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f5112165518>

In [22]:
df.dtypes

id             object
type           object
date           object
kind           object
num_claims    float64
dtype: object

In [23]:
df.set_index('id').to_csv(dst)