In [1]:
# Script to evaluate citation delay
# Backward citation - citation made by a patent
# Forward citation - citation received by a patent

# Renato Kogeyama

# Jan 17 2020
# Join cit_delay with var_builder
# The only thing var_builder was doing was including kind and type 


# Jan 03 2020
# Miami
# I am using this script to calculate the average delay in citation - to follow Hall et al, 2001
# patent.csv has the following columns
# id 	type 	number 	country 	date 	abstract 	title 	kind 	num_claims 	filename
# interest on id, type, date, kind, num_claims

# I use two sources, uspatentcitation.tsv and patent.csv
# The first is a citation-level dataset with information about the citing patent
# The second is a patent-level dataset with information about the patent

# Cleaning
# I tested in other scripts the quality of the patent identifier
# It does not require cleaning - only 4 errors in 6 million patents
# The cleaning script is there anyway

# Merging
# I merge on the citation level (df)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

In [4]:
# --- Input / output locations ---------------------------------------------
# NOTE: despite its name, `citation_df` holds a file path (a string), not a DataFrame.
# Citation-level input (presumably the cleaned uspatentcitation file - confirm).
citation_df = '/home/rkogeyam/PATENT_CITATION/data/cleanuspatentcitation.csv'

# Patent-level input, relative to the working directory.
patent = 'data/patent.csv'

# Destination of the merged table written by the last cell.
dst = 'data/var_builder.csv'

# Alternative locations, currently disabled:
# cit_tree = '/home/rkogeyam/PATENT_CITATION/data/cit_tree.csv'
# citation_df = r'/home/rkogeyam/PATENT_CITATION/data/uspatentcitation.tsv'
# dst = '/home/rkogeyam/PATENT_CITATION/data/back_cit_delay.csv'
# dst_forw = '/home/rkogeyam/PATENT_CITATION/data/forw_cit_delay.csv'

In [5]:
%%time
# Load the citation-level table from the path stored in `citation_df`.
# Only the six columns listed are kept, and all of them are read as strings
# (dtype=object); the date column is parsed in a later cell.
df = pd.read_csv(
    citation_df,
    dtype=object,
    usecols=['patent_id', 'citation_id', 'date', 'kind', 'category', 'sequence'],
)

CPU times: user 4min 5s, sys: 19.4 s, total: 4min 25s
Wall time: 4min 25s


In [6]:
# Load the patent-level table (tab-separated) from the path stored in `patent`.
# Only id/type/date/kind/num_claims are kept, everything is read as strings
# (dtype=object), and the first selected column (`id`) becomes the index so the
# citation table can later be merged against it.
patent_read_kwargs = dict(
    sep='\t',
    usecols=['id', 'type', 'date', 'kind', 'num_claims'],
    index_col=0,
    dtype=object,
)
try:
    # pandas >= 1.3: `error_bad_lines` is deprecated (and removed in 2.0).
    # 'warn' reproduces the old behaviour of error_bad_lines=False with the
    # default warn_bad_lines=True: malformed rows are reported and skipped.
    pt_df = pd.read_csv(patent, on_bad_lines='warn', **patent_read_kwargs)
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy flag.
    pt_df = pd.read_csv(patent, error_bad_lines=False, **patent_read_kwargs)

In [7]:
# df=df.iloc[:,1:]

In [8]:
# Preview the first rows of the citation table as loaded (all columns are still strings).
df.head()
# Disabled alternative, presumably for when the file is read in chunks - confirm:
# df.get_chunk().head()

Unnamed: 0,patent_id,citation_id,date,kind,category,sequence
0,9009250,8127342,2012-02-01,B2,cited by applicant,622
1,9643605,5471515,1995-11-01,A,cited by applicant,413
2,5354551,4875247,1989-10-01,,,11
3,D786922,D718330,2014-11-01,S,cited by examiner,11
4,D490798,D190749,1961-06-01,S,cited by examiner,1


In [9]:
# Preview the first rows of the patent table (indexed by patent id).
pt_df.head()
# Disabled alternative, presumably for when the file is read in chunks - confirm:
# pt_df.get_chunk().head()

Unnamed: 0_level_0,type,date,kind,num_claims
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3930271,utility,1976-01-06,A,4
3930272,utility,1976-01-06,A,3
3930273,utility,1976-01-06,A,24
3930274,utility,1976-01-06,A,7
3930275,utility,1976-01-06,A,9


In [10]:
# The citation table's `date` column is renamed to `citation_date` so it does
# not collide with the `date` column that the patent table brings in at the merge.
df = df.rename(columns={'date': 'citation_date'})

# Parse the strings as YYYY-MM-DD dates; anything that fails to parse becomes NaT
# (errors='coerce') instead of raising.
df['citation_date'] = pd.to_datetime(
    df['citation_date'],
    format="%Y-%m-%d",
    errors='coerce',
)

In [11]:
# Attach patent-level attributes to every citation row, matching the citing
# patent (`patent_id`) against the index of pt_df (the patent id).
# - Left join: every citation row is kept; rows whose patent_id is not found
#   in pt_df get NaN in the patent columns.
# - Because the merge is driven by the citation table, patents that make no
#   citations do not appear in the result.
# - Both tables carry a `kind` column, so pandas suffixes them: `kind_x` comes
#   from the citation table and `kind_y` from the patent table.
# Possible later cleanup: make patent_id the index and use join instead of merge.
df = df.merge(
    pt_df,
    how='left',
    left_on='patent_id',
    right_index=True,
)

In [12]:
# After the merge, `date` is the column that came from the patent table;
# rename it to `patent_date` and turn it into a real datetime so that date
# arithmetic is possible.
df = df.rename(columns={'date': 'patent_date'})

# String -> date conversion; values that do not match YYYY-MM-DD become NaT.
df['patent_date'] = pd.to_datetime(
    df['patent_date'],
    format="%Y-%m-%d",
    errors='coerce',
)

In [13]:
# df.shape

In [14]:
# If I do not drop NaNs, the script raises an error later when converting the day interval into years.
# I could substitute the average instead of dropping; this way I would not lose the citation info.
# However, averaging will not always be possible - for example, when there is only one citation.
# For this reason, at this point, I'll keep the NaNs and work around the issues as they arise.

# df=df.dropna()

In [15]:
# df.shape

In [16]:
# Citation delay = patent_date minus citation_date, computed row by row.
# The result is a timedelta; it is NaT wherever either date is missing.
df['cit_delay'] = df['patent_date'] - df['citation_date']

In [17]:
# Make sure cit_delay is stored as a timedelta (a duration, not a date).
# NOTE(review): cit_delay is the difference of two datetime columns, so it
# should already be a timedelta and this conversion looks like a no-op -
# confirm before removing.
df['cit_delay']=pd.to_timedelta(df['cit_delay'])

In [18]:
# Preview the merged table, now including the patent columns and cit_delay.
df.head()

Unnamed: 0,patent_id,citation_id,citation_date,kind_x,category,sequence,type,patent_date,kind_y,num_claims,cit_delay
0,9009250,8127342,2012-02-01,B2,cited by applicant,622,utility,2015-04-14,B2,27,1168 days
1,9643605,5471515,1995-11-01,A,cited by applicant,413,utility,2017-05-09,B2,28,7860 days
2,5354551,4875247,1989-10-01,,,11,utility,1994-10-11,A,6,1836 days
3,D786922,D718330,2014-11-01,S,cited by examiner,11,design,2017-05-16,S1,1,927 days
4,D490798,D190749,1961-06-01,S,cited by examiner,1,design,2004-06-01,S1,1,15706 days


In [19]:
# Summary statistics for every column of the merged table; useful for spotting
# missing values and implausible dates or delays.
df.describe()

Unnamed: 0,patent_id,citation_id,citation_date,kind_x,category,sequence,type,patent_date,kind_y,num_claims,cit_delay
count,91453297.0,91453274.0,89317104,69801447,69691945,91453297.0,91178541,91178552,91178519,91178520.0,89042363
unique,6188752.0,7051646.0,2695,23,4,5840.0,6,2218,19,390.0,
top,8892495.0,4683202.0,1999-11-01 00:00:00,A,cited by other,0.0,utility,2017-09-12 00:00:00,B2,1.0,
freq,5811.0,3922.0,548047,37517961,25324075,6161007.0,82560618,154315,53614935,9090531.0,
first,,,1686-12-01 00:00:00,,,,,1976-01-06 00:00:00,,,
last,,,2198-03-01 00:00:00,,,,,2017-12-26 00:00:00,,,
mean,,,,,,,,,,,5589 days 05:25:04.672624
std,,,,,,,,,,,5683 days 05:08:48.181496
min,,,,,,,,,,,-105998 days +00:25:26.290448
25%,,,,,,,,,,,2151 days 00:00:00


In [20]:
# convert to interval in years
# df['cit_delay']=df['cit_delay'].dt.days/360

In [21]:
# Write the full merged table to the path stored in `dst`.
# With the default arguments the row index is written as the first column.
df.to_csv(dst)