In [1]:
# Script to evaluate citation delay
# Backward citation - citation made by a patent
# Forward citation - citation received by a patent

# Renato Kogeyama

# Jan 03 2019
# Miami
# I am using this script to calculate the average delay in citation - to follow Hall et al, 2001

# I use two sources, uspatentcitation.tsv and patent.csv
# The first is a citation-level dataset with information about the citing patent
# The second is a patent-level dataset with information about the patent

# Cleaning
# I tested in other scripts the quality of the patent identifier
# It does not require cleaning - only 4 erros from 6 million patents
# The cleaning script is there anyway

# Merging
# I merge on the citation level (df)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

In [4]:
fname = r'/home/rkogeyam/PATENT_CITATION/data/uspatentcitation.tsv'
patent= 'data/patent.csv'

dst = '/home/rkogeyam/PATENT_CITATION/data/back_cit_delay.csv'
# dst_forw = '/home/rkogeyam/PATENT_CITATION/data/forw_cit_delay.csv'

In [5]:
%%time
df = pd.read_csv(fname, sep='\t', lineterminator='\n', na_values='-')
pt_df = pd.read_csv(patent, sep='\t', na_values='-', usecols=[0,4], error_bad_lines=False, index_col=0, dtype={0: object})

CPU times: user 4min 57s, sys: 19.7 s, total: 5min 16s
Wall time: 5min 17s


In [6]:
df=df.iloc[:,1:]

In [7]:
df.head()

Unnamed: 0,patent_id,citation_id,date,name,kind,country,category,sequence
0,9009250,8127342,2012-02-01,Boynton et al.,B2,US,cited by applicant,622
1,9643605,5471515,1995-11-01,Fossum et al.,A,US,cited by applicant,413
2,5354551,4875247,1989-10-01,Berg,,US,,11
3,D786922,D718330,2014-11-01,Shin,S,US,cited by examiner,11
4,D490798,D190749,1961-06-01,Matri et al.,S,US,cited by examiner,1


In [8]:
pt_df.head()

Unnamed: 0_level_0,date
id,Unnamed: 1_level_1
3930271,1976-01-06
3930272,1976-01-06
3930273,1976-01-06
3930274,1976-01-06
3930275,1976-01-06


In [9]:
# #stripping non desired characters
# df['patent_id'] = df['patent_id'].str.extract('([a-zA-Z0-9]+)', expand=False)
# df['citation_id'] = df['citation_id'].str.extract('([a-zA-Z0-9]+)', expand=False)

In [10]:
df['patent_id'] = df['patent_id'].str.strip()
df['citation_id'] = df['citation_id'].str.strip()

In [11]:
#string to date format

df=df.rename(columns = {'date':'citation_date'})
df['citation_date']=pd.to_datetime(df['citation_date'], format="%Y-%m-%d", errors='coerce') 

# df['cit_year']=df['citation_date'].dt.year
# df['cit_month']=df['citation_date'].dt.month
# df['cit_day']=df['citation_date'].dt.day

In [12]:
# merge
df=pd.merge(df, pt_df, how='left', left_on='patent_id', right_index=True)

In [13]:
# check citations without patents
# df[.isna().sum()

In [14]:
# date format to allow calculations
df=df.rename(columns = {'date':'patent_date'})
df['patent_date']=pd.to_datetime(df['patent_date'], format="%Y-%m-%d", errors='coerce') #conversao de string para data

In [15]:
df.shape

(91453297, 9)

In [16]:
%%time
df=df.dropna()

CPU times: user 42 s, sys: 1.97 s, total: 44 s
Wall time: 44 s


In [17]:
df.shape

(69415348, 9)

In [18]:
# delay is the time interval between grant and citation
df['cit_delay']=df['patent_date'].sub(df['citation_date'], axis=0)

In [19]:
# convert to date format
df['cit_delay']=pd.to_timedelta(df['cit_delay'])

In [20]:
df.describe()

Unnamed: 0,sequence,cit_delay
count,69415350.0,69415348
mean,84.15553,5587 days 03:16:23.739630
std,198.3438,5448 days 12:30:49.244062
min,0.0,28 days 00:00:00
25%,5.0,2333 days 00:00:00
50%,19.0,4201 days 00:00:00
75%,74.0,6772 days 00:00:00
max,5840.0,85037 days 00:00:00


In [21]:
# convert to interval in years
df['cit_delay']=df['cit_delay'].dt.days/360

In [22]:
# output: patent-level data
# if group by citation_id, the delay will give the average delay gives how much time it took,
#    in average, for a patent to receive citations
# it is a measure of time to become influent

back_delay=df.groupby('citation_id').cit_delay.agg(['mean', 'count'])
back_delay.rename(columns={'count': 'back_citation', 'mean':'mean_back_delay'}, inplace=True)
back_delay.head()

Unnamed: 0_level_0,mean_back_delay,back_citation
citation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4,172.125,1
6,170.705556,1
169,174.619444,3
280,2.969444,1
491,166.888889,1


In [23]:
# back_delay.to_csv(dst)

In [24]:
# if you group by patent_id, the delay will give a measure of how far back the patent is rooted.

forw_delay=df.groupby('patent_id').cit_delay.agg(['mean', 'count'])
forw_delay.rename(columns={'count': 'forw_citation', 'mean':'mean_forw_delay'}, inplace=True)
forw_delay.head()

Unnamed: 0_level_0,mean_forw_delay,forw_citation
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6334220,24.786782,29
6334221,12.896368,26
6334222,30.01088,12
6334223,26.281019,6
6334224,13.412222,15


In [25]:
output=back_delay.merge(forw_delay, left_index=True, right_index=True, how='outer')

In [26]:
output.to_csv(dst)