In [1]:
#Miami, December 23rd, 2019
#Renato Kogeyama

# This script generates a citation output with patent | backward citation | cumulative backward citation | year
# The output is a citation record by year, considering both direct and indirect citations


In [2]:
import pandas as pd

import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler


In [3]:
# Absolute path of the tab-separated citation file that is read into `df` below.
fname = r'/home/rkogeyam/PATENT_CITATION/uspatentcitation.tsv'

In [4]:
%%time
# Load the full citation table from the tab-separated file.
# The id and date columns are read explicitly as strings: ids mix digits and
# letters (e.g. 'D760738'), and letting pandas infer the type can produce a
# mixed-type (or purely numeric) column on which the `.str` cleaning applied
# later returns NaN or fails.
df = pd.read_csv(
    fname,
    sep='\t',
    dtype={'patent_id': str, 'citation_id': str, 'date': str},
)

# For a quick test run, load a sample through the project's `sampler` helper
# instead of reading the full file:
# sample_size=1000
# df=sampler(fname, sample_size, sep='\t')

CPU times: user 5min 43s, sys: 12 s, total: 5min 55s
Wall time: 5min 57s


In [5]:
%%time
# Clean both id columns the same way. The untouched value is kept in a
# `<column>_raw` column for later checks, then only the first run of letters
# and digits is kept as the id.
# Caveat from the original author: dropping characters this way could merge
# ids that were meant to be different.
for id_col in ('patent_id', 'citation_id'):
    df[id_col + '_raw'] = df[id_col]
    df[id_col] = df[id_col].str.extract('([a-zA-Z0-9]+)', expand=False)

# Remove leading/trailing white space from the cleaned ids.
for id_col in ('patent_id', 'citation_id'):
    df[id_col] = df[id_col].str.strip()


CPU times: user 4.97 ms, sys: 942 µs, total: 5.91 ms
Wall time: 5.76 ms


In [6]:
%%time
# Prepare the date strings for conversion to datetime: every '-00' is replaced
# by '-01'. Per the original author, changing the day or month does not affect
# the output.
# The result is assigned back to the column. Calling an in-place method on
# `df.date` is chained assignment: it is deprecated in recent pandas and leaves
# `df` unchanged when copy-on-write is enabled.
df['date'] = df['date'].replace({'-00': '-01'}, regex=True)
# Years are deliberately not repaired here (replacing them could be more
# problematic); values that still fail to parse become NaT in the later
# `pd.to_datetime(..., errors='coerce')` call.


CPU times: user 1.09 ms, sys: 3 µs, total: 1.09 ms
Wall time: 1.1 ms


In [7]:
%%time
# Convert the date strings to datetime64. The source data is year-month-day;
# anything that does not match that format is coerced to NaT instead of raising.
df['date'] = pd.to_datetime(
    df['date'],
    format='%Y-%m-%d',
    errors='coerce',
)

CPU times: user 1.71 ms, sys: 0 ns, total: 1.71 ms
Wall time: 50.7 ms


In [8]:
%%time
# Order the rows from the most recent date to the oldest.
# (Original note: the script will take the grant year into account.)
df = df.sort_values(by=['date'], ascending=False)

CPU times: user 457 µs, sys: 2.02 ms, total: 2.47 ms
Wall time: 74 ms


In [9]:
%%time
# First pass: only look at patents that are cited at least once.
# Per the original author, many patents in the dataset have zero citations, so
# restricting to cited ones should reduce processing time.

# For each cited patent (citation_id), count the non-null citing patent_id values.
# The column is selected by name before counting: the previous positional pick
# (`.count().iloc[:, 1]`) depended on the column order and counted every column
# of the frame only to keep one of them.
cited_patents = df.groupby('citation_id')['patent_id'].count()  # Series

CPU times: user 2.08 ms, sys: 1.9 ms, total: 3.98 ms
Wall time: 14.9 ms


In [10]:
%%time
# Drop missing counts; the result is still a Series.
# `Series.dropna` accepts keyword arguments only in pandas >= 2.0, so the former
# positional `0` (the axis) is omitted; axis 0 is the default.
cited_patents = cited_patents.dropna()  # Series

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 22.4 µs


In [11]:
%%time
# Move the citation_id index into a regular column, which turns the Series
# into a DataFrame.
cited_patents = cited_patents.reset_index(drop=False)

CPU times: user 1.59 ms, sys: 0 ns, total: 1.59 ms
Wall time: 1.49 ms


In [12]:
%%time
# Use citation_id as the index of the count table so it can be joined on later.
cited_patents = cited_patents.set_index('citation_id')

CPU times: user 798 µs, sys: 0 ns, total: 798 µs
Wall time: 661 µs


In [13]:
%%time
# Inspect the count table: one row per citation_id, with the count still
# stored under the column name `patent_id` at this point.
cited_patents

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


Unnamed: 0_level_0,patent_id
citation_id,Unnamed: 1_level_1
1224846,1
1267397,1
142286,1
1427891,1
1468566,1
1639363,1
1815268,1
1855455,1
1907506,1
1921568,1


In [14]:
%%time
# Inspect the citation table after id cleaning, date parsing and sorting.
df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


Unnamed: 0,uuid,patent_id,citation_id,date,name,kind,country,category,sequence,patent_id_raw,citation_id_raw
15,0nbtozop029bost3nkl94h825,9623592,9185937,2015-11-01,Liu,B2,US,cited by examiner,3,9623592,9185937
365,diktc4trylu8foni4d59go4e6,9452356,9138639,2015-09-01,Ernst,B1,US,cited by applicant,56,9452356,9138639
70,2f6mnwfxpvlxttsc0xt1a707n,D760738,D726748,2015-04-01,Maekawa,S,US,cited by applicant,49,D760738,D726748
174,6fs4kvuklklb1qpg715ljvir3,D737835,D725132,2015-03-01,Jou,S,US,cited by examiner,64,D737835,D725132
341,coiz7zow26lzmdjr6r132pat0,D711888,D697069,2014-01-01,Tak et al.,S,US,cited by examiner,11,D711888,D697069
402,eeqoeavlnfkfmptex2xn1kybk,9143529,8584114,2013-11-01,Rabinovich et al.,B2,US,cited by examiner,77,9143529,8584114
504,icb80b0jcur6895v4tatvy8s0,9160511,8565082,2013-10-01,Vargantwar et al.,B1,US,cited by examiner,2,9160511,8565082
93,3bl2c4by8hjpxpztjv4w8awit,9042058,8493695,2013-07-01,Kaiser et al.,B1,US,cited by applicant,561,9042058,8493695
86,30wmh1vwoxqx4a2g9iwhjzwmw,9178701,8464354,2013-06-01,Teow et al.,B2,US,cited by applicant,42,9178701,8464354
453,gfx23b2aqyohmpb3bjy5i9mk6,8816402,8453094,2013-05-01,Kornachuk et al.,B2,US,cited by applicant,513,8816402,8453094


In [15]:
%%time
# The counted column still carries the name `patent_id`; give it a name that
# says what it holds.
cited_patents = cited_patents.rename(columns={'patent_id': 'back_citation'})

CPU times: user 378 µs, sys: 13 µs, total: 391 µs
Wall time: 392 µs


In [16]:
%%time
# Attach the per-citation count to every citation row: match `citation_id` in
# `df` against the index of `cited_patents` (inner join, the merge default).
df = df.merge(
    right=cited_patents,
    how='inner',
    left_on='citation_id',
    right_index=True,
)

CPU times: user 4.13 ms, sys: 40 µs, total: 4.17 ms
Wall time: 41.7 ms


In [17]:
%%time
# Check the column names of the count table after the rename.
cited_patents.columns

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.68 µs


Index(['back_citation'], dtype='object')

In [18]:
%%time
# Start the cumulative column as a copy of the direct count.
cited_patents['cumulative'] = cited_patents['back_citation']

CPU times: user 462 µs, sys: 17 µs, total: 479 µs
Wall time: 477 µs


In [19]:
%%time
# Check the dtypes of the count table.
cited_patents.dtypes

CPU times: user 181 µs, sys: 6 µs, total: 187 µs
Wall time: 193 µs


back_citation    int64
cumulative       int64
dtype: object

In [20]:
%%time
# Start the cumulative column of the citation table as a copy of the direct count.
df['cumulative'] = df['back_citation']

CPU times: user 421 µs, sys: 0 ns, total: 421 µs
Wall time: 418 µs


In [21]:
%%time
# Check the dtypes of the citation table after the merge.
df.dtypes

CPU times: user 224 µs, sys: 8 µs, total: 232 µs
Wall time: 238 µs


uuid                       object
patent_id                  object
citation_id                object
date               datetime64[ns]
name                       object
kind                       object
country                    object
category                   object
sequence                    int64
patent_id_raw              object
citation_id_raw            object
back_citation               int64
cumulative                  int64
dtype: object

In [22]:
%%time
# Disabled draft of the cumulative (indirect) citation computation, kept for reference.
# For each cited patent it looks up the citing patents and adds up their `cumulative` values.
# NOTE(review): `=+` below assigns the unary-plus value instead of accumulating;
# `+=` was probably intended - confirm before re-enabling this loop.
# for i in cited_patents.index:
#     citing_patents=df[df.citation_id==i].patent_id  #this is the trick element
#                                                     #for each cited patent i generate a list of citing patents

#     cumulative=0 #initialize the citation cumulated variable for each cited patent

#     for j in citing_patents:
#         cumulative=+df[df.citation_id==j].sum().cumulative #each cited patent contributes to overall cumulative score
# #         print(cumulative)
        
# #     print(cited_patents.loc[cited_patents['citation_id']==i,'cumulative'])
#     cited_patents.at[i,'cumulative']=+cumulative
    

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


In [23]:
%%time
# Number of distinct cited patents (rows of the count table).
cited_patents.shape[0]

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 10 µs


591

In [24]:
%%time
# Summary statistics of the direct and cumulative counts.
cited_patents.describe()

CPU times: user 6.17 ms, sys: 1.01 ms, total: 7.18 ms
Wall time: 7.76 ms


Unnamed: 0,back_citation,cumulative
count,591.0,591.0
mean,1.0,1.0
std,0.0,0.0
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,1.0,1.0
