In [1]:
# Script to calculate originality

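# Generality: how diverse is the impact of a patent
# This is done by calculating the Herfindahl index over the classes of the citing patents
# Originality (this script) mirrors it: the index is computed over the classes of the patents cited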

In [2]:
# Jan 22nd, 2020
# Copied and pasted from generality
# Joins generality and generality_2
# mirrors the script to reflect originality
# the only difference should be to replace 'citation_id' with 'patent_id' in some fields
# total citations tend to be much smaller


# Jan 16th, 2020
# Due to performance problems in the HPC, this script was divided in two, the script following this is generality_2
# generality > 1 is not an error, but a consequence of adopting WIPO
# it seems that the original calculation had only one class per patent
# WIPO provides multiple classes - so when you divide by the total number of citations, 
#  you do not have the proportion of classes cited anymore
# to correct this issue, I can calculate Generality and Originality based only on the first WIPO class

# Jan 13th, 2020
# Script is running but there are two major issues
# - there should not exist generality > 1 , so there is an error in calculation
# - too many NANs (about 400k), but I will tackle this issue in 'too_many_nans.ipynb'

# to tackle the first problem, I'll begin by creating a subset of the database
# to do that, I'll use USPTO classification system

In [3]:
# Trying to save memory is leading to a small nightmare 
# I am postponing the use of dask modules

In [4]:
import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler

import pandas as pd
import numpy as np
import re

import dask.dataframe as dd

In [5]:
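# wipo_horiz.csv
# from wipo_horiz.ipynb
# Original WIPO classification reshaped
# patent_id:    patent number
# field_id_n :  'n' is the WIPO class
# NOTE: this notebook loads wipo_first_class.csv instead (columns patent_id, field_id),
#       which keeps a single WIPO class per patent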

# uspatentcitation.tsv
# uuid:         unique id
# patent_id:    patent number
# citation_id:  identifying number of patent to which select patent cites
# date:         date select patent (patent_id) cites patent (citation_id)
# name:         name of cited record
# kind:         WIPO document kind codes 
#               (http://www.uspto.gov/learning-and-resources/support-centers/electronic-business-center/kind-codes-included-uspto-patent) - 2002 and After
# country:      country cited patent was granted (always US)
# category:     who cited the patent (examiner, applicant, other etc) - 2002 and After
# sequence:     order in which this reference is cited by select patent - all years


In [6]:
# Every input and output of this notebook lives in one data directory;
# point DATA_DIR somewhere else to run against another copy of the data.
DATA_DIR = '/home/rkogeyam/PATENT_CITATION/data/'

# Inputs: cleaned citation pairs and the WIPO class table.
citation = DATA_DIR + 'cleanuspatentcitation.csv'
wipo = DATA_DIR + 'wipo_first_class.csv' #avoid multiple classes

# Output: the citation-level table written at the end of this notebook.
dst = DATA_DIR + 'originality_temp.csv'

In [7]:
%%time
# WIPO class table: only the patent number and its class are needed.
# Both columns are read as object so the ids are not parsed as numbers.
wipo_df = pd.read_csv(
    wipo,
    usecols=['patent_id', 'field_id'],
    dtype={'patent_id': object, 'field_id': object},
)

# Citation pairs: patent_id cites citation_id. Ids are kept as object as well,
# so they can be matched against the WIPO table's patent_id.
df = pd.read_csv(
    citation,
    usecols=['patent_id', 'citation_id'],
    dtype={'patent_id': object, 'citation_id': object},
)

# sample_size=100
# wipo_df=sampler(wipo, sample_size)
# citation_df=sampler(citation, sample_size)

CPU times: user 2min 15s, sys: 7.79 s, total: 2min 23s
Wall time: 2min 23s


In [8]:
%%time
#citation level dataset
#join is faster than merge

# Rows with a missing id are dropped before the cast: astype(str) turns NaN into
# the literal string 'nan', which would then behave like a real patent number
# (a join key on either side, or a bogus citing patent 'nan' in the result).
wipo_df = (
    wipo_df
    .dropna(subset=['patent_id'])
    .astype({'patent_id': str})   # field_id keeps its dtype
    .set_index('patent_id')
)

# Rebind step by step so each intermediate copy of the large citation frame
# can be released before the next one is built.
df = df.dropna(subset=['patent_id', 'citation_id'])
df = df.astype(str)
df = df.set_index('citation_id')
#different from generality - we want to know the class of cites made

# Inner join on the index: each citation row picks up the WIPO class of the
# patent it cites (citation_id matched against the WIPO table's patent_id).
df = df.join(wipo_df, how='inner')

# After the join the index holds citation_id (the cited patent), patent_id is the
# citing patent and field_id is the class of the cited patent.
# Citations whose cited patent has no WIPO class are dropped by the inner join,
# so when grouping by patent_id later on a citing patent may have fewer rows than
# citations made, or none at all - presumably the source of NaNs once the result
# is merged back onto the full patent list (TODO confirm).

CPU times: user 3min 14s, sys: 3.93 s, total: 3min 18s
Wall time: 3min 18s


In [9]:
# Sanity check on the joined frame: number of rows, column dtypes and memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 73342208 entries, 10001951 to 9999473
Data columns (total 2 columns):
patent_id    object
field_id     object
dtypes: object(2)
memory usage: 1.6+ GB


In [10]:
# For object columns describe() reports the row count, the number of distinct
# values, the most frequent value (top) and how often it occurs (freq).
df.describe()

Unnamed: 0,patent_id,field_id
count,73342208,73342208
unique,5609709,35
top,8892495,13
freq,5768,10554877


In [11]:
# Peek at five randomly drawn rows; no random_state is passed, so the rows
# shown differ from run to run.
df.sample(n=5)

Unnamed: 0,patent_id,field_id
6382923,9649418,27
4963468,6194222,11
7095204,8729840,1
5991415,8861718,4
6506215,8147557,13


In [12]:
# Persist the joined citation-level table. The index (the cited patent id the
# frame was joined on) is written as the first CSV column.
# NOTE(review): the index may be unnamed after the join, in which case that
# column gets an empty header - confirm before relying on it downstream.
df.to_csv(dst)

In [13]:
# #from generality_2

# # group on patent_id - thus adding all patents cited by one citing patent (patent_id)
# # thus this is originality (generality_2 groups on citation_id, the receiver, instead)
# # it's the diversity of the sources of a given patent

# # the index is equivalent to 1 - Herfindahl
# # this way, the closer to 1, the more original

# # the denominator is total citations squared
# # should be easy to check with citation values
# total_citation=df.groupby('patent_id').count().iloc[:,0] #different from generality
# total_citation=np.square(total_citation)

# # the numerator is the sum of the squares of citations in each class
# df=df.groupby('patent_id').sum().fillna(0) #in theory, I would not need fillna, check later
# df=np.square(df) #element-wise squaring
# df=df.sum(axis='columns') #sum all columns, per row

# df=pd.concat([df, total_citation], axis=1)
# df.columns=['df_squared', 'total_citation']
# #herfindal 

# df['herfindal']=df['df_squared']/df['total_citation'] #its a measure of concentration

# df['output']=1-df['herfindal'] # as defined in Hall et al, 2001

# df.to_csv(dst)