In [1]:
# Script to calculate generality and application data
# Generality: how diverse the impact of a patent is
# This is done by calculating the Herfindahl index of citing patents

In [2]:
# Jan 16th, 2020
# Due to performance problems in the HPC, this script was split in two; the second part is generality_2
# generality > 1 is not an error, but a consequence of adopting WIPO
# it seems that the original calculation had only one class per patent
# WIPO provides multiple classes - so when you divide by the total number of citations, 
#  you do not have the proportion of classes cited anymore
# to correct this issue, I can calculate Generality and Originality based only on the first WIPO class

# Jan 13th, 2020
# Script is running but there are two major issues
# - generality > 1 should not exist, so there is an error in the calculation
# - too many NaNs (about 400k), but I will tackle this issue in 'too_many_nans.ipynb'

# to tackle the first problem, I'll begin by creating a subset of the database
# to do that, I'll use USPTO classification system

In [3]:
# Trying to save memory is leading to a small nightmare 
# I am postponing the use of dask modules

In [4]:
import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler

import pandas as pd
import numpy as np
import re

import dask.dataframe as dd

In [5]:
# wipo_horiz.csv
# from wipo_horiz.ipynb
# Original WIPO classification reshaped
# patent_id:    patent number
# field_id_n :  'n' is the WIPO class

# uspatentcitation.tsv
# uuid:         unique id
# patent_id:    patent number
# citation_id:  patent number of the patent that the select patent (patent_id) cites
# date:         date select patent (patent_id) cites patent (citation_id)
# name:         name of cited record
# kind:         WIPO document kind codes - 2002 and after
#               (http://www.uspto.gov/learning-and-resources/support-centers/electronic-business-center/kind-codes-included-uspto-patent)
# country:      country cited patent was granted (always US)
# category:     who cited the patent (examiner, applicant, other etc) - 2002 and after
# sequence:     order in which this reference is cited by select patent - all years


In [6]:
# Every input and output of this script lives in the same data directory.
DATA_DIR = '/home/rkogeyam/PATENT_CITATION/data'

# Input: citation table (only patent_id and citation_id are read from it).
citation = DATA_DIR + '/cleanuspatentcitation.csv'
# Output: the citation-level table written at the end of this script.
dst = DATA_DIR + '/generality_temp.csv'
# Input: WIPO classification; the first-class file avoids multiple classes per patent.
# Alternatives that were used before:
#   DATA_DIR + '/wipo_horiz.csv'    (original WIPO classification, reshaped)
#   DATA_DIR + '/wipo_kind_b1.csv'  (a subset of the data)
wipo = DATA_DIR + '/wipo_first_class.csv'


In [7]:
%%time
# Read the identifier columns as text straight from disk.
# If pandas infers the dtype and the cast to str happens afterwards, the join key can be
# corrupted silently: a single missing value turns the column into float
# (10000122 -> '10000122.0') and identifiers with leading zeros lose them.
id_columns_as_text = {'patent_id': str, 'citation_id': str}

# WIPO classification table (patent_id plus the class column(s)).
wipo_df = pd.read_csv(wipo, dtype={'patent_id': str})

# Citation table: only the citing (patent_id) and cited (citation_id) identifiers are needed.
citation_df = pd.read_csv(
    citation,
    sep=',',
    usecols=['patent_id', 'citation_id'],
    dtype=id_columns_as_text,
)

# sample_size=100
# wipo_df=sampler(wipo, sample_size)
# citation_df=sampler(citation, sample_size)

CPU times: user 2min 15s, sys: 7.38 s, total: 2min 22s
Wall time: 2min 22s


In [8]:
# Quick look at the first five rows to check which columns were loaded.
wipo_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,patent_id,field_id
0,4,10000122,1
1,6,10000133,1
2,7,10000134,1
3,9,10000138,1
4,15,10000167,1


In [9]:
# Both tables must hold patent_id as text, otherwise the keys will not match in the merge.
wipo_df = wipo_df.astype({'patent_id': str})
# Every column of the citation table is an identifier, so all of them are cast.
citation_df = citation_df.astype({column: str for column in citation_df.columns})

In [10]:
# Move patent_id into the index of both tables: joining on the index is faster.
wipo_df = wipo_df.set_index('patent_id')
citation_df = citation_df.set_index('patent_id')

In [11]:
# (rows, columns) of the citation table; patent_id is now the index, so citation_id is the only column left
citation_df.shape

(91453297, 1)

In [12]:
%%time
# Citation-level dataset: each citation row gets the WIPO class of its citing patent.
# join is faster than merge.
#
# Columns named 'Unnamed: ...' (visible in wipo_df.head()) are presumably row numbers left
# over from earlier CSV round trips. They are dropped here so they are not repeated for
# every citation row and written to the output file.
wipo_classes = wipo_df.loc[:, ~wipo_df.columns.str.startswith('Unnamed')]
df = citation_df.join(wipo_classes, how='inner')

# The left dataframe is citation_df, which is indexed by patent_id; the inner join keeps
# only the patent_ids that are present in both tables.
# When grouping by citation_id later on, it is very possible that NaNs appear.

CPU times: user 2min 31s, sys: 3.83 s, total: 2min 34s
Wall time: 2min 34s


In [13]:
# Save the joined table; the patent_id index is written as the first column.
df.to_csv(dst, index=True)