In [1]:
# Script to calculate generality and application data
# Generality: how diverse is the impact of a patent?
# This is done by calculating the Herfindahl index of the classes of its citing patents

In [2]:
# Feb 7th, 2020
# The previous version is already up, but I want to improve the script:
# - I want to make all the calculations with a single script
# - I also want to compare different classification systems
# This version took 10 minutes to run, so I am moving the old version away

# Jan 16th, 2020
# Due to performance problems on the HPC, this script was divided in two; the script following this one is generality_2
# Generality > 1 is not an error, but a consequence of adopting WIPO
# It seems that the original calculation had only one class per patent
# WIPO provides multiple classes, so when you divide by the total number of citations,
#  you no longer get the proportion of citations per class
# To correct this issue, I can calculate Generality and Originality based only on the first WIPO class

# Jan 13th, 2020
# The script is running, but there are two major issues:
# - generality > 1 should not exist, so there is an error in the calculation
# - too many NaNs (about 400k), but I will tackle this issue in 'too_many_nans.ipynb'

# To tackle the first problem, I'll begin by creating a subset of the database
# To do that, I'll use the USPTO classification system

In [3]:
# Trying to save memory is turning into a small nightmare
# I am postponing the use of the dask modules

In [4]:
import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler

import pandas as pd
import numpy as np
import re

import dask.dataframe as dd

In [5]:
# Input: citation table (its patent_id and citation_id columns are read below)
citation = '/home/rkogeyam/PATENT_CITATION/data/cleanuspatentcitation.csv'
# Input: WIPO classification table (picked to avoid multiple classes per patent)
classification = '/home/rkogeyam/PATENT_CITATION/data/wipo.csv'

# Output: the final generality table is written to this path
dst = '/home/rkogeyam/PATENT_CITATION/data/generality.csv'

In [6]:
%%time
class_df=pd.read_csv(classification)
citation_df=pd.read_csv(citation, sep=',', usecols=['patent_id', 'citation_id'])

CPU times: user 2min 17s, sys: 7.32 s, total: 2min 24s
Wall time: 2min 29s


In [7]:
# Cast the join keys to text so both tables use the same key format in the join
class_df = class_df.astype({'id': str})
citation_df = citation_df.astype({column: str for column in citation_df.columns})

In [8]:
# Move the join keys into the index (author's note: joining on the index is faster)
class_df = class_df.set_index('id')
citation_df = citation_df.set_index('patent_id')

In [9]:
# Sanity check: (rows, columns) of the citation table now that patent_id is the index
citation_df.shape

(91453297, 1)

In [10]:
%%time
# Citation-level dataset: attach the columns of class_df to every citation row
# by matching the two indexes (patent_id on the left, id on the right).
# how='inner' keeps only citation rows whose patent_id also appears in class_df.
# Author's note: join is faster than merge
df=citation_df.join(class_df, how='inner')  

# The left dataframe is citation_df, which is indexed by patent_id,
# so when I later group by citation_id it is very possible that NaNs appear

CPU times: user 2min 32s, sys: 3.43 s, total: 2min 35s
Wall time: 2min 35s


In [11]:
# df.to_csv(dst)

In [12]:
# One-hot encode the WIPO sector: each wipo_sector_id value becomes its own
# wipo_sector_id_<value> indicator column, so a later per-citation_id sum
# yields the number of citing rows coming from each sector.
df = pd.get_dummies(df, columns=['wipo_sector_id'])

In [13]:
# Preview the one-hot encoded citation rows (index is the patent_id join key)
df.head()

Unnamed: 0,citation_id,wipo_sector_id_0,wipo_sector_id_1,wipo_sector_id_2,wipo_sector_id_3,wipo_sector_id_4
3930271,2379430,0.0,0.0,0.0,0.0,1.0
3930271,2782422,0.0,0.0,0.0,0.0,1.0
3930272,2560109,0.0,0.0,0.0,0.0,1.0
3930272,2545289,0.0,0.0,0.0,0.0,1.0
3930272,1549144,0.0,0.0,0.0,0.0,1.0


In [14]:
%time
total_citation=df.groupby('citation_id').count().iloc[:,0]
total_citation=np.square(total_citation)
total_citation.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.72 µs


citation_id
0          1
0000000    1
000004     1
000006     1
0000482    1
Name: wipo_sector_id_0, dtype: int64

In [15]:
%time
# df.drop('citation_id', inplace=True, axis=1)
df=df.groupby('citation_id').sum().fillna(0)
df.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


Unnamed: 0_level_0,wipo_sector_id_0,wipo_sector_id_1,wipo_sector_id_2,wipo_sector_id_3,wipo_sector_id_4
citation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,1.0,0.0
0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0
6,0.0,1.0,0.0,0.0,0.0
482,0.0,1.0,0.0,0.0,0.0


In [16]:
%time
df_squared=np.square(df) #element-wise squaring
df_squared=df_squared.sum(axis='columns') #sum all columns, per row
df_squared.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.72 µs


citation_id
0          1.0
0000000    1.0
000004     1.0
000006     1.0
0000482    1.0
dtype: float64

In [17]:
%time
df2=pd.concat([df_squared, total_citation], axis=1)
df2.columns=['df_squared', 'total_citation']
df2.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


Unnamed: 0_level_0,df_squared,total_citation
citation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,1
0,1.0,1
4,1.0,1
6,1.0,1
482,1.0,1


In [18]:
%time
df2['herfindal']=df2['df_squared']/df2['total_citation'] #its a measure of concentration

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [19]:
%time
df2['generality']=1-df2['herfindal'] # as defined in Hall et al, 2001
df2['generality'].hist()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


<matplotlib.axes._subplots.AxesSubplot at 0x7f1ec88e1438>

In [20]:
# Column dtypes, row count and memory footprint of the result table
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6555152 entries, 0 to re25727
Data columns (total 4 columns):
df_squared        float64
total_citation    int64
herfindal         float64
generality        float64
dtypes: float64(3), int64(1)
memory usage: 250.1+ MB


In [21]:
# Preview the first rows of the result table (indexed by citation_id)
df2.head()

Unnamed: 0_level_0,df_squared,total_citation,herfindal,generality
citation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.0,1,1.0,0.0
0,1.0,1,1.0,0.0
4,1.0,1,1.0,0.0
6,1.0,1,1.0,0.0
482,1.0,1,1.0,0.0


In [22]:
# Summary statistics; useful to check the range of herfindal and generality
df2.describe()

Unnamed: 0,df_squared,total_citation,herfindal,generality
count,6555152.0,6555152.0,6555152.0,6555152.0
mean,921.8369,1105.135,0.8528391,0.1471609
std,18882.34,21189.08,0.21072,0.21072
min,1.0,1.0,0.2,0.0
25%,4.0,4.0,0.68,0.0
50%,16.0,25.0,1.0,0.0
75%,106.0,144.0,1.0,0.32
max,12347590.0,15186610.0,1.0,0.8


In [23]:
# Write the result table to dst as CSV; the citation_id index is written as the
# first column, followed by df_squared, total_citation, herfindal and generality
df2.to_csv(dst)

In [24]:
# %time
# df2=df_squared.to_frame().join(total_citation.to_frame())
# df2.head()