In [1]:
# Script to calculate generality and application data
# Generality: how diverse the impact of a patent is
# It is computed from the Herfindahl index of the citing patents (generality = 1 - Herfindahl)

In [2]:
# Feb 7th, 2020
# While the previous version is already up, I want to improve the script
# I want to make the calculations with only one script
# Also, I want to compare different class systems
# This version took 10 minutes - I am moving the old version away

# Jan 16th, 2020
# Due to performance problems on the HPC, this script was divided in two; the script following this one is generality_2
# generality > 1 is not an error, but a consequence of adopting WIPO
# it seems that the original calculation had only one class per patent
# WIPO provides multiple classes - so when you divide by the total number of citations, 
#  you do not have the proportion of classes cited anymore
# to correct this issue, I can calculate Generality and Originality based only on the first WIPO class

# Jan 13th, 2020
# The script is running, but there are two major issues
# - generality > 1 should not exist, so there is an error in the calculation
# - too many NaNs (about 400k), but I will tackle this issue in 'too_many_nans.ipynb'

# to tackle the first problem, I'll begin by creating a subset of the database
# to do that, I'll use USPTO classification system

In [3]:
# Trying to save memory is leading to a small nightmare 
# I am postponing the use of dask modules

In [4]:
import sys
sys.path.append('/home/rkogeyam/scripts/')
from sampler import sampler

import pandas as pd
import numpy as np
import re

import dask.dataframe as dd

import gzip

In [5]:
# Input: gzip-compressed CSV of citation pairs (patent_id / citation_id columns are read from it)
citation = '/home/rkogeyam/PATENT_CITATION/data/cleanuspatentcitation.csv.gz'
# Input: gzip-compressed CSV with the WIPO classification of each patent
classification = '/home/rkogeyam/PATENT_CITATION/data/wipo.csv.gz'  # avoid multiple classes

# Output: gzip-compressed CSV that receives the generality table
dst = '/home/rkogeyam/PATENT_CITATION/data/generality_temp.csv.gz'

In [6]:
%%time
file_cit=gzip.open(citation, 'r')
citation_df=pd.read_csv(file_cit, sep=',', usecols=['patent_id', 'citation_id'])

file_class=gzip.open(classification, 'r')
class_df=pd.read_csv(file_class)

CPU times: user 3min 49s, sys: 8.91 s, total: 3min 58s
Wall time: 3min 58s


In [7]:
# Cast the ids to str on both sides so the join keys share a single dtype
class_df = class_df.astype({'id': str})
citation_df = citation_df.astype({name: str for name in citation_df.columns})

In [8]:
# Move the join keys into the index of each table (joining on the index is faster)
class_df = class_df.set_index('id')
citation_df = citation_df.set_index('patent_id')

In [9]:
# (rows, columns) of the citation table now that patent_id is the index
citation_df.shape

(108913836, 1)

In [10]:
%%time
#citation level dataset
#join is faster than merge
df=citation_df.join(class_df, how='inner')  

#the left dataframe is citation df, which is indexed by patent_id
#when later on I group by citation_id is very possible that NaNs appear

CPU times: user 3min 9s, sys: 3.82 s, total: 3min 13s
Wall time: 3min 12s


In [11]:
# df.to_csv(dst)

In [12]:
# One-hot encode wipo_sector_id into wipo_sector_id_<k> indicator columns
df = pd.get_dummies(data=df, columns=['wipo_sector_id'])

In [13]:
# Preview the citation-level rows after one-hot encoding the sector
df.head()

Unnamed: 0,citation_id,wipo_field_id,wipo_sector_id_0,wipo_sector_id_1,wipo_sector_id_2,wipo_sector_id_3,wipo_sector_id_4
10000000,5093563,10,0.0,0.0,1.0,0.0,0.0
10000000,5751830,10,0.0,0.0,1.0,0.0,0.0
10000001,9296144,29,0.0,0.0,0.0,1.0,0.0
10000001,7804268,29,0.0,0.0,0.0,1.0,0.0
10000001,9566732,29,0.0,0.0,0.0,1.0,0.0


In [14]:
%time
total_citation=df.groupby('citation_id').count().iloc[:,0]
total_citation=np.square(total_citation)
total_citation.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs


citation_id
0          1
0000000    1
000004     1
000006     1
0000482    1
Name: wipo_field_id, dtype: int64

In [15]:
%time
# df.drop('citation_id', inplace=True, axis=1)
df=df.groupby('citation_id').sum().fillna(0)
df.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs


Unnamed: 0_level_0,wipo_field_id,wipo_sector_id_0,wipo_sector_id_1,wipo_sector_id_2,wipo_sector_id_3,wipo_sector_id_4
citation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,25,0.0,0.0,0.0,1.0,0.0
0,29,0.0,0.0,0.0,1.0,0.0
4,28,0.0,0.0,0.0,1.0,0.0
6,2,0.0,1.0,0.0,0.0,0.0
482,4,0.0,1.0,0.0,0.0,0.0


In [16]:
%time
df_squared=np.square(df) #element-wise squaring
df_squared=df_squared.sum(axis='columns') #sum all columns, per row
df_squared.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


citation_id
0          626.0
0000000    842.0
000004     785.0
000006       5.0
0000482     17.0
dtype: float64

In [17]:
%time
df2=pd.concat([df_squared, total_citation], axis=1)
df2.columns=['df_squared', 'total_citation']
df2.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 16.7 µs


Unnamed: 0_level_0,df_squared,total_citation
citation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,626.0,1
0,842.0,1
4,785.0,1
6,5.0,1
482,17.0,1


In [18]:
%time
df2['herfindal']=df2['df_squared']/df2['total_citation'] #its a measure of concentration

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


In [19]:
%time
df2['generality']=1-df2['herfindal'] # as defined in Hall et al, 2001
df2['generality'].hist()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


<matplotlib.axes._subplots.AxesSubplot at 0x7f68ba1b9828>

In [20]:
# Row count, column dtypes and memory footprint of the result table
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7083682 entries, 0 to re25727
Data columns (total 4 columns):
df_squared        float64
total_citation    int64
herfindal         float64
generality        float64
dtypes: float64(3), int64(1)
memory usage: 270.2+ MB


In [21]:
# Preview the first rows of the result table, including herfindal and generality
df2.head()

Unnamed: 0_level_0,df_squared,total_citation,herfindal,generality
citation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,626.0,1,626.0,-625.0
0,842.0,1,842.0,-841.0
4,785.0,1,785.0,-784.0
6,5.0,1,5.0,-4.0
482,17.0,1,17.0,-16.0


In [22]:
# Summary statistics of every column of the result table
df2.describe()

Unnamed: 0,df_squared,total_citation,herfindal,generality
count,7083682.0,7083682.0,7083682.0,7083682.0
mean,291153.5,1560.634,436.3063,-435.3063
std,5431029.0,32793.18,388.9476,388.9476
min,2.0,1.0,2.0,-1225.0
25%,788.0,4.0,65.0,-769.6875
50%,5328.0,25.0,314.5373,-313.5373
75%,44136.0,169.0,770.6875,-64.0
max,5059543000.0,20866620.0,1226.0,-1.0


In [23]:
# Write the result table (indexed by citation_id) to dst as a gzip-compressed CSV
df2.to_csv(path_or_buf=dst, compression='gzip')

In [24]:
# %time
# df2=df_squared.to_frame().join(total_citation.to_frame())
# df2.head()