In [1]:
# February 12th, 2020
# This script compares classification in each classification system
# Second level matches specifications of Nemet and Johnson 2012


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

import sys
sys.path.append('/home/rkogeyam/scripts/')
sys.path.append('scripts/')

from determinants_scripts import classes, dtypes

from classification import preprocessing

# Path of the cleaned patent citation file.
citation = 'data/cleanuspatentcitation.csv'

# Columns to load from the citation file.
usecols = ['uuid', 'patent_id', 'citation_id']

# Only the first 100000 rows are read.
citation_df = pd.read_csv(
    citation,
    usecols=usecols,
    nrows=100000,
)


In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Apply seaborn's default plot styling to subsequent figures.
sns.set()

In [5]:
# Classification systems to compare; each name selects 'data/<name>.csv'.
# For a quick single-system run use: class_systems = ['wipo']
class_systems = [
    'wipo',
    'ipcr',
    'cpc',
    'nber',
]

In [6]:
%%time
# This approach uses too much memory
# Try something else

for class_system in class_systems:
    
    classification = 'data/'+class_system+'.csv'
    
    class_df=pd.read_csv(classification, dtype=object, usecols=[0,1,2])
    
    #join class to patent_id
    df=preprocessing(class_df, citation_df)
    df.rename(columns={df.columns[2]:'level1_pat', df.columns[3]:'level2_pat'}, inplace=True)
    
    #join class to citation_id
    df=preprocessing(class_df, df, generality=False)
    
    
    #classify far external 
    far_ext=class_system+'_far_ext'
    df[far_ext]=np.where(df['level1_pat'] != df[df.columns[3]], 1, 0)
    
    #classify external
    ext=class_system+'_ext'
    df[ext]=np.where(df['level2_pat'] != df[df.columns[4]], 1, 0)
    
    df=df[['uuid', far_ext, ext]].set_index('uuid')
    print(df.head())

    #generate the output df if wipo, join if others
    if class_system=='wipo':
        output=df
    else:
        output=output.join(df)
        

                           wipo_far_ext  wipo_ext
uuid                                             
00j98gjluopjbahvckjxcsu5w             1         1
00ssb4om5j0ed1ws1bd6n3p8z             0         0
0119rbpayfbl601kj200as9rc             0         0
00y0veux7whsa003hv2crfxym             0         0
003imu4a4w19b2p2pavy111qh             1         1
                           ipcr_far_ext  ipcr_ext
uuid                                             
00j98gjluopjbahvckjxcsu5w             1         1
00ssb4om5j0ed1ws1bd6n3p8z             1         0
0119rbpayfbl601kj200as9rc             0         1
00y0veux7whsa003hv2crfxym             0         0
003imu4a4w19b2p2pavy111qh             1         1
                           cpc_far_ext  cpc_ext
uuid                                           
00j98gjluopjbahvckjxcsu5w            1        1
00ssb4om5j0ed1ws1bd6n3p8z            0        0
0119rbpayfbl601kj200as9rc            0        0
00y0veux7whsa003hv2crfxym            0        0
003imu4a4w19

In [7]:
# Show column dtypes and non-null counts of the combined flag table.
output.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77333 entries, 00j98gjluopjbahvckjxcsu5w to 00xq424kkwvhic8c8ran7e8co
Data columns (total 8 columns):
wipo_far_ext    77333 non-null int64
wipo_ext        77333 non-null int64
ipcr_far_ext    72997 non-null float64
ipcr_ext        72997 non-null float64
cpc_far_ext     77324 non-null float64
cpc_ext         77324 non-null float64
nber_far_ext    64079 non-null float64
nber_ext        64079 non-null float64
dtypes: float64(6), int64(2)
memory usage: 5.3+ MB


In [8]:
# Summary statistics; the mean of each 0/1 flag is the share of rows flagged 1.
output.describe()

Unnamed: 0,wipo_far_ext,wipo_ext,ipcr_far_ext,ipcr_ext,cpc_far_ext,cpc_ext,nber_far_ext,nber_ext
count,77333.0,77333.0,72997.0,72997.0,77324.0,77324.0,64079.0,64079.0
mean,0.195609,0.369338,0.295601,0.567681,0.28043,0.385456,0.248974,0.403908
std,0.396671,0.482629,0.456316,0.495402,0.449212,0.486706,0.432422,0.490683
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Persist the combined flags (index `uuid` included) as gzip-compressed CSV.
# NOTE(review): the file is gzip-compressed but keeps a plain '.csv' name, so
# readers must pass compression='gzip' explicitly -- consider a '.csv.gz' name.
output.to_csv('data/internal_external_citation.csv', compression='gzip')