In [1]:
# February 12th, 2020
# This script compares classification in each classification system
# Second level matches specifications of Nemet and Johnson 2012


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

import sys
# Make the project-local modules imported below discoverable.
# NOTE(review): the first entry is an absolute path tied to one user's home
# directory; on other machines only the relative 'scripts/' entry can resolve,
# and only when the notebook is run from the project root — TODO confirm.
sys.path.append('/home/rkogeyam/scripts/')
sys.path.append('scripts/')

from determinants_scripts import classes, dtypes

from classification import preprocessing

# Location of the citation table and the columns loaded from it.
citation = 'data/cleanuspatentcitation.csv'
usecols = ['uuid', 'patent_id', 'citation_id']

# Restricting the read to `usecols` keeps unneeded columns out of memory.
citation_df = pd.read_csv(
    filepath_or_buffer=citation,
    usecols=usecols,
)


In [3]:
%matplotlib inline

In [4]:
# Apply seaborn's default styling to any matplotlib figures drawn afterwards.
sns.set()

In [5]:
# Classification systems to compare; each name must have a matching
# data/<name>.csv file, which the comparison loop reads.
# For a quick single-system run use: class_systems=['wipo']
class_systems=['wipo', 'ipcr', 'cpc', 'nber']

In [6]:
%%time
# For every classification system, flag each citation (row keyed by uuid) as
# "far external" (level-1 classes differ) and "external" (level-2 classes
# differ), then collect the flags of all systems side by side in `output`.
#
# TODO: this approach uses too much memory; try something else.

# Accumulator for the combined flags. It is seeded by whichever system is
# processed first, so the loop does not depend on 'wipo' leading
# class_systems and never joins onto a stale `output` left in the kernel.
output = None

for class_system in class_systems:

    classification = 'data/' + class_system + '.csv'

    # Only the first three columns are read; dtype=object stops pandas from
    # coercing the values to numbers.
    class_df = pd.read_csv(classification, dtype=object, usecols=[0, 1, 2])

    # join class to patent_id
    df = preprocessing(class_df, citation_df)
    # Columns 2 and 3 of the joined frame are renamed so that they can be told
    # apart from the columns added by the second join below.
    df = df.rename(columns={df.columns[2]: 'level1_pat',
                            df.columns[3]: 'level2_pat'})

    # join class to citation_id
    df = preprocessing(class_df, df, generality=False)

    # NOTE(review): the comparisons below address the citation-side classes by
    # position (3 and 4), which relies on the column order returned by
    # preprocessing — confirm against its implementation.
    # NOTE(review): NaN != NaN evaluates to True, so a missing class on either
    # side would be flagged 1 if preprocessing leaves NaN in — verify.

    # classify far external
    far_ext = class_system + '_far_ext'
    df[far_ext] = np.where(df['level1_pat'] != df[df.columns[3]], 1, 0)

    # classify external
    ext = class_system + '_ext'
    df[ext] = np.where(df['level2_pat'] != df[df.columns[4]], 1, 0)

    df = df[['uuid', far_ext, ext]].set_index('uuid')
    print(df.head())

    # The first system seeds the output; every later system is left-joined on
    # the uuid index, so only uuids present in the first system are kept and
    # uuids missing from a later system get NaN in that system's columns.
    output = df if output is None else output.join(df)
        

                           wipo_far_ext  wipo_ext
uuid                                             
51l5izpbtjpzajh1dfklk0e4n             0         0
64thngwfec9a022vjzcqch2jt             0         0
7vvfabcwix3pfrrz9ov1uay6l             0         1
6l5jnj0hitatw249pylq26ot8             1         1
2rildowxihjnojhj8pk5s77pp             0         0
                           ipcr_far_ext  ipcr_ext
uuid                                             
51l5izpbtjpzajh1dfklk0e4n             0         0
64thngwfec9a022vjzcqch2jt             0         0
7vvfabcwix3pfrrz9ov1uay6l             0         0
4tih8qtmlqq7uqfkkusqcdnr2             1         1
6l5jnj0hitatw249pylq26ot8             0         0
                           cpc_far_ext  cpc_ext
uuid                                           
51l5izpbtjpzajh1dfklk0e4n            0        0
64thngwfec9a022vjzcqch2jt            0        1
7vvfabcwix3pfrrz9ov1uay6l            0        0
6l5jnj0hitatw249pylq26ot8            0        1
2rildowxihjn

In [7]:
# Summarize dtypes, row count and memory footprint of the combined flag table.
# NOTE(review): flag columns come out as float64 wherever the join left NaN
# for uuids missing from that system — confirm this is intended before export.
output.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15448661 entries, 51l5izpbtjpzajh1dfklk0e4n to 4i52a0z3d8xym9fr9x4l0gfi3
Data columns (total 8 columns):
wipo_far_ext    int64
wipo_ext        int64
ipcr_far_ext    float64
ipcr_ext        float64
cpc_far_ext     float64
cpc_ext         float64
nber_far_ext    float64
nber_ext        float64
dtypes: float64(6), int64(2)
memory usage: 1.0+ GB


In [8]:
# Summary statistics per flag column. Because the flags are 0/1, each mean is
# the share of citations flagged 1; NaN rows are excluded from each column's count.
output.describe()

  interpolation=interpolation)


Unnamed: 0,wipo_far_ext,wipo_ext,ipcr_far_ext,ipcr_ext,cpc_far_ext,cpc_ext,nber_far_ext,nber_ext
count,15448660.0,15448660.0,14579660.0,14579660.0,15447520.0,15447520.0,12784480.0,12784480.0
mean,0.1970766,0.3720058,0.2940687,0.566674,0.2806458,0.3863544,0.2500311,0.4069588
std,0.3977906,0.48334,0.455623,0.4955347,0.4493148,0.4869134,0.4330307,0.4912671
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,,,,,,
50%,0.0,0.0,,,,,,
75%,0.0,1.0,,,,,,
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Persist the combined flags; the uuid index is written as the first CSV column.
output.to_csv('data/internal_external_citation.csv')