In [1]:
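# February 12th, 2020
# For each citation, this script compares the class attached to patent_id with the
# class attached to citation_id, under each classification system
# The second classification level matches the specification of Nemet and Johnson (2012)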


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

import sys
sys.path.append('/home/rkogeyam/scripts/')
sys.path.append('scripts/')

# from determinants_scripts import classes, dtypes

from classification import preprocessing

import gzip

# Gzip-compressed CSV holding the citation table.
citation = 'data/cleanuspatentcitation.csv.gz'

# Only these columns are loaded: 'uuid' becomes the index of the final output,
# 'patent_id' and 'citation_id' are the keys the classification is joined on.
usecols = ['uuid', 'patent_id', 'citation_id']

# Read through a context manager so the gzip handle is closed as soon as the
# frame is in memory (it used to stay open for the life of the kernel).
# Smaller sample for quick experiments:
# citation_df=pd.read_csv(file_cit, usecols=usecols, nrows=100000)
with gzip.open(citation, 'rt') as file_cit:
    citation_df = pd.read_csv(file_cit, usecols=usecols)


In [3]:
# class_systems=['wipo']
# class_systems=['wipo', 'ipcr', 'cpc', 'nber']

In [4]:
%%time
# This approach uses too much memory
# Try something else

# Classification system processed by this run; it also prefixes the output columns.
class_system = 'wipo'

# Each classification system lives in its own gzip-compressed CSV under data/.
classification = 'data/' + class_system + '.csv.gz'

# Load the first three columns of the file, all kept as object dtype (no numeric
# conversion). The context manager closes the gzip handle right after the read
# instead of leaving it open for the rest of the session.
with gzip.open(classification, 'rt') as file_class:
    class_df = pd.read_csv(file_class, dtype=object, usecols=[0, 1, 2])

#join class to patent_id
df = preprocessing(class_df, citation_df)

# Label the two class columns by position so they stay identifiable after the
# second join. NOTE(review): positions 2 and 3 depend on the column order
# returned by preprocessing — confirm if that helper changes.
df = df.rename(columns={df.columns[2]: 'level1_pat', df.columns[3]: 'level2_pat'})

CPU times: user 29.1 s, sys: 861 ms, total: 30 s
Wall time: 29.9 s


In [5]:
# Turn the current index of class_df into an ordinary column and fall back to a
# default integer index.
# NOTE(review): presumably needed by the second preprocessing call — confirm.
# Run this cell once per load: every re-run alters class_df again.
class_df = class_df.reset_index()

In [6]:
# Second join: attach the classification columns again, this time on citation_id.
# NOTE(review): what generality=False switches inside preprocessing is not
# visible in this notebook — confirm against scripts/classification.py.
df = preprocessing(
    class_df,
    df,
    generality=False,
)

In [7]:
# Far external: 1 when the level-1 class on the patent_id side differs from the
# column at position 3, else 0.
# NOTE(review): position 3 is presumably the level-1 class joined on citation_id;
# this relies on the column order produced by preprocessing — confirm.
far_ext = '{}_far_ext'.format(class_system)
df[far_ext] = np.where(
    df['level1_pat'] != df[df.columns[3]],
    1,
    0,
)

# External: the same comparison at level 2, against the column at position 4.
# The position is read after the far-external column has been appended, as before.
ext = '{}_ext'.format(class_system)
df[ext] = np.where(
    df['level2_pat'] != df[df.columns[4]],
    1,
    0,
)

# Keep only the two indicators, indexed by the citation uuid.
df = df.loc[:, ['uuid', far_ext, ext]].set_index('uuid')
print(df.head())

# 'wipo' starts the output table; any other system is joined onto the existing
# table on the uuid index.
output = df if class_system == 'wipo' else output.join(df)
        

                           wipo_far_ext  wipo_ext
uuid                                             
0039pu9re4ynfr27uoqzn0i9b             0         0
00l0w5x2m3iw3h9l951t67lr4             0         0
00l8rnm5yieddwgbuam780gok             0         1
00ypbyofc9vkv4lx28dbpu1ao             0         0
0110d1ffwdqtnoe6rpujbbsk4             0         1


In [8]:
# %%time
# # This approach uses too much memory
# # Try something else

# for class_system in class_systems:
    
#     classification = 'data/'+class_system+'.csv.gz'
#     file_class=gzip.open(classification, 'rt')
#     class_df=pd.read_csv(file_class, dtype=object, usecols=[0,1,2])
    
#     #join class to patent_id
#     df=preprocessing(class_df, citation_df)
#     df.rename(columns={df.columns[2]:'level1_pat', df.columns[3]:'level2_pat'}, inplace=True)
    
#     #join class to citation_id
#     df=preprocessing(class_df, df, generality=False)
    
    
#     #classify far external 
#     far_ext=class_system+'_far_ext'
#     df[far_ext]=np.where(df['level1_pat'] != df[df.columns[3]], 1, 0)
    
#     #classify external
#     ext=class_system+'_ext'
#     df[ext]=np.where(df['level2_pat'] != df[df.columns[4]], 1, 0)
    
#     df=df[['uuid', far_ext, ext]].set_index('uuid')
#     print(df.head())

#     #generate the output df if wipo, join if others
#     if class_system=='wipo':
#         output=df
#     else:
#         output=output.join(df)
        

In [9]:
# Structural summary of the result: number of rows, columns, dtypes and memory use.
output.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77667 entries, 0039pu9re4ynfr27uoqzn0i9b to 00t6izymo7jy6kj4znwvkkuoq
Data columns (total 2 columns):
wipo_far_ext    77667 non-null int64
wipo_ext        77667 non-null int64
dtypes: int64(2)
memory usage: 1.8+ MB


In [10]:
# Descriptive statistics of the 0/1 indicator columns; each column's mean is the
# share of rows flagged 1.
output.describe()

Unnamed: 0,wipo_far_ext,wipo_ext
count,77667.0,77667.0
mean,0.200227,0.374419
std,0.400172,0.483976
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,1.0
max,1.0,1.0


In [11]:
# Persist the indicator table as a gzip-compressed CSV under data/.
output.to_csv(
    'data/internal_external_citation.csv.gz',
    compression='gzip',
)