In [1]:
# February 12st, 2020
# This script compares classification in each classification system
# Second level matches specifications of Nemet and Johnson 2012


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

import sys
sys.path.append('/home/rkogeyam/scripts/')
sys.path.append('scripts/')

from determinants_scripts import classes, dtypes

from classification import preprocessing

citation='data/cleanuspatentcitation.csv'

usecols=['uuid', 'patent_id', 'citation_id']
df=pd.read_csv(citation, usecols=usecols)


In [3]:
%matplotlib inline

In [4]:
sns.set()

In [5]:
# class_systems=['wipo']
class_systems=['ipcr', 'cpc', 'nber']

In [6]:
#create classes dataset
class_df=pd.read_csv('data/wipo.csv', dtype=object, usecols=[0,1,2])
class_df=class_df.astype(str)
class_df=class_df.set_index('id')

for class_system in class_systems:
    classification = 'data/'+class_system+'.csv'
    class_x=pd.read_csv(classification, dtype=object, usecols=[0,1,2])
    class_x=class_x.astype(str)
    class_x=class_x.set_index('id')
    class_df=class_df.join(class_x)


In [7]:
class_df.dtypes

wipo_sector_id         object
wipo_field_id          object
ipcr_section           object
ipcr_ipc_class         object
cpc_section_id         object
cpc_subsection_id      object
nber_category_id       object
nber_subcategory_id    object
dtype: object

In [8]:
df=df.astype(str)
df=df.set_index('patent_id')

In [9]:
df.dtypes

uuid           object
citation_id    object
dtype: object

In [10]:
df=df.join(class_df)

In [11]:
print(df.head())
dimensions=list(df.columns)
dimensions.remove('uuid')
dimensions.remove('citation_id')
dimensions = [sub.replace('_id', '') for sub in dimensions] 

                              uuid citation_id wipo_sector_id wipo_field_id  \
3930271  mok3net9ll9apdxvjvf7r0mjb     2379430              4            33   
3930271  zvlxv3wtw969uvcd9azanhlwr     2782422              4            33   
3930272  6ddwoi89ailxhwrmj0ehi1z51     2560109              4            33   
3930272  77qebcired4jpt5gc24ynddl1     2545289              4            33   
3930272  af2k64rnid999g4g0hdutm75o     1549144              4            33   

        ipcr_section ipcr_ipc_class cpc_section_id cpc_subsection_id  \
3930271            A             41              A               A63   
3930271            A             41              A               A63   
3930272            A             47              A               A47   
3930272            A             47              A               A47   
3930272            A             47              A               A47   

        nber_category_id nber_subcategory_id  
3930271              NaN                 NaN 

In [12]:
ext_list = [sub + '_ext' for sub in dimensions] 

In [13]:
ext_list

['wipo_sector_ext',
 'wipo_field_ext',
 'ipcr_section_ext',
 'ipcr_ipc_class_ext',
 'cpc_section_ext',
 'cpc_subsection_ext',
 'nber_category_ext',
 'nber_subcategory_ext']

In [14]:
df = df.add_suffix('_pat')
df.rename(columns={'uuid_pat':'uuid', 'citation_id_pat':'citation_id'}, inplace=True)

In [15]:
df.columns

Index(['uuid', 'citation_id', 'wipo_sector_id_pat', 'wipo_field_id_pat',
       'ipcr_section_pat', 'ipcr_ipc_class_pat', 'cpc_section_id_pat',
       'cpc_subsection_id_pat', 'nber_category_id_pat',
       'nber_subcategory_id_pat'],
      dtype='object')

In [16]:
df=df.set_index('citation_id')
df=df.join(class_df)

In [17]:
x_1=2
x_2=10
for dimension in ext_list:
    print(list(df.columns)[x_1],list(df.columns)[x_2])
    df[dimension]=np.where(df.iloc[:,x_1] != df.iloc[:,x_2], 1, 0)
    df.loc[df.iloc[:,x_1].isnull(), dimension] = np.nan
    df.loc[df.iloc[:,x_2].isnull(), dimension] = np.nan 
    x_1+=1
    x_2+=1


wipo_field_id_pat wipo_field_id
ipcr_section_pat ipcr_section
ipcr_ipc_class_pat ipcr_ipc_class
cpc_section_id_pat cpc_section_id
cpc_subsection_id_pat cpc_subsection_id
nber_category_id_pat nber_category_id
nber_subcategory_id_pat nber_subcategory_id
wipo_sector_id wipo_sector_ext


In [18]:
df.isnull().sum()

uuid                              0
wipo_sector_id_pat          8854845
wipo_field_id_pat           8854845
ipcr_section_pat           11030867
ipcr_ipc_class_pat         11030867
cpc_section_id_pat          9304816
cpc_subsection_id_pat       9304816
nber_category_id_pat       23992622
nber_subcategory_id_pat    23992622
wipo_sector_id             18111089
wipo_field_id              18111089
ipcr_section               20186335
ipcr_ipc_class             20186335
cpc_section_id             18113386
cpc_subsection_id          18113386
nber_category_id           18355546
nber_subcategory_id        18355546
wipo_sector_ext            20696984
wipo_field_ext             24626201
ipcr_section_ext           24626201
ipcr_ipc_class_ext         21117545
cpc_section_ext            21117545
cpc_subsection_ext         34734596
nber_category_ext          34734596
nber_subcategory_ext       20696984
dtype: int64

In [19]:
df.head()

Unnamed: 0,uuid,wipo_sector_id_pat,wipo_field_id_pat,ipcr_section_pat,ipcr_ipc_class_pat,cpc_section_id_pat,cpc_subsection_id_pat,nber_category_id_pat,nber_subcategory_id_pat,wipo_sector_id,...,nber_category_id,nber_subcategory_id,wipo_sector_ext,wipo_field_ext,ipcr_section_ext,ipcr_ipc_class_ext,cpc_section_ext,cpc_subsection_ext,nber_category_ext,nber_subcategory_ext
0,eeggwd3pnsfcara1i60o42hdu,3,25,B,65,B,B65,5,59,,...,,,,,,,,,,
0,xjdvdj2y0lgdas51rewjzn405,3,29,A,1,A,A01,5,51,,...,,,,,,,,,,
4,e89bsa3x0yen50dcp9yogj3wp,3,28,B,41,B,B41,6,69,,...,,,,,,,,,,
6,a6crepaxejht8e16czfm0kytd,1,2,G,6,G,G11,4,45,,...,,,,,,,,,,
482,olkyv8qnteatjed767vodhmlz,1,4,H,4,H,H04,2,21,,...,,,,,,,,,,


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91453297 entries, 0 to re25727
Data columns (total 25 columns):
uuid                       object
wipo_sector_id_pat         object
wipo_field_id_pat          object
ipcr_section_pat           object
ipcr_ipc_class_pat         object
cpc_section_id_pat         object
cpc_subsection_id_pat      object
nber_category_id_pat       object
nber_subcategory_id_pat    object
wipo_sector_id             object
wipo_field_id              object
ipcr_section               object
ipcr_ipc_class             object
cpc_section_id             object
cpc_subsection_id          object
nber_category_id           object
nber_subcategory_id        object
wipo_sector_ext            float64
wipo_field_ext             float64
ipcr_section_ext           float64
ipcr_ipc_class_ext         float64
cpc_section_ext            float64
cpc_subsection_ext         float64
nber_category_ext          float64
nber_subcategory_ext       float64
dtypes: float64(8), object(17)
me

In [21]:
df.describe()

  interpolation=interpolation)


Unnamed: 0,wipo_sector_ext,wipo_field_ext,ipcr_section_ext,ipcr_ipc_class_ext,cpc_section_ext,cpc_subsection_ext,nber_category_ext,nber_subcategory_ext
count,70756310.0,66827100.0,66827100.0,70335750.0,70335750.0,56718700.0,56718700.0,70756313.0
mean,0.3719768,0.2943838,0.5659414,0.2807416,0.3861809,0.2500893,0.4069355,1.0
std,0.4833322,0.4557653,0.4956327,0.4493615,0.4868729,0.4330643,0.4912627,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,,,,,,,,
50%,,,,,,,,
75%,,,,,,,,
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
df.set_index('uuid', inplace=True)
df.select_dtypes(include=[np.number]).to_csv('data/int_ext_cit_v2.csv')