In [1]:
# Script to calculate originality and application data
# This script prepares the classification data before merging
# Jan 16th, 2020
# Multiple classifications per patent cause an error in the Generality calculation,
# so I am creating a version that keeps only the first class of each patent

# from the data dictionary

# patent_id: patent number
# field_id:  WIPO technology field ID as derived from crosswalk 
#            http://www.wipo.int/export/sites/www/ipstats/en/statistics/patents/xls/ipc_technology.xls
# sequence:  order in which WIPO technology field appears on patent

In [2]:
import pandas as pd
import numpy as np
import re

# Inputs
wipo = "data/wipo.tsv.zip"  # WIPO class(es) of each patent
file_classes = "data/classes.csv.gz"  # custom-made relationship table of classes

# Output
dst = "data/wipo.parquet.gz"  # destination of the prepared table

In [3]:
# Load the tab-separated WIPO table from the zip archive, reading every column as object dtype
df = pd.read_csv(
    wipo,
    sep='\t',
    compression='zip',
    dtype='object',
)

In [4]:
# dtype='object' in read_csv is not applied to the index, so patent_id is
# moved into the index only after loading, which avoids that issue
df = df.set_index(keys='patent_id')

In [5]:
# Summary of the loaded table: index type, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9122912 entries, 10000000 to 9999999
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   field_id  object
 1   sequence  object
dtypes: object(2)
memory usage: 208.8+ MB


In [6]:
# Keep only the first class of each patent (sequence '0'); the other classes
# are dropped here, although it could be interesting to explore them later.
# `sequence` was read as object dtype, so it is compared with the string '0'.
# field_id is left-padded with zeros to two characters to avoid problems in
# the merge with the classes table; the conversion to categorical happens
# later, after the columns are renamed.
# Rows and column are selected in a single .loc call and the padded column is
# built with .assign, which returns a new frame, so the result never depends
# on view-versus-copy semantics of an intermediate selection.
df = (
    df.loc[df['sequence'] == '0', ['field_id']]
      .assign(field_id=lambda first: first['field_id'].apply('{:0>2}'.format))
)

In [7]:
# WIPO is a classification system with a single level.
# A higher-level class was created artificially so that it is comparable with
# Nemet and Johnson 2012; robustness can still be checked with the other
# class systems.
usecols = ['system', 'id', 'sector_title']
df_classes = pd.read_csv(
    file_classes,
    index_col='id',
    usecols=usecols,
    compression='gzip',
)

In [8]:
# Only the sector_title of the rows belonging to the 'wipo_field_id' system is needed
df_classes = df_classes.loc[df_classes['system'] == 'wipo_field_id', 'sector_title']

In [9]:
# Attach to each patent the higher-level WIPO group (sector_title) of its field.
# The join is inner (the pandas default), so a patent whose field_id has no
# match in df_classes does not appear in the result.
df = pd.merge(df, df_classes, how='inner', left_on='field_id', right_index=True)

In [10]:
# Show the last rows of the merged table
df.tail(n=5)

Unnamed: 0_level_0,field_id,sector_title
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9998313,5,Electrical Eng
9998316,5,Electrical Eng
9998748,5,Electrical Eng
9998825,5,Electrical Eng
9999780,5,Electrical Eng


In [11]:
# Rename the columns so they stay backward compatible with the existing code
df = df.rename(
    columns={
        'field_id': 'wipo_field_id',
        'sector_title': 'wipo_sector_id',
    }
)

In [12]:
# Store both classification columns as pandas categoricals
df = df.astype({'wipo_sector_id': 'category', 'wipo_field_id': 'category'})

In [13]:
# Simple test of the index's type: the lookup uses a string label
df.loc[df.index == '10000000']

Unnamed: 0_level_0,wipo_field_id,wipo_sector_id
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10000000,10,Instruments


In [14]:
# Convert every index label (patent_id) with str() before the table is saved
df.index = df.index.map(str)

In [15]:
# Write the prepared table to dst in parquet format, using gzip compression
df.to_parquet(dst, compression='gzip')