In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

src='data/ipcr.tsv'
dst='data/ipcr.csv'

In [2]:
# uuid:					unique id
# patent_id:			patent number
# classification_level:	ipc classification level (A = advanced level) 
# section:				ipc section (A = Human Necessitites, B = Performing Operations; Transporting, C = Chemistry; Metallurgy, D = Textiles; Paper, E = Fixed Constructions, F = Mechanical Engineering; Lighting; Heating; Weapons; Blasting, G = Physics, H = Electricity)
# ipc_class:			ipc class
# subclass:				ipc subclass
# main_group:			ipc group
# subgroup:				ipc subgroup
# symbol_position:				ipc symbol ( F = first or sole invention information IPC; L = any second or succeeding invention information IPC and any non-invention information IPC)
# classification_value:			ipc classification value ( I = invention information; N = non-invention information)
# classification_status:		ipc classification status ( B = Basic or Original)
# classification_data_source:	ipc classification data source ( H = Human - Generated; M = Machine - Generated; G = Generated via Software)
# action_date:					issue date of the patent grant
# ipc_version_indicator:		ipc version indicator
# sequence:						order in which ipc class appears in patent file

#IPC is hierarchical
#I am arbitrarily adopting the first three levels (section, class, subclass)

In [3]:
usecols=['patent_id', 'section', 'ipc_class', 'subclass', 'sequence']
df = pd.read_csv(src, sep='\t', usecols=usecols)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df=df[df['sequence']==0]

In [5]:
df=df.drop(['sequence'], axis=1)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5943638 entries, 0 to 12171205
Data columns (total 4 columns):
patent_id    object
section      object
ipc_class    object
subclass     object
dtypes: object(4)
memory usage: 226.7+ MB


In [7]:
columns=list(df)[1:-1]
for i in columns:
    print('\nTop 10', i)
    print(df[i].value_counts().head(10))
#     print(df[i].value_counts())
#     display(HTML(df[i].value_counts().to_html()))
#     display(HTML(df[i].value_counts().reset_index().to_html()))


Top 10 section
G    1494540
H    1305867
B     943327
A     796510
C     600732
F     448067
D     203406
E     150207
L        179
M        111
Name: section, dtype: int64

Top 10 ipc_class
1     590716
01    554451
6     328657
4     308417
06    304526
04    276058
61    225572
61    210389
2     175124
02    161940
Name: ipc_class, dtype: int64


In [8]:
columns

['section', 'ipc_class']

In [9]:
for i in columns:
    df[i].value_counts().plot.bar()

In [10]:
# display(HTML(df['main_group'].value_counts().reset_index().to_html()))

In [11]:
df=df.add_prefix('ipcr_')

In [12]:
df.rename(columns={ df.columns[0]: "id" }, inplace = True)

In [13]:
df = df.applymap(lambda x: str(x).upper())

In [14]:
df.columns

Index(['id', 'ipcr_section', 'ipcr_ipc_class', 'ipcr_subclass'], dtype='object')

In [15]:
df['ipcr_section'].value_counts()

G    1494544
H    1305869
B     943329
A     796510
C     600737
F     448067
D     203406
E     150208
L        179
M        111
N         89
P         86
R         79
K         58
Q         52
2         50
I         48
S         48
1         39
J         28
O         17
T         15
V         11
6          9
0          8
Z          8
4          7
X          5
3          4
W          4
U          4
5          3
8          3
Y          2
9          1
Name: ipcr_section, dtype: int64

In [16]:
df[~df['ipcr_section'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])]

Unnamed: 0,id,ipcr_section,ipcr_ipc_class,ipcr_subclass
5930,4864538,Q,11,C
9264,6256218,K,11,C
17273,5866254,I,22,C
22263,5697956,N,61,W
27702,7233174,K,3,K
28687,5337483,R,26,B
29514,5884865,R,64,C
39232,5937282,M,1,C
44543,5867543,O,0,O
61598,7990934,N,4,B


In [17]:
df=df[df['ipcr_section'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])]

In [18]:
# df.iloc[:, 1:-1].to_csv(dst)
df.set_index('id').to_csv(dst)