# Deep Learning based Pipeline with Single Input for Multi-class Patent Classification
In this nootebook,  we used the full text of most important sections in patent  (title, abstract, technical fields, background, summary, and independent claim) as one input to the deep learning model.

In [1]:
import pandas as pd

df = pd.read_csv("../datasets/allITPatTextWith_Metadata.csv",  encoding = "ISO-8859-1", error_bad_lines=False)
df.columns =['ID','TI','AB','TECHF','BACKG','SUMM','CLMS','ICM','AY','IPC','REF','PA','INV']

df.dropna(subset=['ICM'], inplace=True)
df.fillna(value='', inplace=True)

df['TEXT'] = df['TI'] +'. '+ df['AB'] +'. '+ df['TECHF']+'. '+ df['BACKG']+'. '+ df['SUMM']+'. '+ df['CLMS']


df.fillna(value='', inplace=True)

df.TEXT.head()

0    Recognition code, particularly for a disk-shap...
1    Optical pickup apparatus for recording/reprodu...
2    Large capacity data sales mediation system, se...
3    BRIDGE FOR A CLIENT-SERVER ENVIRONMENT. A soft...
4    PROCESSOR HAVING SECTIONS OPERATING AT DIFFERE...
Name: TEXT, dtype: object

In [2]:
%%time
#preprocess of list fields
#convert all IPCs in df into one list
def toList(s):
    """
    this method is to convert the list of IPCs in each row from a string to a python List
    """
    s  = s.translate ({ord(c): " " for c in "[]"})
    ss= []
    for cls in s.strip().split(','):
        ss.append(cls.strip())
    return ss

#apply toList method on all rows in the DF
df['PA'] = df['PA'].map(lambda pa :   toList(pa))
df['INV'] = df['INV'].map(lambda inv :   toList(inv))

df.head()  

CPU times: user 2.53 s, sys: 56.8 ms, total: 2.58 s
Wall time: 2.59 s


In [3]:
%%time
#also preprocess of list fields
def metadataPreprocessing(input):
    newInput=' '
    for item in input:
        item = item.translate ({ord(c): " " for c in "!@#$%^&*()'[]{};:,./<>?\|`~°=\"+"})
        itms=' '
        for itm in item.split():
            itms= itms +' '+itm.strip()
        newInput = newInput + ' '+ itms.strip().replace(' ','_')
    return newInput.strip()

df['PA'] = df['PA'].map(lambda pa :   metadataPreprocessing(pa))
df['INV'] = df['INV'].map(lambda inv :   metadataPreprocessing(inv))

df.head()

CPU times: user 5.49 s, sys: 22 ms, total: 5.51 s
Wall time: 5.52 s


In [4]:
#preprocessing 
standardStopwordFile = "sources/stopwords/stopwords-all.txt"
#generalWordsFile = "sources/Clariant/generalWords.txt"

#loading terms from a file to a set
def get_terms_from_file(filePath):
    terms = set(line.strip() for line in open(filePath))
    return terms

#remove undiserd terms
def remove_terms(termSet, phrase):
    newPhrase = ""
    for term in phrase.split():
        if term.strip() not in termSet and len(term.strip())>2:
            newPhrase = newPhrase + " " + term.strip()



def clean_texts(doc):
    #Remove punctuation from texts
    doc = doc.translate ({ord(c): ' ' for c in "0123456789!@#$%^&*()'/[]{};:,./<>?\|`~°=\"+"})
    # split into tokens by white space
    tokens = doc.lower().strip().split()
    
    # filter out stop words
    stop_words = get_terms_from_file(standardStopwordFile)
    #generalStopwords = get_terms_from_file(generalWordsFile)

    
    tokens = [w.strip('-')  for w in tokens if  w not in stop_words ]
    # filter out short and long  tokens
    output = [word for word in tokens if len(word.strip()) > 2 and len(word) < 30 ]
    output = " ".join(output)
    #apply stemming
    #output = stem_text(output)
    return output



In [5]:
%%time
apply simple preprocessing on text
df['TI'] = df['TI'].map(lambda line : clean_texts(line))
df['AB'] = df['AB'].map(lambda line : clean_texts(line))
df['TECHF'] = df['TECHF'].map(lambda line : clean_texts(line))
df['BACKG'] = df['BACKG'].map(lambda line : clean_texts(line))
df['SUMM'] = df['SUMM'].map(lambda line : clean_texts(line))
df['CLMS'] = df['CLMS'].map(lambda line : clean_texts(line))


df.head()

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.54 µs


In [6]:
#process the ICM codes and #related-patents
df['ICM'] = df['ICM'].map(lambda icmCode : icmCode[:4])  

df_ICMs = df.groupby(['ICM'])
df_ICMs = df_ICMs.size().reset_index(name='Docs')

print(len(df_ICMs.ICM.unique()))
#filter out the rows with #docs less than N documents
df_ICMOut =  df_ICMs[df_ICMs['Docs'] >= 500]

#filter out rows of the original dataframe df accordding to df_ICMOut
ICMList = df_ICMOut['ICM'].tolist()
df = df[df.ICM.isin(ICMList)]

icmCount = df_ICMs.count().tolist()[0]

print( 'number of remaining documents in the dataset is: ',len(df))

print('Number of unique labels is: ', len(df.ICM.unique()))

581
number of remaining documents in the dataset is:  403726
Number of unique labels is:  42


In [7]:
#preprocess all documents
#df['TEXT'] = df['TEXT'].map(lambda line : clean_texts(line))
from sklearn.utils import shuffle

df = shuffle(df)

df.head()

Unnamed: 0,ID,TI,AB,TECHF,BACKG,SUMM,CLMS,ICM,AY,IPC,REF,PA,INV,TEXT
365576,PCT1995008810-0,auditing,auditing interface means steps providing audit...,auditing auditing auditing enables create proc...,businesses industries legislative regulatory q...,ofauditing interface means steps providing aud...,auditing interface means steps providing audit...,G06F,1994,"[G06F017-40, G06F017-60, G06Q0030-00, G09B0007...",,,WRIGHT_GREGORY_ALLAN,auditing. auditing interface means steps provi...
307355,PCT2011151924-0,,executing predetermined processed preset key p...,,predetermined target executing predetermined c...,tech mentioned ipsec enciphering encryption ke...,key advance executing predetermined timing pre...,H04L,2010,"[H04L0009-16, H04L0009-08]",,,MIZUMAKI_Masayoshi,. executing predetermined processed preset key...
131569,PCT2008097717-0,non-imaging light collector electro-optical sc...,extended working range electro-optical scanner...,concerns electro-optical scanner reading bar c...,laser bar reader scanner reading bar decoding ...,concerns extended range electro-optical scanne...,,G06K,2008,"[G06K0007-10, G06K0007-14]",,SYMBOL_TECHNOLOGIES_INC_BARKAN_Edward_D_DRZYMA...,BARKAN_Edward_D_DRZYMALA_Mark,non-imaging light collector electro-optical sc...
55896,PCT2002080552-0,virtual personalized channel,management creates personalized channel end-us...,virtual personalized channelfield personalizin...,artphilips electronics markets personal video ...,pvr decribed lets watch live programs recorded...,management creating personalized channel end-u...,H04N,2002,"[H04N007-173, G11B0027-00, H04L0012-28, H04N00...",,KONINKLIJKE_PHILIPS_ELECTRONICS_N_V,VAN_EE_Jan,virtual personalized channel. management creat...
59881,PCT2006070048-0,limiting traffic communications systems,limiting traffic communications based monitori...,limiting traffic communications exclusively li...,enables entities equipment nodes comprise voic...,provided limiting traffic communications monit...,,G06F,2005,"[G06F0011-00, H04L0012-26, H04L0029-06, H04L00...",,NOKIA_CORPORATION_WANG_Hao_KAHADUWE_Ajit,WANG_Hao_KAHADUWE_Ajit,limiting traffic communications systems. limit...


In [8]:
# lets take n% data as training and remaining m% for test.
train_size = int(len(df) * .9)

train_TI = df['TEXT'][:train_size]
train_ICM= df['ICM'][:train_size]
train_ID= df['ID'][:train_size]

test_TI = df['TEXT'][train_size:]
test_ICM = df['ICM'][train_size:]
test_ID = df['ID'][train_size:]


#metadata
train_pa_series = df['PA'][:train_size]
test_pa_series = df['PA'][train_size:]

train_inv_series = df['INV'][:train_size]
test_inv_series = df['INV'][train_size:]


print(train_TI.shape)
print(test_TI.shape)

#free up some memory space
#df.iloc[0:0]

(363353,)
(40373,)


In [9]:
#preparing text documents and labels for deep learning


from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from sklearn.preprocessing import LabelBinarizer




Using TensorFlow backend.


In [10]:
#PA
pa_inv_vocab_size = 2000
pa_tokenizer = Tokenizer(num_words=pa_inv_vocab_size,  filters='!"#$%&()*+,./:;<=>?@[\]^`{|}~', lower=True, split=' ', char_level=False, oov_token=None)
pa_tokenizer.fit_on_texts(train_pa_series)
train_pa_one_hot =pa_tokenizer.texts_to_matrix(train_pa_series)
test_pa_one_hot =pa_tokenizer.texts_to_matrix(test_pa_series)


#INV
inv_tokenizer = Tokenizer(num_words=pa_inv_vocab_size,  filters='!"#$%&()*+,./:;<=>?@[\]^`{|}~', lower=True, split=' ', char_level=False, oov_token=None)
inv_tokenizer.fit_on_texts(train_inv_series)
train_inv_one_hot =inv_tokenizer.texts_to_matrix(train_inv_series)
test_inv_one_hot =inv_tokenizer.texts_to_matrix(test_inv_series)


print('Found %s words in PA' % len(pa_tokenizer.word_index))
print('Found %s words in INV' % len(inv_tokenizer.word_index))

Found 97672 words in PA
Found 280522 words in INV


In [11]:
%%time

#Title
TI_tokenizer = Tokenizer(num_words=50000,  filters='!"#$%&()*+,./:;<=>?@[\]^`{|}~_', lower=True, split=' ', char_level=False, oov_token=None)
TI_tokenizer.fit_on_texts(train_TI)
encoded_train_TI = TI_tokenizer.texts_to_sequences(train_TI)
encoded_test_TI = TI_tokenizer.texts_to_sequences(test_TI)
#convert all sequences in a list into the same length
TI_train = pad_sequences(encoded_train_TI,  maxlen=100, padding='post')
TI_test = pad_sequences(encoded_test_TI,  maxlen=100, padding='post')


CPU times: user 6min 28s, sys: 3.59 s, total: 6min 31s
Wall time: 6min 32s


In [12]:
%%time
# representing the labels/classes in the numeric format by scikit-learn - LabelBinarizer class
# Convert 1-dimensional class arrays to n-dimensional(#classes) class matrices
encoder = LabelBinarizer()
encoder.fit(train_ICM)
y_train = encoder.transform(train_ICM)
y_test = encoder.transform(test_ICM)

#get the unique number of labels in the training set
classesList = train_ICM.tolist()
classesList =set(classesList)
num_classes = len(classesList)

CPU times: user 4 s, sys: 101 ms, total: 4.1 s
Wall time: 4.11 s


In [13]:
import numpy as np

def load_embedding_model(filePath):
    embeddings_index = dict()
    f = open(filePath, encoding='utf8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
    return embeddings_index

def create_embedding_matrix(tokenizer, embeddings_index, vocab_size_embbs, dim_size):
    embeddings_matrix = np.zeros((vocab_size_embbs, dim_size))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector[0:dim_size]
    
    return embeddings_matrix


In [14]:
%%time
## load the whole embedding into memory and get matrix
embedding_index = load_embedding_model('../models/w2v/phrase/patWordPhrase2VecModel.txt')


CPU times: user 1min 29s, sys: 3.33 s, total: 1min 32s
Wall time: 1min 32s


In [15]:
%%time

#create TITLE embedding Matrix
#vocab_size for embedding
vocab_size_embb = len(TI_tokenizer.word_index) + 1

TI_embeddings_matrix = create_embedding_matrix(TI_tokenizer,
                                              embedding_index,
                                              vocab_size_embb,
                                              100)




CPU times: user 1.11 s, sys: 758 ms, total: 1.87 s
Wall time: 1.87 s


In [16]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Embedding, BatchNormalization, ELU, Concatenate
from keras.layers import LSTM, Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras.layers.core import Dropout



In [17]:
%%time
#TITLE 
sequence_len =100
dropout_pct =  0.4

TI_embedding_layer_input = Input(shape=(sequence_len,), name='TI_embed_input')
TI_embedding_layer = Embedding(input_dim=len(TI_tokenizer.word_index) + 1,
                        output_dim=100, # Dimension of the dense embedding
                        weights=[TI_embeddings_matrix],
                        input_length=100)(TI_embedding_layer_input)

lstm_size = 64
TI_deep = LSTM(lstm_size,
            dropout=dropout_pct,
            recurrent_dropout=dropout_pct,
            return_sequences=False,
            name='LSTM_TI')(TI_embedding_layer)

TI_deep = Dense(300, activation=None)(TI_deep)
TI_deep = Dropout(dropout_pct)(TI_deep)
TI_deep = BatchNormalization()(TI_deep)
TI_deep = ELU()(TI_deep)


CPU times: user 5.21 s, sys: 10.9 s, total: 16.1 s
Wall time: 6.99 s


In [18]:
dropout_pct =  0.4
pa_input = Input(shape=(train_pa_one_hot.shape[1],), name='pa_input') 
pas = Dense(32,input_dim=train_pa_one_hot.shape[1], activation=None)(pa_input) 
pas = Dropout(dropout_pct)(pas)
pas = BatchNormalization()(pas)
pas = ELU()(pas)

#inv
inv_input = Input(shape=(train_inv_one_hot.shape[1],), name='inv_input') 
invs = Dense(32,input_dim=train_inv_one_hot.shape[1], activation=None)(pa_input) 
invs = Dropout(dropout_pct)(invs)
invs = BatchNormalization()(invs)

print('pa_input and inv_input layers are finished')

pa_input and inv_input layers are finished


In [19]:
import keras_metrics as km
#contacting two input models
#model_inputs_to_concat = [TI_deep, AB_deep, TECHF_deep, BACKG_deep, SUMM_deep, CLMS_deep] #invs , pas, invs
#final_layer =  Concatenate(name='concatenated_layer')(model_inputs_to_concat)

output = Dense(128, activation=None)(TI_deep)
output = Dropout(dropout_pct)(output)
output = BatchNormalization()(output)
output = ELU()(output)
output = Dense(num_classes, activation='softmax')(output)

model = Model(inputs=[TI_embedding_layer_input
                     ],
              outputs=output, name='model')
model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy', km.categorical_precision(), km.categorical_recall()])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
TI_embed_input (InputLayer)  (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          129596000 
_________________________________________________________________
LSTM_TI (LSTM)               (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 300)               19500     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
elu_1 (ELU)                  (None, 300)               0         
__________

In [20]:
%%time
batch_size= 500 
num_epochs = 20

history = model.fit(x={'TI_embed_input': TI_train
             
            },
          y=y_train,
          batch_size=batch_size,
          epochs=num_epochs,
          validation_data=
          ({'TI_embed_input': TI_test
            
            },
           y_test))


  num_elements)


Train on 363353 samples, validate on 40373 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 1h 1s, sys: 8min 29s, total: 1h 8min 31s
Wall time: 38min 41s


In [21]:
from sklearn.datasets import make_circles
from keras.models import Sequential
from keras.layers import Dense
from matplotlib import pyplot

In [22]:
import keras_metrics as km

In [23]:
import keras_metrics as km

#contacting two input models
model_inputs_to_concat = [TI_deep, pas, invs] #invs , pas, invs
final_layer =  Concatenate(name='concatenated_layer')(model_inputs_to_concat)

output = Dense(128, activation=None)(final_layer)
output = Dropout(dropout_pct)(output)
output = BatchNormalization()(output)
output = ELU()(output)
output = Dense(num_classes, activation='softmax')(output)

model2 =Model(inputs=[ TI_embedding_layer_input,
                     pa_input,
                      inv_input],
              outputs=output, name='model')
model2.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                       metrics=['accuracy', km.categorical_precision(), km.categorical_recall()])
model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
TI_embed_input (InputLayer)     (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 100)     129596000   TI_embed_input[0][0]             
__________________________________________________________________________________________________
LSTM_TI (LSTM)                  (None, 64)           42240       embedding_1[0][0]                
__________________________________________________________________________________________________
pa_input (InputLayer)           (None, 2000)         0                                            
__________________________________________________________________________________________________
dense_1 (D

In [24]:
%%time
batch_size= 500 
num_epochs = 20


history2 = model2.fit(x={'TI_embed_input': TI_train,
             'pa_input': train_pa_one_hot,
             'inv_input': train_inv_one_hot
            },
          y=y_train,
          batch_size=batch_size,
          epochs=num_epochs,
          validation_data=
          ({'TI_embed_input': TI_test,
            'pa_input': test_pa_one_hot,
            'inv_input': test_inv_one_hot
            },
           y_test))

  num_elements)


Train on 363353 samples, validate on 40373 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 1h 2min 45s, sys: 8min 33s, total: 1h 11min 19s
Wall time: 41min 6s
