In [13]:
##############################################################################################
##                                                                                          ##
##                      Python Code to Test Classifier for USPTO Data                        ##
##                                                                                          ##
##############################################################################################


import os
import pandas as pd
import numpy as np
import jieba
import jieba.analyse
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import stop_words

from nltk.stem import *

stemmer = PorterStemmer()

# Import Preprocessing #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

# Import Cross Val Libraries #
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2

# Import Classifiers #
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LassoCV, SGDClassifier, LinearRegression, LogisticRegression, RidgeCV, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC

import warnings

from tqdm import tqdm, tqdm_notebook

import pickle

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report

#Import spacy for STOP WORDS AND WORD2VEC
from spacy.lang.en import STOP_WORDS


#Immport Keras for Neural Networks
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras import optimizers
from keras.callbacks import EarlyStopping

#Import Word2vec library en_core_web_md, NLTK and lemamtizers
import en_core_web_md
import spacy
nlp = en_core_web_md.load()
import nltk
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# To silence the warning messages
warnings.filterwarnings("ignore")

# Set default number of non-AI training data points
NUM_OF_NON_AI_PATENTS_FOR_TRAINING = 1600

# Set how many folds for cross validation
NUM_OF_SPLITS = 10

from bert_embedding import BertEmbedding

In [14]:
#Reading the file

TrainingData= pd.read_csv("TData_Export_USPTO.csv")
TrainingData.groupby(["is_AI"]).agg({"abstract":"count"})

Unnamed: 0_level_0,abstract
is_AI,Unnamed: 1_level_1
0,2062
1,1050


In [15]:
#Checks for key word presence
keywords1 = ['Neural Network', 'Neural Networks', 'Artificial Intelligence', 'Machine Learning', 'Reinforcement Learning',
            'Machine Learning', 'Pattern Recognition', 'Bayes', 'Computer Vision', 'Language Processing',
            'Natural Language', 'Data Mining']

keywords2 = ['Neural Network', 'Artificial Intelligence', 'Machine Learning', 'Reinforcement Learning','Pattern Recognition', 'Bayes',
        'Computer Vision', 'Language Processing','Natural Language', 'Data Mining', 'image grammar', 'physical symbol system', 'symbolic error analysis',
        'robot', 'pattern recognition', 'image matching', 'machine intelligence',
        'logic theorist', 'symbolic reasoning', 'symbolic error analysis', 'supervised learning',
        'pattern analysis', 'deep learning', 'collaborative system', 'symbol processing',
        'crowdsourcing', 'human computation', 'sensor network', 'neuromorphic computing',
        'decision making', 'sensor data fusion', 'layered control systems',
        'image processing', 'convolution network', 'recommendation system', 'speech recognition']

keywords = "|".join(keywords1 + keywords2)
#keywords = 'Neural Network|Artificial Intelligence|Machine Learning|Reinforcement Learning|Machine Learning|Pattern Recognition|Bayes|Computer Vision|Language Processing|Natural Language|Data Mining'
keywords = keywords.lower()

TrainingData["AI_Keyword"] = np.where(TrainingData["abstract"].str.lower().str.contains(keywords),1,0)

TrainingData["AI_Keyword"].mean(), TrainingData["is_AI"].mean()

TrainingData['app_number'] = np.where(TrainingData['app_number'].notnull(), TrainingData['app_number'], TrainingData['id'] )
ids =  TrainingData["app_number"].values.tolist()

processed_content_list = TrainingData['abstract'].values.tolist()
labels = TrainingData["is_AI"]
keyword_labels = TrainingData["AI_Keyword"]

In [16]:
#Lemmatizing the data

%%time

def tokenize_lemmatize(x):
    
    s = ''
    for i in nltk.word_tokenize(x):
        i = i.lower()
        i = lemmatizer.lemmatize(i)
        s += i+ ' '

    return s.strip()
            
TrainingData['abstract'] = TrainingData['abstract'].apply(lambda x: tokenize_lemmatize(x))

CPU times: user 5.6 s, sys: 91 ms, total: 5.69 s
Wall time: 6 s


In [35]:
#Importing keras libraries for fitting an LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.wrappers.scikit_learn import KerasClassifier

In [36]:
#Selecting max features and limiting length of embeddings to maxlen
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(TrainingData["abstract"])
list_tokenized_train = tokenizer.texts_to_sequences(TrainingData["abstract"])

maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = keras.utils.to_categorical(labels)

#Building LSTM 

def create_network():
    model = Sequential()
    model.add(Embedding(max_features, 200))
    model.add(Bidirectional(LSTM(20, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(15, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(8, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(2, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

neural_network = KerasClassifier(build_fn=create_network, 
                                 epochs=10, 
                                 batch_size=256, 
                                 verbose=0)

#model.summary()
#model.fit(X_t,y, batch_size=256, epochs= 10, validation_split=0.2)

In [43]:
#LSTM with cross validation
scores = cross_val_score(neural_network, X_t, y, cv=10)
print("Accuracy of LSTM with 10-fold cross validation and 6000 feature BOW model is:", np.mean(scores))


Accuracy of LSTM with 10-fold cross validation and 6000 feature BOW model is: 0.7850873957085034


In [44]:
#BERT Implementation
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange

In [61]:
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    return np.array(all_tokens)

In [76]:
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
BATCH_SIZE = 32
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [77]:
X_bert = convert_lines(TrainingData["abstract"], MAX_SEQUENCE_LENGTH, tokenizer)


  0%|          | 0/3112 [00:00<?, ?it/s][A
  1%|          | 24/3112 [00:00<00:12, 238.21it/s][A
  2%|▏         | 54/3112 [00:00<00:12, 252.27it/s][A
  3%|▎         | 80/3112 [00:00<00:12, 252.43it/s][A
  3%|▎         | 108/3112 [00:00<00:11, 258.31it/s][A
  4%|▍         | 137/3112 [00:00<00:11, 267.03it/s][A
  5%|▌         | 160/3112 [00:00<00:13, 227.01it/s][A
  6%|▌         | 182/3112 [00:00<00:14, 203.91it/s][A
  7%|▋         | 209/3112 [00:00<00:13, 218.20it/s][A
  8%|▊         | 237/3112 [00:00<00:12, 232.83it/s][A
  8%|▊         | 261/3112 [00:01<00:12, 231.99it/s][A
  9%|▉         | 286/3112 [00:01<00:12, 235.12it/s][A
 10%|▉         | 310/3112 [00:01<00:12, 228.16it/s][A
 11%|█         | 338/3112 [00:01<00:11, 241.01it/s][A
 12%|█▏        | 366/3112 [00:01<00:10, 249.79it/s][A
 13%|█▎        | 397/3112 [00:01<00:10, 264.50it/s][A
 14%|█▎        | 424/3112 [00:01<00:10, 256.11it/s][A
 14%|█▍        | 450/3112 [00:01<00:10, 253.89it/s][A
 15%|█▌        | 478/31

In [87]:
#Available max features
data.max().max()

30135

In [100]:
max_features = 35000
maxlen = 130
X_bert = pad_sequences(X_bert, maxlen=maxlen)
y = keras.utils.to_categorical(labels)
 
def create_network():
    model = Sequential()
    model.add(Embedding(max_features, 200))
    model.add(Bidirectional(LSTM(20, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(15, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(8, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(2, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

neural_network = KerasClassifier(build_fn=create_network, 
                                 epochs=10, 
                                 batch_size=256, 
                                 verbose=0)

#model.summary()
#model.fit(X_t,y, batch_size=256, epochs= 10, validation_split=0.2)

In [101]:
scores = cross_val_score(neural_network, X_bert, y, cv=10)
print("Accuracy of LSTM with 10-fold cross validation and 35k feature BERT model is:", np.mean(scores))


Accuracy of LSTM with 10-fold cross validation and 35k feature BERT model is: 0.7516788296913194
