Import Libraries

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
import string
string.punctuation

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

Preprocessing

In [3]:
# read data

data = pd.read_json('patent.json', orient='records')
data = data[["title", "abstract_localized.text"]]
data.head()

ValueError: Trailing data

In [31]:
# remove missing value

data.dropna(inplace=True)
data.head()

Unnamed: 0,title,abstract_localized.text
0,Fire detection system based on artificial inte...,The present disclosure relates to a fire detec...
2,Embedding artificial intelligence for balancin...,Responsive to a CPU load of a specific access ...
4,Optimization techniques for artificial intelli...,"Methods, apparatuses and computer readable med..."
6,Method to create a secure distributed data val...,Methods and apparatus for validating paper for...
8,Virtual game assistant based on artificial int...,Virtual game dealers based on artificial intel...


In [32]:
# remove punctuation

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

data['clean_msg'] = data['title'].apply(lambda x:remove_punctuation(str(x)))
data['clean_msg'].head()

0    Fire detection system based on artificial inte...
2    Embedding artificial intelligence for balancin...
4    Optimization techniques for artificial intelli...
6    Method to create a secure distributed data val...
8    Virtual game assistant based on artificial int...
Name: clean_msg, dtype: object

In [33]:
# lowering text

data['msg_lower']= data['clean_msg'].apply(lambda x: x.lower())
data['msg_lower'].head()

0    fire detection system based on artificial inte...
2    embedding artificial intelligence for balancin...
4    optimization techniques for artificial intelli...
6    method to create a secure distributed data val...
8    virtual game assistant based on artificial int...
Name: msg_lower, dtype: object

In [34]:
# tokenization

data['msg_tokenized'] = data['msg_lower'].apply(nltk.word_tokenize)
data['msg_tokenized'].head

<bound method NDFrame.head of 0      [fire, detection, system, based, on, artificia...
2      [embedding, artificial, intelligence, for, bal...
4      [optimization, techniques, for, artificial, in...
6      [method, to, create, a, secure, distributed, d...
8      [virtual, game, assistant, based, on, artifici...
                             ...                        
490    [systems, and, methods, for, artificial, intel...
492    [wearable, computing, apparatus, for, augmente...
494    [system, and, method, of, providing, and, reco...
496    [method, and, server, for, optimizing, hyperpa...
498    [fallback, artificial, intelligence, system, f...
Name: msg_tokenized, Length: 266, dtype: object>

In [35]:
# remove stopwords

stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

data['no_stopwords']= data['msg_tokenized'].apply(lambda x:remove_stopwords(x))
data['no_stopwords'].head

<bound method NDFrame.head of 0      [fire, detection, system, based, artificial, i...
2      [embedding, artificial, intelligence, balancin...
4      [optimization, techniques, artificial, intelli...
6      [method, create, secure, distributed, data, va...
8      [virtual, game, assistant, based, artificial, ...
                             ...                        
490    [systems, methods, artificial, intelligencegui...
492    [wearable, computing, apparatus, augmented, re...
494    [system, method, providing, recording, persona...
496    [method, server, optimizing, hyperparameter, t...
498    [fallback, artificial, intelligence, system, r...
Name: no_stopwords, Length: 266, dtype: object>

POS Tag

In [36]:
data['tagged'] = data['no_stopwords'].apply(nltk.pos_tag)
data['tagged'].head()

0    [(fire, NN), (detection, NN), (system, NN), (b...
2    [(embedding, VBG), (artificial, JJ), (intellig...
4    [(optimization, NN), (techniques, NNS), (artif...
6    [(method, NN), (create, NN), (secure, NN), (di...
8    [(virtual, JJ), (game, NN), (assistant, NN), (...
Name: tagged, dtype: object

POS Chunk

In [37]:
# lemmatization

lemmatizer = WordNetLemmatizer()

def lemmatize_word(word, tag):
    if tag.startswith('V'):
        return lemmatizer.lemmatize(word, 'v')  # Verb
    elif tag.startswith('N'):
        return lemmatizer.lemmatize(word, 'n')  # Noun
    else:
        return word

In [38]:
# pos chunk

def filter_verb_noun_pairs(tagged_sentence):
    verb_noun_pairs = []
    for i in range(len(tagged_sentence) - 1):
        word, tag = tagged_sentence[i]
        next_word, next_tag = tagged_sentence[i + 1]
        if tag.startswith('VB') and next_tag.startswith('NN'):
            verb = lemmatize_word(word, tag)
            noun = lemmatize_word(next_word, next_tag)
            verb_noun_pairs.append((verb, noun))
    return verb_noun_pairs

data['verb_noun_pairs'] = data['tagged'].apply(filter_verb_noun_pairs)
data['verb_noun_pairs'].head()

0                                                   []
2    [(balance, access), (process, load), (enable, ...
4                                                   []
6          [(distribute, data), (distribute, storage)]
8                                                   []
Name: verb_noun_pairs, dtype: object

In [39]:
# hasile

patent_task = data[["title", "verb_noun_pairs"]]
patent_task.to_excel('patent_task.xlsx', index=False)
patent_task

Unnamed: 0,title,verb_noun_pairs
0,Fire detection system based on artificial inte...,[]
2,Embedding artificial intelligence for balancin...,"[(balance, access), (process, load), (enable, ..."
4,Optimization techniques for artificial intelli...,[]
6,Method to create a secure distributed data val...,"[(distribute, data), (distribute, storage)]"
8,Virtual game assistant based on artificial int...,[]
...,...,...
490,Systems and methods for artificial intelligenc...,[]
492,Wearable computing apparatus for augmented rea...,"[(compute, apparatus), (augment, reality), (re..."
494,System and method of providing and recording p...,[]
496,Method and server for optimizing hyperparamete...,"[(optimize, hyperparameter)]"
