In [29]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
import pandas as pd


In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joyces\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joyces\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joyces\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
data = pd.read_pickle('./data/Patent_Dataset.pkl')

In [6]:
data

Unnamed: 0,patent_id,abstract,classifications_cpc,publication_date
0,US06841689-20050111,There is provided a process for the addition o...,[],2005-01-11
0,US06895631-20050524,A buffing pad that includes a substantially co...,[],2005-05-24
0,US06974769-20051213,Conductive structures in features of an insula...,[],2005-12-13
0,US07006202-20060228,A mask holder for irradiating UV-rays is discl...,"[G03B27/42, G03B27/58, G03B27/62]",2006-02-28
0,US07096910-20060829,A tyre for a vehicle wheel includes a carcass ...,"[B29D30/00, B60C9/00, B60C9/02]",2006-08-29
...,...,...,...,...
0,US09797302-20171024,An engine control system with a variable turbo...,"[F02B37/22, F02B37/24, F02D2200/501, F02D41/00...",2017-10-24
0,US09812826-20171107,A connector for electrically connecting with a...,"[H01R107/00, H01R13/631, H01R13/6581, H01R2107...",2017-11-07
0,US09852520-20171226,A method and apparatus are provided for implem...,"[G06F15/173, G06T9/00, H04L29/06, H04L65/602]",2017-12-26
0,US09895025-20180220,The motor unit ( 2 ) comprises a motor unit co...,"[A47J2043/04409, A47J2043/04427, A47J43/00, A4...",2018-02-20


In [7]:
sample_para = '''This is a small chunk of text. We will be implementing some of the preprocessing on it. Once we are happy, we will use similar approach on the larger body of text'''

In [8]:
# Sentence tokenization

sentence = nltk.sent_tokenize(sample_para)
sentence

['This is a small chunk of text.',
 'We will be implementing some of the preprocessing on it.',
 'Once we are happy, we will use similar approach on the larger body of text']

In [10]:
# Word tokenization

words = nltk.word_tokenize(sample_para)
words

['This',
 'is',
 'a',
 'small',
 'chunk',
 'of',
 'text',
 '.',
 'We',
 'will',
 'be',
 'implementing',
 'some',
 'of',
 'the',
 'preprocessing',
 'on',
 'it',
 '.',
 'Once',
 'we',
 'are',
 'happy',
 ',',
 'we',
 'will',
 'use',
 'similar',
 'approach',
 'on',
 'the',
 'larger',
 'body',
 'of',
 'text']

In [12]:
# Lowecasing the words (via list comprehension)

words = [word.lower() for word in words]
words

['this',
 'is',
 'a',
 'small',
 'chunk',
 'of',
 'text',
 '.',
 'we',
 'will',
 'be',
 'implementing',
 'some',
 'of',
 'the',
 'preprocessing',
 'on',
 'it',
 '.',
 'once',
 'we',
 'are',
 'happy',
 ',',
 'we',
 'will',
 'use',
 'similar',
 'approach',
 'on',
 'the',
 'larger',
 'body',
 'of',
 'text']

In [13]:
# Remove punctuation and numbers (can also do 'isalphanumeric' if you want to keep the numbers)

words = [word for word in words if word.isalpha()]
words

['this',
 'is',
 'a',
 'small',
 'chunk',
 'of',
 'text',
 'we',
 'will',
 'be',
 'implementing',
 'some',
 'of',
 'the',
 'preprocessing',
 'on',
 'it',
 'once',
 'we',
 'are',
 'happy',
 'we',
 'will',
 'use',
 'similar',
 'approach',
 'on',
 'the',
 'larger',
 'body',
 'of',
 'text']

In [24]:
# Stemming / Lemmatization

stem_words = ['eating', 'eats', 'computer', 'information', 'informative', 'ate']

In [25]:
# PoretStemmer was the origiinal stemming technique (snowballstemmer came later)

stemming = PorterStemmer()

In [26]:
for word in stem_words:
    print(word + '  ... >  ' + stemming.stem(word))

eating  ... >  eat
eats  ... >  eat
computer  ... >  comput
information  ... >  inform
informative  ... >  inform
ate  ... >  ate


In [30]:
snowball_stemmer = SnowballStemmer('english')


In [31]:
for word in stem_words:
    print(word + '.....    >  ' + snowball_stemmer.stem(word))

eating.....    >  eat
eats.....    >  eat
computer.....    >  comput
information.....    >  inform
informative.....    >  inform
ate.....    >  ate


In [27]:
lemmatizer = WordNetLemmatizer()

In [28]:
# pos argument specificies that it is treating them as verbs

for word in stem_words:
    print(word + '  ... >  ' + lemmatizer.lemmatize(word, pos='v'))

eating  ... >  eat
eats  ... >  eat
computer  ... >  computer
information  ... >  information
informative  ... >  informative
ate  ... >  eat


In [32]:
words

['this',
 'is',
 'a',
 'small',
 'chunk',
 'of',
 'text',
 'we',
 'will',
 'be',
 'implementing',
 'some',
 'of',
 'the',
 'preprocessing',
 'on',
 'it',
 'once',
 'we',
 'are',
 'happy',
 'we',
 'will',
 'use',
 'similar',
 'approach',
 'on',
 'the',
 'larger',
 'body',
 'of',
 'text']

In [33]:
lemmatizer = WordNetLemmatizer()

In [36]:
words = [lemmatizer.lemmatize(word, pos='v') for word in words]
words

['this',
 'be',
 'a',
 'small',
 'chunk',
 'of',
 'text',
 'we',
 'will',
 'be',
 'implement',
 'some',
 'of',
 'the',
 'preprocessing',
 'on',
 'it',
 'once',
 'we',
 'be',
 'happy',
 'we',
 'will',
 'use',
 'similar',
 'approach',
 'on',
 'the',
 'larger',
 'body',
 'of',
 'text']

In [43]:
# This is the list of stop words we are searching for

stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [45]:
# Remove stopwqords (should be done before punctuation really)

words = [word for word in words if word not in stop_words]
words

['small',
 'chunk',
 'text',
 'implement',
 'preprocessing',
 'happy',
 'use',
 'similar',
 'approach',
 'larger',
 'body',
 'text']

In [46]:
('  '.join(words))

'small  chunk  text  implement  preprocessing  happy  use  similar  approach  larger  body  text'

In [55]:
# Implement into patent dataset

abstract_tokens = []
for abstract in data['abstract']:
    abstract_tokens.append(word_tokenize(abstract))

data['abstract_token']  = abstract_tokens
data

Unnamed: 0,patent_id,abstract,classifications_cpc,publication_date,abstract_token
0,US06841689-20050111,There is provided a process for the addition o...,[],2005-01-11,"[There, is, provided, a, process, for, the, ad..."
0,US06895631-20050524,A buffing pad that includes a substantially co...,[],2005-05-24,"[A, buffing, pad, that, includes, a, substanti..."
0,US06974769-20051213,Conductive structures in features of an insula...,[],2005-12-13,"[Conductive, structures, in, features, of, an,..."
0,US07006202-20060228,A mask holder for irradiating UV-rays is discl...,"[G03B27/42, G03B27/58, G03B27/62]",2006-02-28,"[A, mask, holder, for, irradiating, UV-rays, i..."
0,US07096910-20060829,A tyre for a vehicle wheel includes a carcass ...,"[B29D30/00, B60C9/00, B60C9/02]",2006-08-29,"[A, tyre, for, a, vehicle, wheel, includes, a,..."
...,...,...,...,...,...
0,US09797302-20171024,An engine control system with a variable turbo...,"[F02B37/22, F02B37/24, F02D2200/501, F02D41/00...",2017-10-24,"[An, engine, control, system, with, a, variabl..."
0,US09812826-20171107,A connector for electrically connecting with a...,"[H01R107/00, H01R13/631, H01R13/6581, H01R2107...",2017-11-07,"[A, connector, for, electrically, connecting, ..."
0,US09852520-20171226,A method and apparatus are provided for implem...,"[G06F15/173, G06T9/00, H04L29/06, H04L65/602]",2017-12-26,"[A, method, and, apparatus, are, provided, for..."
0,US09895025-20180220,The motor unit ( 2 ) comprises a motor unit co...,"[A47J2043/04409, A47J2043/04427, A47J43/00, A4...",2018-02-20,"[The, motor, unit, (, 2, ), comprises, a, moto..."


In [57]:
# Lowercasing

lower_token = []

for tokens in data['abstract_token']:
    lower_token.append([token.lower() for token in tokens])

data['abstract_token'] = lower_token
data

Unnamed: 0,patent_id,abstract,classifications_cpc,publication_date,abstract_token
0,US06841689-20050111,There is provided a process for the addition o...,[],2005-01-11,"[there, is, provided, a, process, for, the, ad..."
0,US06895631-20050524,A buffing pad that includes a substantially co...,[],2005-05-24,"[a, buffing, pad, that, includes, a, substanti..."
0,US06974769-20051213,Conductive structures in features of an insula...,[],2005-12-13,"[conductive, structures, in, features, of, an,..."
0,US07006202-20060228,A mask holder for irradiating UV-rays is discl...,"[G03B27/42, G03B27/58, G03B27/62]",2006-02-28,"[a, mask, holder, for, irradiating, uv-rays, i..."
0,US07096910-20060829,A tyre for a vehicle wheel includes a carcass ...,"[B29D30/00, B60C9/00, B60C9/02]",2006-08-29,"[a, tyre, for, a, vehicle, wheel, includes, a,..."
...,...,...,...,...,...
0,US09797302-20171024,An engine control system with a variable turbo...,"[F02B37/22, F02B37/24, F02D2200/501, F02D41/00...",2017-10-24,"[an, engine, control, system, with, a, variabl..."
0,US09812826-20171107,A connector for electrically connecting with a...,"[H01R107/00, H01R13/631, H01R13/6581, H01R2107...",2017-11-07,"[a, connector, for, electrically, connecting, ..."
0,US09852520-20171226,A method and apparatus are provided for implem...,"[G06F15/173, G06T9/00, H04L29/06, H04L65/602]",2017-12-26,"[a, method, and, apparatus, are, provided, for..."
0,US09895025-20180220,The motor unit ( 2 ) comprises a motor unit co...,"[A47J2043/04409, A47J2043/04427, A47J43/00, A4...",2018-02-20,"[the, motor, unit, (, 2, ), comprises, a, moto..."


In [64]:
# Remove stop words

stop_words_removed = []

for tokens in data['abstract_token']:
    stop_words_removed.append([token for token in tokens if token not in stop_words])


data['abstract_token'] = stop_words_removed

data['abstract_token'].iloc[0]


['provided',
 'process',
 'addition',
 'nucleophile',
 'across',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'solvent',
 ':',
 ')',
 'nucleophile',
 ';',
 'ii',
 ')',
 'compound',
 'comprising',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'iii',
 ')',
 'catalyst',
 'comprising',
 'soluble',
 'polymer',
 'polyamino',
 'acid',
 '.']

In [66]:
# Remove punctuation and numbers
 
removed_numb_punct = []

for tokens in data['abstract_token']:
    removed_numb_punct.append([token for token in tokens if token.isalpha()])

data['abstract_token'] = removed_numb_punct

data['abstract_token'].iloc[0]


['provided',
 'process',
 'addition',
 'nucleophile',
 'across',
 'electron',
 'poor',
 'double',
 'bond',
 'michael',
 'addition',
 'comprising',
 'contacting',
 'solvent',
 'nucleophile',
 'ii',
 'compound',
 'comprising',
 'electron',
 'poor',
 'double',
 'bond',
 'iii',
 'catalyst',
 'comprising',
 'soluble',
 'polymer',
 'polyamino',
 'acid']

In [67]:
# Lemmatizer

lemmatizer = WordNetLemmatizer()

In [70]:
# Lemmatize the tokens
words_lemmatized = []

for token_list in data['abstract_token']:
    lemmatized_list = [lemmatizer.lemmatize(token, pos='v') for token in token_list]
    words_lemmatized.append(lemmatized_list)

# Update the DataFrame
data['abstract_token'] = words_lemmatized

# Print the first element
print(data['abstract_token'].iloc[0])  # Using .iloc[0] if data['abstract_token'] is a Series

['provide', 'process', 'addition', 'nucleophile', 'across', 'electron', 'poor', 'double', 'bond', 'michael', 'addition', 'comprise', 'contact', 'solvent', 'nucleophile', 'ii', 'compound', 'comprise', 'electron', 'poor', 'double', 'bond', 'iii', 'catalyst', 'comprise', 'soluble', 'polymer', 'polyamino', 'acid']


In [72]:
('  '.join(data(words_lemmatized)))

TypeError: 'DataFrame' object is not callable