In [15]:
import nltk
nltk.data.path.append("../local_packages/nltk_data")
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re

In [4]:
# Load the patent DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
patents = pd.read_pickle('./data/Patent_Dataset.pkl')

In [5]:
# our index is incorrect, quickly reset it
# (drop=True discards the old index instead of keeping it as a column;
# later cells look rows up by label, e.g. [0] and [21])
patents = patents.reset_index(drop=True)

patents.head()

Unnamed: 0,patent_id,abstract,classifications_cpc,publication_date
0,US06841689-20050111,There is provided a process for the addition o...,[],2005-01-11
1,US06895631-20050524,A buffing pad that includes a substantially co...,[],2005-05-24
2,US06974769-20051213,Conductive structures in features of an insula...,[],2005-12-13
3,US07006202-20060228,A mask holder for irradiating UV-rays is discl...,"[G03B27/42, G03B27/58, G03B27/62]",2006-02-28
4,US07096910-20060829,A tyre for a vehicle wheel includes a carcass ...,"[B29D30/00, B60C9/00, B60C9/02]",2006-08-29


In [6]:
# Look at the raw abstract of the first patent (row label 0, column
# "abstract") before any lowercasing is applied
patents.loc[0, "abstract"]

'There is provided a process for the addition of a nucleophile across an electron poor carbon-carbon double bond (a Michael addition) comprising contacting in a solvent: i) a nucleophile; ii) a compound comprising an electron poor double bond; and iii) a catalyst comprising a soluble polymer and a polyamino acid.'

In [7]:
# Lowercase every abstract with the vectorised pandas string accessor
patents["abstract_lower"] = patents["abstract"].str.lower()

# The same first row, now lowercased
patents.loc[0, "abstract_lower"]

'there is provided a process for the addition of a nucleophile across an electron poor carbon-carbon double bond (a michael addition) comprising contacting in a solvent: i) a nucleophile; ii) a compound comprising an electron poor double bond; and iii) a catalyst comprising a soluble polymer and a polyamino acid.'

In [8]:
# display the standard string punctuation
# (string.punctuation is a plain str holding the punctuation characters)
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
# Build a regular expression (covered in the previous chapter) that matches
# any one of the punctuation characters above: wrapping them in [...] turns
# the whole string into a single character class
"[" + string.punctuation + "]"

'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [11]:
# Below is a function that uses regex to remove punctuation from strings
def remove_punct(ptext):
    """Return ``ptext`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so "carbon-carbon"
    becomes "carboncarbon". Only the ASCII characters in
    ``string.punctuation`` are affected; symbols such as "×" are kept.

    Args:
        ptext: The text to clean (a ``str``).

    Returns:
        A new string without punctuation characters.
    """
    # re.escape neutralises the characters that are special inside a character
    # class (backslash, "]", "^", "-"), so the pattern no longer depends on the
    # punctuation happening to form a valid class when interpolated raw.
    pattern = "[{}]".format(re.escape(string.punctuation))
    # replace any punctuation with nothing "", effectively removing it
    return re.sub(pattern, "", ptext)

# A function that cleans one piece of text can be applied to a whole
# pandas column, one row at a time

# The first abstract as it looks before punctuation is removed
patents.loc[0, "abstract"]

'There is provided a process for the addition of a nucleophile across an electron poor carbon-carbon double bond (a Michael addition) comprising contacting in a solvent: i) a nucleophile; ii) a compound comprising an electron poor double bond; and iii) a catalyst comprising a soluble polymer and a polyamino acid.'

In [12]:
# Strip punctuation from every abstract
patents["abstract_no_punct"] = patents["abstract"].apply(remove_punct)

patents.loc[0, "abstract_no_punct"]

'There is provided a process for the addition of a nucleophile across an electron poor carboncarbon double bond a Michael addition comprising contacting in a solvent i a nucleophile ii a compound comprising an electron poor double bond and iii a catalyst comprising a soluble polymer and a polyamino acid'

In [13]:
# Basic tokenization example
original_text = "This is the text given"

# Create tokens by splitting the text on each " " space
# (str.split returns a list of the pieces between the separators)
tokens = original_text.split(" ")

# Show the value and the type name before and after splitting
print("Original text:\n\t", original_text)
print("Original text 'type':\n\t", type(original_text).__name__)
print("Tokenized text:\n\t", tokens)
print("Tokenized text 'type':\n\t", type(tokens).__name__)

Original text:
	 This is the text given
Original text 'type':
	 str
Tokenized text:
	 ['This', 'is', 'the', 'text', 'given']
Tokenized text 'type':
	 list


In [17]:
# Download the 'punkt' tokenizer data
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead;
# confirm against the installed NLTK version.
nltk.download('punkt')

# Now you can use the word_tokenize function
from nltk.tokenize import word_tokenize

# Call the tokenizer through the name imported above
nltk_tokens = word_tokenize(original_text)
nltk_tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joyces\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['This', 'is', 'the', 'text', 'given']

In [18]:
# Word-tokenize every abstract; each row of the new column holds a list
patents["abstract_tokens"] = patents["abstract"].apply(nltk.word_tokenize)

patents.loc[0, "abstract_tokens"]

['There',
 'is',
 'provided',
 'a',
 'process',
 'for',
 'the',
 'addition',
 'of',
 'a',
 'nucleophile',
 'across',
 'an',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'a',
 'Michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'in',
 'a',
 'solvent',
 ':',
 'i',
 ')',
 'a',
 'nucleophile',
 ';',
 'ii',
 ')',
 'a',
 'compound',
 'comprising',
 'an',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'and',
 'iii',
 ')',
 'a',
 'catalyst',
 'comprising',
 'a',
 'soluble',
 'polymer',
 'and',
 'a',
 'polyamino',
 'acid',
 '.']

In [19]:
# Split every abstract into sentences with NLTK's sentence tokenizer
patents["abstract_sentences"] = patents["abstract"].apply(nltk.sent_tokenize)

# text before sentence segmentation
patents.loc[0, "abstract"]

'There is provided a process for the addition of a nucleophile across an electron poor carbon-carbon double bond (a Michael addition) comprising contacting in a solvent: i) a nucleophile; ii) a compound comprising an electron poor double bond; and iii) a catalyst comprising a soluble polymer and a polyamino acid.'

In [20]:
# The first abstract after sentence segmentation (a list of sentences)
patents.loc[0, "abstract_sentences"]

['There is provided a process for the addition of a nucleophile across an electron poor carbon-carbon double bond (a Michael addition) comprising contacting in a solvent: i) a nucleophile; ii) a compound comprising an electron poor double bond; and iii) a catalyst comprising a soluble polymer and a polyamino acid.']

In [21]:
# The second abstract after sentence segmentation
patents.loc[1, "abstract_sentences"]

['A buffing pad that includes a substantially conventional buffing pad made of tufted wool that includes a backing plate and a central hub for attachment to the shaft of a rotary power buffer.',
 'The tufts or strands of wool are of a first and substantially equal length and of a substantially uniform color.',
 'The pad also carries a plurality of shorter tufts of wool or other fiber of a contrasting color.',
 'When the pad is relatively new, the shorter tufts are not normally visible as they are hidden by the longer tufts.',
 'As the pad becomes worn, however, the longer tufts become shorter and matted down thereby exposing the shorter tufts.',
 'The appearance of the contrasting color of the shorter tufts is an indicator that the pad is worn and should be replaced.']

In [22]:
# Using the PorterStemmer from nltk
# Example sentence whose tokens will be normalised in the next cells
words_to_normalise = "The Ones and twos argues who is winning in todays matches"

# generate tokens
tokens = nltk.word_tokenize(words_to_normalise)

# note we have not applied any other preprocessing to the text
# (no lowercasing, so capitalised tokens stay capitalised)
tokens

['The',
 'Ones',
 'and',
 'twos',
 'argues',
 'who',
 'is',
 'winning',
 'in',
 'todays',
 'matches']

In [23]:
# Apply the stemming to each token in the list; one PorterStemmer instance
# is shared by all tokens instead of constructing a new one per token
tokens_stemmed = list(map(PorterStemmer().stem, tokens))

# A list of stemmed words
print(tokens_stemmed)
# This has clearly applied some processing to the text, beyond chopping
# off an ending. This PorterStemmer has also lowercased the text

['the', 'one', 'and', 'two', 'argu', 'who', 'is', 'win', 'in', 'today', 'match']


In [24]:
## Applying stemming to the pandas data
# Define stemming function

def stemming(ptoken):
    """Stem each token in ``ptoken`` with a Porter stemmer.

    Args:
        ptoken: An iterable of string tokens.

    Returns:
        A new list with the stemmed form of each token, in the same order.
    """
    # One stemmer object is enough for the whole token list
    porter = PorterStemmer()
    stems = []
    for word in ptoken:
        stems.append(porter.stem(word))
    return stems

# The first abstract's tokens before any stemming
patents.loc[0, "abstract_tokens"]

['There',
 'is',
 'provided',
 'a',
 'process',
 'for',
 'the',
 'addition',
 'of',
 'a',
 'nucleophile',
 'across',
 'an',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'a',
 'Michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'in',
 'a',
 'solvent',
 ':',
 'i',
 ')',
 'a',
 'nucleophile',
 ';',
 'ii',
 ')',
 'a',
 'compound',
 'comprising',
 'an',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'and',
 'iii',
 ')',
 'a',
 'catalyst',
 'comprising',
 'a',
 'soluble',
 'polymer',
 'and',
 'a',
 'polyamino',
 'acid',
 '.']

In [25]:
# Stem the token list of every row
patents["abstract_tokens_stemmed"] = patents["abstract_tokens"].apply(stemming)

# tokens post stemming
patents.loc[0, "abstract_tokens_stemmed"]

['there',
 'is',
 'provid',
 'a',
 'process',
 'for',
 'the',
 'addit',
 'of',
 'a',
 'nucleophil',
 'across',
 'an',
 'electron',
 'poor',
 'carbon-carbon',
 'doubl',
 'bond',
 '(',
 'a',
 'michael',
 'addit',
 ')',
 'compris',
 'contact',
 'in',
 'a',
 'solvent',
 ':',
 'i',
 ')',
 'a',
 'nucleophil',
 ';',
 'ii',
 ')',
 'a',
 'compound',
 'compris',
 'an',
 'electron',
 'poor',
 'doubl',
 'bond',
 ';',
 'and',
 'iii',
 ')',
 'a',
 'catalyst',
 'compris',
 'a',
 'solubl',
 'polym',
 'and',
 'a',
 'polyamino',
 'acid',
 '.']

In [26]:
# Pair each original token of the first abstract with its stemmed form
list(
    zip(
        patents.loc[0, "abstract_tokens"],
        patents.loc[0, "abstract_tokens_stemmed"],
    )
)

[('There', 'there'),
 ('is', 'is'),
 ('provided', 'provid'),
 ('a', 'a'),
 ('process', 'process'),
 ('for', 'for'),
 ('the', 'the'),
 ('addition', 'addit'),
 ('of', 'of'),
 ('a', 'a'),
 ('nucleophile', 'nucleophil'),
 ('across', 'across'),
 ('an', 'an'),
 ('electron', 'electron'),
 ('poor', 'poor'),
 ('carbon-carbon', 'carbon-carbon'),
 ('double', 'doubl'),
 ('bond', 'bond'),
 ('(', '('),
 ('a', 'a'),
 ('Michael', 'michael'),
 ('addition', 'addit'),
 (')', ')'),
 ('comprising', 'compris'),
 ('contacting', 'contact'),
 ('in', 'in'),
 ('a', 'a'),
 ('solvent', 'solvent'),
 (':', ':'),
 ('i', 'i'),
 (')', ')'),
 ('a', 'a'),
 ('nucleophile', 'nucleophil'),
 (';', ';'),
 ('ii', 'ii'),
 (')', ')'),
 ('a', 'a'),
 ('compound', 'compound'),
 ('comprising', 'compris'),
 ('an', 'an'),
 ('electron', 'electron'),
 ('poor', 'poor'),
 ('double', 'doubl'),
 ('bond', 'bond'),
 (';', ';'),
 ('and', 'and'),
 ('iii', 'iii'),
 (')', ')'),
 ('a', 'a'),
 ('catalyst', 'catalyst'),
 ('comprising', 'compris'),
 

In [27]:
# Using the WordNetLemmatizer from nltk
# Without part-of-speech tagging it does not perform especially well,
# but where it does change a token the result is still a valid word

# Run tokenizer
tokens = nltk.word_tokenize(words_to_normalise)
print(words_to_normalise)
tokens

The Ones and twos argues who is winning in todays matches


['The',
 'Ones',
 'and',
 'twos',
 'argues',
 'who',
 'is',
 'winning',
 'in',
 'todays',
 'matches']

In [29]:
from nltk.stem import WordNetLemmatizer

# Download the 'wordnet' corpus
nltk.download('wordnet')

# Now you can use the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Reuse the single lemmatizer created above for every token rather than
# constructing a fresh WordNetLemmatizer inside the loop
tokens_lemmed = [lemmatizer.lemmatize(token) for token in tokens]
print(tokens_lemmed)
type(tokens_lemmed)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joyces\AppData\Roaming\nltk_data...


['The', 'Ones', 'and', 'two', 'argues', 'who', 'is', 'winning', 'in', 'today', 'match']


list

In [30]:
# Define the lemmatise() function

def lemmatise(ptokens):
    """Lemmatise each token in ``ptokens`` with NLTK's WordNet lemmatizer.

    The lemmatizer is called with its default settings (no part-of-speech
    argument is passed).

    Args:
        ptokens: An iterable of string tokens.

    Returns:
        A new list with the lemma of each token, in the same order.
    """
    # A single lemmatizer's bound method is mapped over all the tokens
    return list(map(WordNetLemmatizer().lemmatize, ptokens))

# The first abstract's tokens before lemmatisation
patents.loc[0, "abstract_tokens"]

['There',
 'is',
 'provided',
 'a',
 'process',
 'for',
 'the',
 'addition',
 'of',
 'a',
 'nucleophile',
 'across',
 'an',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'a',
 'Michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'in',
 'a',
 'solvent',
 ':',
 'i',
 ')',
 'a',
 'nucleophile',
 ';',
 'ii',
 ')',
 'a',
 'compound',
 'comprising',
 'an',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'and',
 'iii',
 ')',
 'a',
 'catalyst',
 'comprising',
 'a',
 'soluble',
 'polymer',
 'and',
 'a',
 'polyamino',
 'acid',
 '.']

In [31]:
# Lemmatise the token list of every row
patents["abstract_tokens_lemmatised"] = patents["abstract_tokens"].apply(lemmatise)

# tokens after lemmatization
patents.loc[0, "abstract_tokens_lemmatised"]

['There',
 'is',
 'provided',
 'a',
 'process',
 'for',
 'the',
 'addition',
 'of',
 'a',
 'nucleophile',
 'across',
 'an',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'a',
 'Michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'in',
 'a',
 'solvent',
 ':',
 'i',
 ')',
 'a',
 'nucleophile',
 ';',
 'ii',
 ')',
 'a',
 'compound',
 'comprising',
 'an',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'and',
 'iii',
 ')',
 'a',
 'catalyst',
 'comprising',
 'a',
 'soluble',
 'polymer',
 'and',
 'a',
 'polyamino',
 'acid',
 '.']

In [32]:
# Pair each original token of the first abstract with its lemmatised form
list(
    zip(
        patents.loc[0, "abstract_tokens"],
        patents.loc[0, "abstract_tokens_lemmatised"],
    )
)

[('There', 'There'),
 ('is', 'is'),
 ('provided', 'provided'),
 ('a', 'a'),
 ('process', 'process'),
 ('for', 'for'),
 ('the', 'the'),
 ('addition', 'addition'),
 ('of', 'of'),
 ('a', 'a'),
 ('nucleophile', 'nucleophile'),
 ('across', 'across'),
 ('an', 'an'),
 ('electron', 'electron'),
 ('poor', 'poor'),
 ('carbon-carbon', 'carbon-carbon'),
 ('double', 'double'),
 ('bond', 'bond'),
 ('(', '('),
 ('a', 'a'),
 ('Michael', 'Michael'),
 ('addition', 'addition'),
 (')', ')'),
 ('comprising', 'comprising'),
 ('contacting', 'contacting'),
 ('in', 'in'),
 ('a', 'a'),
 ('solvent', 'solvent'),
 (':', ':'),
 ('i', 'i'),
 (')', ')'),
 ('a', 'a'),
 ('nucleophile', 'nucleophile'),
 (';', ';'),
 ('ii', 'ii'),
 (')', ')'),
 ('a', 'a'),
 ('compound', 'compound'),
 ('comprising', 'comprising'),
 ('an', 'an'),
 ('electron', 'electron'),
 ('poor', 'poor'),
 ('double', 'double'),
 ('bond', 'bond'),
 (';', ';'),
 ('and', 'and'),
 ('iii', 'iii'),
 (')', ')'),
 ('a', 'a'),
 ('catalyst', 'catalyst'),
 ('compr

In [34]:
from nltk.corpus import stopwords

# Download the 'stopwords' corpus
nltk.download('stopwords')

# Display the basic stopwords given by nltk
# (the list is selected by language name; here the English one)
print(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joyces\AppData\Roaming\nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data]   Unzipping corpora\stopwords.zip.


In [35]:
# The same corpus also provides lists for other languages, e.g. French;
# converting to a set gives fast membership tests
stop_words_french = set(stopwords.words('french'))
print(stop_words_french)

{'et', 'un', 'seraient', 'avais', 'sommes', 'ai', 'me', 'aurions', 'aurai', 'eusses', 'ces', 'serons', 'à', 'ayante', 'serez', 'eux', 'ton', 'étaient', 'moi', 'sa', 'pas', 'étée', 'eu', 'étions', 'es', 'avons', 'seriez', 'aies', 'étante', 'par', 'avec', 'qui', 'eue', 'auriez', 'étés', 'serions', 'ayants', 'auras', 't', 'eussions', 'nos', 'ayons', 'eurent', 'ayantes', 'pour', 'étais', 'fusses', 'ma', 'est', 'eût', 'on', 'mais', 'qu', 'se', 'as', 'tes', 'était', 'soyez', 'serais', 'ils', 'l', 'eut', 'suis', 'une', 'ta', 'ne', 'étants', 'seront', 'furent', 'aie', 'eussent', 'aurons', 'fussions', 'aurait', 'fusse', 'son', 'soyons', 'te', 'd', 'fûmes', 'fut', 'ses', 'tu', 'avait', 's', 'votre', 'eus', 'avions', 'étées', 'que', 'le', 'eusse', 'eussiez', 'serait', 'avaient', 'les', 'ce', 'sois', 'fussiez', 'nous', 'c', 'serai', 'il', 'avez', 'ayant', 'aurez', 'toi', 'j', 'ayez', 'soient', 'même', 'vous', 'je', 'mon', 'notre', 'm', 'fussent', 'sur', 'du', 'elle', 'eûtes', 'aviez', 'mes', 'sera

In [36]:
# The French set was only a demonstration, so remove it from the namespace
del stop_words_french
# let's ensure we are using English stopwords for this project
stop_words = set(stopwords.words('english'))

In [37]:
def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, reading the corpus once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        # Remember the set on the function object so later calls skip the
        # corpus read entirely
        _english_stop_words._cache = cached
    return cached


# Define a function to remove stopwords from list of tokens
def clean_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stopwords, preserving their order.

    Matching is exact and case-sensitive: the NLTK list is lowercase, so a
    capitalised token such as "There" is kept unless the text was lowercased
    beforehand.

    Args:
        tokens: An iterable of string tokens.
        stop_words: Optional container of words to drop (anything supporting
            ``in``). Defaults to NLTK's English stopword list, which is loaded
            once and reused on later calls instead of being re-read from the
            corpus for every row.

    Returns:
        A new list holding only the tokens not found in ``stop_words``.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    # loop through each token and if the word isn't in the set
    # of stopwords keep it
    return [item for item in tokens if item not in stop_words]

In [38]:
# The first abstract's tokens before stopwords are removed
patents.loc[0, "abstract_tokens"]

['There',
 'is',
 'provided',
 'a',
 'process',
 'for',
 'the',
 'addition',
 'of',
 'a',
 'nucleophile',
 'across',
 'an',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'a',
 'Michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'in',
 'a',
 'solvent',
 ':',
 'i',
 ')',
 'a',
 'nucleophile',
 ';',
 'ii',
 ')',
 'a',
 'compound',
 'comprising',
 'an',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'and',
 'iii',
 ')',
 'a',
 'catalyst',
 'comprising',
 'a',
 'soluble',
 'polymer',
 'and',
 'a',
 'polyamino',
 'acid',
 '.']

In [39]:
# Remove stopwords from the token list of every row
patents["tokens_no_stops"] = patents["abstract_tokens"].apply(clean_stopwords)

# Post stopword removal
patents.loc[0, "tokens_no_stops"]

['There',
 'provided',
 'process',
 'addition',
 'nucleophile',
 'across',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'Michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'solvent',
 ':',
 ')',
 'nucleophile',
 ';',
 'ii',
 ')',
 'compound',
 'comprising',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'iii',
 ')',
 'catalyst',
 'comprising',
 'soluble',
 'polymer',
 'polyamino',
 'acid',
 '.']

In [40]:
# These were the stopwords that were removed:
# check each unique token of the first abstract against the English
# stopword set and keep the ones that match
found_stopwords = [
    token for token in set(patents['abstract_tokens'][0]) if token in stop_words
]

found_stopwords

['in', 'is', 'a', 'for', 'an', 'the', 'and', 'i', 'of']

In [41]:
# Let's look at a different abstract, row 21, which contains some digits
patents.loc[21, "abstract"]

'To provide a method for measuring a plane mirror or a curved surface mirror close to plane mirror for condensing hard X-rays or soft X-rays used in a radiation light facility, especially an elliptical or tubular object having a steep profile exceeding 1×10 −4  rad, ultra precisely with a precision on nano order or sub-nano order. Overall profile is measured by using overall profile data obtained from a Fizeau interferometer and stitching a plurality of micromeasurement data from a Michelson microinterferometer. A curved surface measured and a reference plane are measured simultaneously by the Fizeau interferometer, a plurality of pieces of partial profile data in a region narrower than the curved surface measured are acquired simultaneously by inclining the curved surface measured and the reference plane simultaneously and sequentially with respect to a reference plane, relative angle between the pieces of partial profile data is measured as the inclination angle of the reference plan

In [43]:
def remove_num(ptokens):
    """Keep only the tokens made up entirely of alphabetic characters.

    This drops tokens containing digits, and also any other token for which
    ``str.isalpha()`` is false, such as punctuation marks and hyphenated
    words.

    Args:
        ptokens: An iterable of string tokens.

    Returns:
        A new list with the purely alphabetic tokens, in their original order.
    """
    kept = []
    for word in ptokens:
        if word.isalpha():
            kept.append(word)
    return kept

# Filter every row's token list down to its purely alphabetic tokens
patents["abstract_no_nums"] = patents["abstract_tokens"].apply(remove_num)

# remove_num expects a list of tokens; given a raw string it would walk the
# text one character at a time instead
patents.loc[21, "abstract_tokens"]

['To',
 'provide',
 'a',
 'method',
 'for',
 'measuring',
 'a',
 'plane',
 'mirror',
 'or',
 'a',
 'curved',
 'surface',
 'mirror',
 'close',
 'to',
 'plane',
 'mirror',
 'for',
 'condensing',
 'hard',
 'X-rays',
 'or',
 'soft',
 'X-rays',
 'used',
 'in',
 'a',
 'radiation',
 'light',
 'facility',
 ',',
 'especially',
 'an',
 'elliptical',
 'or',
 'tubular',
 'object',
 'having',
 'a',
 'steep',
 'profile',
 'exceeding',
 '1×10',
 '−4',
 'rad',
 ',',
 'ultra',
 'precisely',
 'with',
 'a',
 'precision',
 'on',
 'nano',
 'order',
 'or',
 'sub-nano',
 'order',
 '.',
 'Overall',
 'profile',
 'is',
 'measured',
 'by',
 'using',
 'overall',
 'profile',
 'data',
 'obtained',
 'from',
 'a',
 'Fizeau',
 'interferometer',
 'and',
 'stitching',
 'a',
 'plurality',
 'of',
 'micromeasurement',
 'data',
 'from',
 'a',
 'Michelson',
 'microinterferometer',
 '.',
 'A',
 'curved',
 'surface',
 'measured',
 'and',
 'a',
 'reference',
 'plane',
 'are',
 'measured',
 'simultaneously',
 'by',
 'the',
 'Fiz

In [44]:
# No token containing a digit is left: "1×10" and "−4" have been removed.
# Every other non-alphabetic token went with them, e.g. the commas and
# hyphenated words such as "X-rays" and "sub-nano"
patents.loc[21, "abstract_no_nums"]

['To',
 'provide',
 'a',
 'method',
 'for',
 'measuring',
 'a',
 'plane',
 'mirror',
 'or',
 'a',
 'curved',
 'surface',
 'mirror',
 'close',
 'to',
 'plane',
 'mirror',
 'for',
 'condensing',
 'hard',
 'or',
 'soft',
 'used',
 'in',
 'a',
 'radiation',
 'light',
 'facility',
 'especially',
 'an',
 'elliptical',
 'or',
 'tubular',
 'object',
 'having',
 'a',
 'steep',
 'profile',
 'exceeding',
 'rad',
 'ultra',
 'precisely',
 'with',
 'a',
 'precision',
 'on',
 'nano',
 'order',
 'or',
 'order',
 'Overall',
 'profile',
 'is',
 'measured',
 'by',
 'using',
 'overall',
 'profile',
 'data',
 'obtained',
 'from',
 'a',
 'Fizeau',
 'interferometer',
 'and',
 'stitching',
 'a',
 'plurality',
 'of',
 'micromeasurement',
 'data',
 'from',
 'a',
 'Michelson',
 'microinterferometer',
 'A',
 'curved',
 'surface',
 'measured',
 'and',
 'a',
 'reference',
 'plane',
 'are',
 'measured',
 'simultaneously',
 'by',
 'the',
 'Fizeau',
 'interferometer',
 'a',
 'plurality',
 'of',
 'pieces',
 'of',
 'par

In [45]:
def remove_short_tokens(ptokens, min_length=3):
    """Drop tokens shorter than ``min_length`` characters.

    Args:
        ptokens: An iterable of tokens (anything supporting ``len``).
        min_length: Smallest token length to keep. The default of 3 keeps
            tokens longer than two characters.

    Returns:
        A new list with the tokens of at least ``min_length`` characters, in
        their original order.
    """
    return [token for token in ptokens if len(token) >= min_length]

# The first abstract's tokens before short tokens are removed
patents.loc[0, "abstract_tokens"]

['There',
 'is',
 'provided',
 'a',
 'process',
 'for',
 'the',
 'addition',
 'of',
 'a',
 'nucleophile',
 'across',
 'an',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 '(',
 'a',
 'Michael',
 'addition',
 ')',
 'comprising',
 'contacting',
 'in',
 'a',
 'solvent',
 ':',
 'i',
 ')',
 'a',
 'nucleophile',
 ';',
 'ii',
 ')',
 'a',
 'compound',
 'comprising',
 'an',
 'electron',
 'poor',
 'double',
 'bond',
 ';',
 'and',
 'iii',
 ')',
 'a',
 'catalyst',
 'comprising',
 'a',
 'soluble',
 'polymer',
 'and',
 'a',
 'polyamino',
 'acid',
 '.']

In [46]:
# Drop the short tokens from every row's token list
patents["abstract_no_small"] = patents["abstract_tokens"].apply(remove_short_tokens)

# after removal
patents.loc[0, "abstract_no_small"]

['There',
 'provided',
 'process',
 'for',
 'the',
 'addition',
 'nucleophile',
 'across',
 'electron',
 'poor',
 'carbon-carbon',
 'double',
 'bond',
 'Michael',
 'addition',
 'comprising',
 'contacting',
 'solvent',
 'nucleophile',
 'compound',
 'comprising',
 'electron',
 'poor',
 'double',
 'bond',
 'and',
 'iii',
 'catalyst',
 'comprising',
 'soluble',
 'polymer',
 'and',
 'polyamino',
 'acid']

In [47]:
def _preprocess(raw_data, normalise):
    """Run the shared cleaning pipeline, normalising tokens with ``normalise``.

    Steps, in order: lowercase, strip punctuation, tokenise, drop stopwords,
    normalise (stem or lemmatise), drop non-alphabetic tokens, drop stopwords
    again, drop short tokens.

    NOTE: punctuation is stripped before stopword filtering, so a contraction
    such as "you're" becomes "youre" and no longer matches the stopword list.
    """
    ptext = raw_data.lower()
    ptext = remove_punct(ptext)
    ptext = nltk.word_tokenize(ptext)
    # Filter stopwords while the tokens are still whole words: a normaliser
    # can rewrite a stopword into a form the stopword list does not contain
    # (e.g. the Porter stemmer turns "this" into "thi"), which would then
    # slip through every later filter.
    ptext = clean_stopwords(ptext)
    ptext = normalise(ptext)
    ptext = remove_num(ptext)
    # Second pass preserves the earlier behaviour of also dropping tokens
    # that only become stopwords after normalisation.
    ptext = clean_stopwords(ptext)
    ptext = remove_short_tokens(ptext)

    return ptext


def preprocessing_with_lemmatisation(raw_data):
    """Function to perform all preprocessing steps with lemmatisation

    Args:
        raw_data: The raw text of one document (a ``str``).

    Returns:
        A list of cleaned, lemmatised tokens.
    """
    return _preprocess(raw_data, lemmatise)


def preprocessing_with_stemming(raw_data):
    """Function to perform all preprocessing steps with stemming

    Args:
        raw_data: The raw text of one document (a ``str``).

    Returns:
        A list of cleaned, stemmed tokens.
    """
    return _preprocess(raw_data, stemming)

In [48]:
# Run the whole lemmatisation pipeline on every abstract in one go
patents["processed_with_lem"] = patents["abstract"].apply(preprocessing_with_lemmatisation)

patents.loc[0, "processed_with_lem"]

['provided',
 'process',
 'addition',
 'nucleophile',
 'across',
 'electron',
 'poor',
 'carboncarbon',
 'double',
 'bond',
 'michael',
 'addition',
 'comprising',
 'contacting',
 'solvent',
 'nucleophile',
 'compound',
 'comprising',
 'electron',
 'poor',
 'double',
 'bond',
 'iii',
 'catalyst',
 'comprising',
 'soluble',
 'polymer',
 'polyamino',
 'acid']

In [49]:
# Run the whole stemming pipeline on every abstract in one go
patents["processed_with_stem"] = patents["abstract"].apply(preprocessing_with_stemming)

patents.loc[0, "processed_with_stem"]

['provid',
 'process',
 'addit',
 'nucleophil',
 'across',
 'electron',
 'poor',
 'carboncarbon',
 'doubl',
 'bond',
 'michael',
 'addit',
 'compris',
 'contact',
 'solvent',
 'nucleophil',
 'compound',
 'compris',
 'electron',
 'poor',
 'doubl',
 'bond',
 'iii',
 'catalyst',
 'compris',
 'solubl',
 'polym',
 'polyamino',
 'acid']