In [20]:
import pandas as pd      
from bs4 import BeautifulSoup
import re
import spacy
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
import nltk
from nltk.corpus import stopwords
from spacy.lang.en import English

import string
%matplotlib inline
from tqdm.notebook import tqdm
tqdm.pandas()

In [21]:
# Load the input DataFrame from a pickle (presumably written by an earlier cleaning step -- confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
file_path = "./data/outputs/autonomous_clean.pkl"
df = pd.read_pickle(file_path)

We need to clean the text. Since the text is in web (HTML) format, we can use BeautifulSoup to parse it and keep only the plain text.

In [22]:
def soup(text):
    """Return the plain-text content of an HTML string.

    The markup is parsed with BeautifulSoup's "html5lib" parser and only the
    text of the resulting document is returned.
    """
    parsed = BeautifulSoup(text, "html5lib")
    return parsed.get_text()
# Strip HTML markup from every entry of the 'text' column.
# NOTE(review): overwrites df['text'] in place; assumes every value is a string -- confirm.
df['text'] = df['text'].apply(soup)

We need to tokenize the text, meaning breaking each sentence up into words, and lemmatize it, reducing each word to its base form by removing endings such as -ed, -ize, and -ing.

Patents contain patent-specific language that can be repetitive. Such language can interfere with the results, so we want to remove it. We also don’t want words that are so rare that they show up only once in more than a thousand abstracts. To keep only the words that would help my analysis, I compiled a list of words to take out (stop words) and used regular expressions (regex).


In [23]:
# Fetch the NLTK stop-word corpus used below.
nltk.download('stopwords')

# Custom stop words: patent boilerplate plus the search-topic terms themselves.
# NOTE(review): 'method' is listed twice, and the multi-word / hyphenated entries
# ('for example', 'self-driving') probably never equal a single spaCy lemma -- confirm.
stop_list = [
    'claims', 'claim', 'method', 'provide', 'provided',
    'device', 'devices', 'apparatus', 'system', 'systems',
    'apparatuses', 'embodiments', 'embodiment', 'examples',
    'example', 'inventions', 'invention', 'present',
    'includes', 'include', 'including', 'description',
    'user', 'body', 'power', 'person', 'persons',
    'comprising', 'comprise', 'comprises', 'configured',
    'configure', 'for example', 'discloses', 'disclose',
    'method', 'said', 'abstract', 'abstracts', 'disclosed', 'herein',
    'autonomous', 'vehicle', 'self-driving', 'sensor',
]

# Each ASCII punctuation character as its own list item.
punctuation = list(string.punctuation)

# Master stop set: custom terms + scikit-learn and NLTK English stop words + punctuation.
stop = set(stop_list)
stop.update(ENGLISH_STOP_WORDS)
stop.update(stopwords.words('english'))
stop.update(punctuation)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Load spaCy's 'en_core_web_sm' pipeline; tokenizeText calls it to parse and lemmatize text.
# The model package must already be installed in the environment.
nlp = spacy.load('en_core_web_sm')

Regex (regular expressions) is a programming tool that lets us find certain patterns and apply a global change to text, sort of like a sophisticated Ctrl+F find-and-replace. For example, if we want to remove all words before the word “embodiment” from a sentence, we can write a regex to identify all sentences containing “embodiment” and remove the word “embodiment” plus all words between it and the period that ends the preceding sentence.

In [26]:
# Patterns that are each replaced by a single space before parsing.
# Compiled once instead of on every call.
_SEPARATOR_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        "\xa0\xa0\xa0\xa0",       # run of four non-breaking spaces
        "\r", "\n", "\t",         # line breaks and tabs
        r"'m\b", r"'ll\b",        # contraction suffixes; \b keeps quoted words like 'method intact
        # Standalone numbers. Lookarounds (instead of consuming the surrounding
        # whitespace) let adjacent numbers such as "10 20 30" all match, and
        # also cover a number that is the entire string.
        r"(?<!\S)\d+(?!\S)",
    )
]
_WHITESPACE_RUN = re.compile(r"\s+")


def tokenizeText(text):
    """Clean, lemmatize and stop-word-filter a single document.

    Relies on the module-level spaCy pipeline ``nlp`` and stop-word set ``stop``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmas of the lowercased text, excluding single-character lemmas and
        anything contained in ``stop``.
    """
    # Lowercase once up front rather than on every pattern iteration.
    text = text.lower()
    for pattern in _SEPARATOR_PATTERNS:
        text = pattern.sub(" ", text)
    # Collapse the whitespace runs left behind by the substitutions so they do
    # not reach the parser as multi-character whitespace tokens.
    text = _WHITESPACE_RUN.sub(" ", text).strip()

    # Parse with spaCy and keep each token's lemma.
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]

    # Drop single-character lemmas and stop words.
    return [lemma for lemma in lemmas if len(lemma) != 1 and lemma not in stop]

def text_processing(corp):
    """Tokenize ``corp`` with ``tokenizeText`` and return the tokens joined by single spaces."""
    tokens = tokenizeText(corp)
    cleaned = ' '.join(tokens)
    return cleaned

Apply text processing to every row in the DataFrame.

In [27]:
# Replace each document with its space-joined tokens. progress_apply is the
# tqdm-enabled variant of apply registered by tqdm.pandas() in the import cell.
# NOTE(review): overwrites df['text'] in place, so re-running this cell re-processes already-processed text.
df['text'] = df['text'].progress_apply(text_processing)

  0%|          | 0/50 [00:00<?, ?it/s]

In [30]:
# Spot-check five randomly chosen processed documents.
# NOTE(review): no random_state is passed, so the rows shown differ between runs.
df['text'].sample(5)

priority date
2017-03-20    assembly module external surface receive seal ...
2018-07-19    av determine datum road geometry datum plurali...
2016-03-15    drive behavior modification receive passenger ...
2014-03-18    increase safety comfort driving drive arrangem...
2012-03-15    operate mode determine current state current s...
Name: text, dtype: object

In [161]:
# One-time setup for the model loaded above: !python -m spacy download en_core_web_sm

In [31]:
# Persist the DataFrame, with its tokenized 'text' column, as a pickle
# (presumably consumed by a later analysis step -- confirm).
output_filepath = "./data/outputs/autonomous_tokenized.pkl"
df.to_pickle(output_filepath)