# How to perform simple text preprocessing 

In [1]:
import nltk

In [2]:
# Three-sentence sample passage used as input for the rest of the notebook.
# The text, including its misspelled words, is reproduced verbatim.
sample_text = (
    "A nuclear power plant is a thermal power station in which the heat source is a nuclear reactor. "
    "As is typical of thermal power stations, heat is usssded to generate steam that drives a steam turbine connected to aaaaa generator that produces electricity. "
    "As of 2018, the International Atomic Enertgy Agency reported ther were 450 nuclear power reactors in operation in 30 countries."
)

# Tokenization

In [3]:
# Fetch the 'punkt' package through the NLTK downloader, then import the
# word-level and sentence-level tokenizer functions used in the next cells.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' rather than
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Split the sample passage into sentences; the bare name on the last line
# makes the notebook display the resulting list.
sentences = sent_tokenize(sample_text)
sentences

['A nuclear power plant is a thermal power station in which the heat source is a nuclear reactor.',
 'As is typical of thermal power stations, heat is usssded to generate steam that drives a steam turbine connected to aaaaa generator that produces electricity.',
 'As of 2018, the International Atomic Enertgy Agency reported ther were 450 nuclear power reactors in operation in 30 countries.']

In [5]:
# Split the sample passage into word-level tokens. Punctuation marks come out
# as their own tokens (see the '.' and ',' entries in the displayed list).
word_tokens = word_tokenize(sample_text)
word_tokens

['A',
 'nuclear',
 'power',
 'plant',
 'is',
 'a',
 'thermal',
 'power',
 'station',
 'in',
 'which',
 'the',
 'heat',
 'source',
 'is',
 'a',
 'nuclear',
 'reactor',
 '.',
 'As',
 'is',
 'typical',
 'of',
 'thermal',
 'power',
 'stations',
 ',',
 'heat',
 'is',
 'usssded',
 'to',
 'generate',
 'steam',
 'that',
 'drives',
 'a',
 'steam',
 'turbine',
 'connected',
 'to',
 'aaaaa',
 'generator',
 'that',
 'produces',
 'electricity',
 '.',
 'As',
 'of',
 '2018',
 ',',
 'the',
 'International',
 'Atomic',
 'Enertgy',
 'Agency',
 'reported',
 'ther',
 'were',
 '450',
 'nuclear',
 'power',
 'reactors',
 'in',
 'operation',
 'in',
 '30',
 'countries',
 '.']

In [6]:
import string

# Remove punctuation

In [7]:
# Show the characters contained in the standard library's string.punctuation
# constant; the next cell filters tokens against it.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [8]:
# Drop every token that consists solely of punctuation characters.
# `w in string.punctuation` would be a *substring* test against the whole
# punctuation string, so multi-character punctuation tokens such as '...',
# '--' or '``' would slip through. Checking each character individually
# removes those as well, while giving the same result for single-character
# tokens like '.' and ','.
no_punc_tokens = [
    w for w in word_tokens
    if not all(ch in string.punctuation for ch in w)
]

# Print the token list before and after filtering for comparison.
print(word_tokens)
print(no_punc_tokens)

['A', 'nuclear', 'power', 'plant', 'is', 'a', 'thermal', 'power', 'station', 'in', 'which', 'the', 'heat', 'source', 'is', 'a', 'nuclear', 'reactor', '.', 'As', 'is', 'typical', 'of', 'thermal', 'power', 'stations', ',', 'heat', 'is', 'usssded', 'to', 'generate', 'steam', 'that', 'drives', 'a', 'steam', 'turbine', 'connected', 'to', 'aaaaa', 'generator', 'that', 'produces', 'electricity', '.', 'As', 'of', '2018', ',', 'the', 'International', 'Atomic', 'Enertgy', 'Agency', 'reported', 'ther', 'were', '450', 'nuclear', 'power', 'reactors', 'in', 'operation', 'in', '30', 'countries', '.']
['A', 'nuclear', 'power', 'plant', 'is', 'a', 'thermal', 'power', 'station', 'in', 'which', 'the', 'heat', 'source', 'is', 'a', 'nuclear', 'reactor', 'As', 'is', 'typical', 'of', 'thermal', 'power', 'stations', 'heat', 'is', 'usssded', 'to', 'generate', 'steam', 'that', 'drives', 'a', 'steam', 'turbine', 'connected', 'to', 'aaaaa', 'generator', 'that', 'produces', 'electricity', 'As', 'of', '2018', 'the'

# Lowercase

In [9]:
# Normalise case: convert each punctuation-free token to lowercase, then
# display the resulting list.
lower_words = list(map(str.lower, no_punc_tokens))
lower_words

['a',
 'nuclear',
 'power',
 'plant',
 'is',
 'a',
 'thermal',
 'power',
 'station',
 'in',
 'which',
 'the',
 'heat',
 'source',
 'is',
 'a',
 'nuclear',
 'reactor',
 'as',
 'is',
 'typical',
 'of',
 'thermal',
 'power',
 'stations',
 'heat',
 'is',
 'usssded',
 'to',
 'generate',
 'steam',
 'that',
 'drives',
 'a',
 'steam',
 'turbine',
 'connected',
 'to',
 'aaaaa',
 'generator',
 'that',
 'produces',
 'electricity',
 'as',
 'of',
 '2018',
 'the',
 'international',
 'atomic',
 'enertgy',
 'agency',
 'reported',
 'ther',
 'were',
 '450',
 'nuclear',
 'power',
 'reactors',
 'in',
 'operation',
 'in',
 '30',
 'countries']

# Remove stop words

In [10]:
# Fetch the 'stopwords' corpus through the NLTK downloader and import its reader.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Load the English stop-word list and print it for inspection.
stop_words = stopwords.words('english')
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

In [11]:
# Remove stop words from the lowercased tokens.
# stop_words is a list, so every `in` test against it is a linear scan;
# building a set once gives constant-time lookups with identical results.
stop_word_set = set(stop_words)
filtered_tokens = [w for w in lower_words if w not in stop_word_set]

# Print the original tokens next to the filtered ones for comparison.
print(word_tokens)
print(filtered_tokens)

['A', 'nuclear', 'power', 'plant', 'is', 'a', 'thermal', 'power', 'station', 'in', 'which', 'the', 'heat', 'source', 'is', 'a', 'nuclear', 'reactor', '.', 'As', 'is', 'typical', 'of', 'thermal', 'power', 'stations', ',', 'heat', 'is', 'usssded', 'to', 'generate', 'steam', 'that', 'drives', 'a', 'steam', 'turbine', 'connected', 'to', 'aaaaa', 'generator', 'that', 'produces', 'electricity', '.', 'As', 'of', '2018', ',', 'the', 'International', 'Atomic', 'Enertgy', 'Agency', 'reported', 'ther', 'were', '450', 'nuclear', 'power', 'reactors', 'in', 'operation', 'in', '30', 'countries', '.']
['nuclear', 'power', 'plant', 'thermal', 'power', 'station', 'heat', 'source', 'nuclear', 'reactor', 'typical', 'thermal', 'power', 'stations', 'heat', 'usssded', 'generate', 'steam', 'drives', 'steam', 'turbine', 'connected', 'aaaaa', 'generator', 'produces', 'electricity', '2018', 'international', 'atomic', 'enertgy', 'agency', 'reported', 'ther', '450', 'nuclear', 'power', 'reactors', 'operation', '30

# Stemming and Lemmatization

In [12]:
# Fetch the 'wordnet' corpus through the NLTK downloader.
nltk.download('wordnet') 

# Stemmer and lemmatizer classes used in the following cells.
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [13]:
# Apply the Porter stemmer to every filtered token and print the tokens
# alongside their stems.
stemmer_ps = PorterStemmer()
stemmed_words_ps = list(map(stemmer_ps.stem, filtered_tokens))
print("Words: ", filtered_tokens)
print("Porter stemmed words: ", stemmed_words_ps)


Words:  ['nuclear', 'power', 'plant', 'thermal', 'power', 'station', 'heat', 'source', 'nuclear', 'reactor', 'typical', 'thermal', 'power', 'stations', 'heat', 'usssded', 'generate', 'steam', 'drives', 'steam', 'turbine', 'connected', 'aaaaa', 'generator', 'produces', 'electricity', '2018', 'international', 'atomic', 'enertgy', 'agency', 'reported', 'ther', '450', 'nuclear', 'power', 'reactors', 'operation', '30', 'countries']
Porter stemmed words:  ['nuclear', 'power', 'plant', 'thermal', 'power', 'station', 'heat', 'sourc', 'nuclear', 'reactor', 'typic', 'thermal', 'power', 'station', 'heat', 'usssd', 'gener', 'steam', 'drive', 'steam', 'turbin', 'connect', 'aaaaa', 'gener', 'produc', 'electr', '2018', 'intern', 'atom', 'enertgi', 'agenc', 'report', 'ther', '450', 'nuclear', 'power', 'reactor', 'oper', '30', 'countri']


In [14]:
# Lemmatize every filtered token with the WordNet lemmatizer (called with its
# default part-of-speech argument) and print the tokens alongside their lemmas.
lemmatizer = WordNetLemmatizer()
lemmatize_words = list(map(lemmatizer.lemmatize, filtered_tokens))
print("Words: ", filtered_tokens)
print("Lemmatized words: ", lemmatize_words)

Words:  ['nuclear', 'power', 'plant', 'thermal', 'power', 'station', 'heat', 'source', 'nuclear', 'reactor', 'typical', 'thermal', 'power', 'stations', 'heat', 'usssded', 'generate', 'steam', 'drives', 'steam', 'turbine', 'connected', 'aaaaa', 'generator', 'produces', 'electricity', '2018', 'international', 'atomic', 'enertgy', 'agency', 'reported', 'ther', '450', 'nuclear', 'power', 'reactors', 'operation', '30', 'countries']
Lemmatized words:  ['nuclear', 'power', 'plant', 'thermal', 'power', 'station', 'heat', 'source', 'nuclear', 'reactor', 'typical', 'thermal', 'power', 'station', 'heat', 'usssded', 'generate', 'steam', 'drive', 'steam', 'turbine', 'connected', 'aaaaa', 'generator', 'produce', 'electricity', '2018', 'international', 'atomic', 'enertgy', 'agency', 'reported', 'ther', '450', 'nuclear', 'power', 'reactor', 'operation', '30', 'country']


In [15]:
# Lemmatize a single word with an explicit part-of-speech tag ('v');
# the notebook displays the returned lemma.
nltk.stem.WordNetLemmatizer().lemmatize('loving', pos='v')

'love'