### Tokenization ###

In [1]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\suzum\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
words = "I like to read books"
tokenized = word_tokenize(words)

### POS Tagging ###

In [3]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\suzum\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
nltk.pos_tag(tokenized)

[('I', 'PRP'), ('like', 'VBP'), ('to', 'TO'), ('read', 'VB'), ('books', 'NNS')]

### Stopwords ###

In [5]:
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words("English")
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suzum\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
tokenized_no_stops = " ".join([word for word in tokenized if word not in stop_words])
print(tokenized_no_stops)

I like read books


### Normalization ###

In [7]:
sentence1 = "I visited US from UK on 22-10-18"
normalized_sentence = sentence1.replace("US", "United States").replace("UK", "United Kingdom").replace("-18", "-2018")
print(normalized_sentence)

I visited United States from United Kingdom on 22-10-2018


### Spelling Correction for a Word (Normalization) ###

In [8]:
from autocorrect import spell

In [9]:
spell("Natureal")

autocorrect.spell is deprecated, use autocorrect.Speller instead


'Natural'

### Spelling Correction for a Sentence (Normalization) ###

In [10]:
sentence2 = "Ntural Luanguage Processin deals with the art of extracting insightes from Natural Languaes"
tokenized_sentence = word_tokenize(sentence2)
print(tokenized_sentence)

['Ntural', 'Luanguage', 'Processin', 'deals', 'with', 'the', 'art', 'of', 'extracting', 'insightes', 'from', 'Natural', 'Languaes']


In [11]:
tokenized_sentence_corrected = " ".join([spell(word) for word in tokenized_sentence])
print(tokenized_sentence_corrected)

autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
autocorrect.spell is deprecated, use autocorrect.Speller instead
Natural language procession deals with the art of extracting insighted from Natural languages


### Stemming (Normalization) ###

In [12]:
stemmer = nltk.stem.PorterStemmer()
print(stemmer.stem("production"))
print(stemmer.stem("coming"))
print(stemmer.stem("firing"))
print(stemmer.stem("battling"))

product
come
fire
battl


### Lemmatization (Normalization) ###

In [13]:
nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\suzum\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("products"))
print(lemmatizer.lemmatize("production"))
print(lemmatizer.lemmatize("coming"))
print(lemmatizer.lemmatize("battle"))

product
production
coming
battle


### Named Entity Recognition ###

In [15]:
nltk.download("maxent_ne_chunker")
nltk.download("words")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\suzum\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\suzum\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [16]:
sentence = "We will read a tragedy of Shakespeare, and author from Stratford-on-Avon"

In [17]:
i = nltk.ne_chunk(nltk.pos_tag(word_tokenize(sentence)), binary=True)
[a for a in i if len(a)==1]


[Tree('NE', [('Shakespeare', 'NNP')])]