In [None]:
'''
NLP Pipeline
    Sentence segmentation
    Word tokenization
    Stemming
    Lemmatization
    Stop words removal
    POS tagging
    Named Entity Recognition (NER)
    Chunking
'''

Sentence Tokenization

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
def tokenize_sentences(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``.

    Args:
        text: The raw string to segment.

    Returns:
        The value returned by ``sent_tokenize(text)``: the sentences found
        in ``text``, in order.
    """
    return sent_tokenize(text)
# sent_tokenize needs the Punkt sentence models. Fetch them here so this
# first tokenizing cell also works on a fresh kernel; the call is a no-op
# when the package is already installed.
nltk.download('punkt', quiet=True)

text = input("Enter the text: ")
sentences = tokenize_sentences(text)
# Print the sentences one per line, numbered from 1.
for i, sentence in enumerate(sentences, start=1):
    print(f"{i}. {sentence}")

Enter the text: One of the tell-tale signs of cheating on your Spanish homework is that grammatically, it’s a mess. Many languages don’t allow for straight translation and have different orders for sentence structure, which translation services used to overlook. But, they’ve come a long way. With NLP, online translators can translate languages more accurately and present grammatically-correct results. This is infinitely helpful when trying to communicate with someone in another language. Not only that, but when translating from another language to your own, tools now recognize the language based on inputted text and translate it.
1. One of the tell-tale signs of cheating on your Spanish homework is that grammatically, it’s a mess.
2. Many languages don’t allow for straight translation and have different orders for sentence structure, which translation services used to overlook.
3. But, they’ve come a long way.
4. With NLP, online translators can translate languages more accurately and 

Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize
def tokenize_words(text):
    """Split ``text`` into word and punctuation tokens.

    Typographic apostrophes (U+2019, as in "don’t") are replaced by the ASCII
    apostrophe before tokenizing. ``word_tokenize`` only recognises
    contractions written with ``'``; left as-is, "don’t" is broken into the
    three tokens 'don', '’', 't' instead of 'do', "n't".

    Args:
        text: The raw string to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    normalized = text.replace("\u2019", "'")
    return word_tokenize(normalized)
# Tokenize words: read a text from the user, split it into tokens and print
# the resulting list.
text = input("Enter the text: ")
words = tokenize_words(text)
print(words)

Enter the text: Many languages don’t allow for straight translation and have different orders for sentence structure, which translation services used to overlook.
['Many', 'languages', 'don', '’', 't', 'allow', 'for', 'straight', 'translation', 'and', 'have', 'different', 'orders', 'for', 'sentence', 'structure', ',', 'which', 'translation', 'services', 'used', 'to', 'overlook', '.']


Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# One PorterStemmer instance, created once and reused by stem_words.
porter = PorterStemmer()
def stem_words(words):
    """Reduce every token in ``words`` to its stem.

    Uses the module-level ``porter`` stemmer.

    Args:
        words: An iterable of token strings.

    Returns:
        A list holding ``porter.stem(word)`` for each input token, in the
        same order as the input.
    """
    return list(map(porter.stem, words))
# Stemming words: read a text, tokenize it, stem each token and print the
# stems. The `text` read here is also what the lemmatization cell tokenizes.
text = input("Enter the text: ")
words = word_tokenize(text)
stemmed_words = stem_words(words)
print(stemmed_words)

Enter the text: This is infinitely helpful when trying to communicate with someone in another language.
['thi', 'is', 'infinit', 'help', 'when', 'tri', 'to', 'commun', 'with', 'someon', 'in', 'anoth', 'languag', '.']


Lemmatization

In [None]:
# Fetch the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Tokenize text
tokens = word_tokenize(text)

# Lemmatization
# WordNetLemmatizer treats every word as a noun unless a part of speech is
# given, which leaves verbs such as "trying" or "is" untouched. Tag the
# tokens first and pass the matching WordNet POS so each word is reduced to
# its actual lemma.
nltk.download('averaged_perceptron_tagger', quiet=True)  # model used by nltk.pos_tag


def _wordnet_pos(penn_tag):
    """Map a Penn Treebank tag to the WordNet POS letter lemmatize() expects."""
    # Adjectives (J*), verbs (V*) and adverbs (R*) have their own WordNet
    # category; everything else falls back to noun, the lemmatizer's default.
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(penn_tag[:1], 'n')


lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=_wordnet_pos(tag))
    for token, tag in nltk.pos_tag(tokens)
]
print(lemmatized_tokens)

['This', 'is', 'infinitely', 'helpful', 'when', 'trying', 'to', 'communicate', 'with', 'someone', 'in', 'another', 'language', '.']


Stop Words

In [None]:
# Fetch the stop-word lists read by nltk.corpus.stopwords.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    """Tokenize ``text`` and drop every English stop word.

    The stop-word lookup is case-insensitive (each token is lower-cased for
    the comparison), while the returned tokens keep their original casing.

    Args:
        text: The raw string to filter.

    Returns:
        A list of the tokens from ``word_tokenize(text)`` whose lower-cased
        form is not in NLTK's English stop-word list, in their original order.
    """
    tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return kept
# Remove stopwords: read a text, echo it back, then print the tokens that
# remain after stop-word filtering.
text = input("Enter the text: ")
print(text)
filtered_words = remove_stopwords(text)
print(filtered_words)

Enter the text: With NLP, online translators can translate languages more accurately and present grammatically-correct results.
With NLP, online translators can translate languages more accurately and present grammatically-correct results.
['NLP', ',', 'online', 'translators', 'translate', 'languages', 'accurately', 'present', 'grammatically-correct', 'results', '.']


POS tagging

In [None]:
import nltk
# Resources for this section:
nltk.download('punkt')  # tokenizer models used by word_tokenize / sent_tokenize
# NOTE(review): nothing in the visible cells reads 'tagsets' - presumably kept
# for looking up tag descriptions interactively; confirm it is still needed.
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Pos tagging
from nltk.tokenize import word_tokenize
def pos_tagging(text):
    """Tokenize ``text`` and tag each token with its part of speech.

    Args:
        text: The raw string to tag.

    Returns:
        The result of ``nltk.pos_tag`` applied to the word tokens of
        ``text``: a list of ``(token, tag)`` pairs.
    """
    return nltk.pos_tag(word_tokenize(text))

In [None]:
# Read a text from the user and print its (token, tag) pairs. The `text`
# read here is also what the NER and chunking cells further down operate on.
text = input("Enter the text: ")
pos_tagged_text = pos_tagging(text)
print(pos_tagged_text)

Enter the text: Natural language processing (NLP) is the science of getting computers to talk, or interact with humans in human language. Examples of natural language processing include speech recognition, spell check, autocomplete, chatbots, and search engines.
[('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('the', 'DT'), ('science', 'NN'), ('of', 'IN'), ('getting', 'VBG'), ('computers', 'NNS'), ('to', 'TO'), ('talk', 'VB'), (',', ','), ('or', 'CC'), ('interact', 'NN'), ('with', 'IN'), ('humans', 'NNS'), ('in', 'IN'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('Examples', 'NNS'), ('of', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('include', 'VBP'), ('speech', 'JJ'), ('recognition', 'NN'), (',', ','), ('spell', 'RB'), ('check', 'VB'), (',', ','), ('autocomplete', 'VB'), (',', ','), ('chatbots', 'NNS'), (',', ','), ('and', 'CC'), ('search', 'NN'), ('engines', 'NNS'), ('.', '.')]


Named Entity Recognition (NER)

In [None]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk

In [None]:
# Resources for named-entity chunking. The tagger was already requested in the
# POS tagging section; asking again is harmless (see the "already up-to-date"
# message in the output).
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')  # chunker model behind ne_chunk
# NOTE(review): 'words' is presumably a corpus the NE chunker depends on -
# nothing in the visible cells reads it directly.
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
def ner(text):
    """Run named-entity chunking over ``text``.

    The text is tokenized, the tokens are POS-tagged, and the tagged tokens
    are handed to ``ne_chunk``.

    Args:
        text: The raw string to analyse.

    Returns:
        The value returned by ``ne_chunk`` for the tagged tokens.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ne_chunk(tagged_tokens)
# No new input is read here: `text` is whatever an earlier cell left in the
# kernel (the recorded output matches the text entered in the POS tagging section).
named_entities = ner(text)
print(named_entities)

(S
  Natural/JJ
  language/NN
  processing/NN
  (/(
  (ORGANIZATION NLP/NNP)
  )/)
  is/VBZ
  the/DT
  science/NN
  of/IN
  getting/VBG
  computers/NNS
  to/TO
  talk/VB
  ,/,
  or/CC
  interact/NN
  with/IN
  humans/NNS
  in/IN
  human/JJ
  language/NN
  ./.
  Examples/NNS
  of/IN
  natural/JJ
  language/NN
  processing/NN
  include/VBP
  speech/JJ
  recognition/NN
  ,/,
  spell/RB
  check/VB
  ,/,
  autocomplete/VB
  ,/,
  chatbots/NNS
  ,/,
  and/CC
  search/NN
  engines/NNS
  ./.)


Chunking

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser
# Tokenization (reuses the `text` variable left in the kernel by an earlier cell)
tokens = word_tokenize(text)
# POS tagging
pos_tags = nltk.pos_tag(tokens)
# Chunking patterns, written over POS tags:
#   NP - an optional DT, any number of JJ, then exactly one token tagged NN
#   VP - any tag starting with VB, followed by an NP chunk or a PP chunk
# NOTE(review): this grammar defines no PP rule, so the PP alternative in the
# VP pattern looks like it can never match - confirm whether a PP rule is missing.
chunk_patterns = r"""
    NP: {<DT>?<JJ>*<NN>}
    VP: {<VB.*><NP|PP>}
"""
# Create a chunk parser from the grammar and apply it to the tagged tokens.
# `result` is kept because the final cell renders it.
chunk_parser = RegexpParser(chunk_patterns)
result = chunk_parser.parse(pos_tags)
print(result)


(S
  (NP Natural/JJ language/NN)
  (NP processing/NN)
  (/(
  NLP/NNP
  )/)
  (VP is/VBZ (NP the/DT science/NN))
  of/IN
  getting/VBG
  computers/NNS
  to/TO
  talk/VB
  ,/,
  or/CC
  (NP interact/NN)
  with/IN
  humans/NNS
  in/IN
  (NP human/JJ language/NN)
  ./.
  Examples/NNS
  of/IN
  (NP natural/JJ language/NN)
  (NP processing/NN)
  (VP include/VBP (NP speech/JJ recognition/NN))
  ,/,
  spell/RB
  check/VB
  ,/,
  autocomplete/VB
  ,/,
  chatbots/NNS
  ,/,
  and/CC
  (NP search/NN)
  engines/NNS
  ./.)


In [None]:
# A bare `result.draw` only evaluates to the bound method and renders nothing;
# the tree is drawn only when the method is actually called.
try:
    result.draw()  # opens a separate window; needs a graphical display
except Exception:  # no display / GUI toolkit available (e.g. hosted notebooks)
    result.pretty_print()  # text rendering of the same tree