In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk 

In [3]:
# Sample paragraph used as input for the sentence and word tokenizers below.
example_text = "Hello there, how are you doing today? The weather is great and python is awesome. The sky is pinkish-blue. You should not eat cardboard."

In [4]:
# Split the paragraph into sentences, then show them and how many there are.
sentences = sent_tokenize(example_text)
print(sentences, len(sentences), sep="\n")

['Hello there, how are you doing today?', 'The weather is great and python is awesome.', 'The sky is pinkish-blue.', 'You should not eat cardboard.']
4


In [5]:
# Split the paragraph into word and punctuation tokens, then show them and
# the token count.
words = word_tokenize(example_text)
print(words, len(words), sep="\n")


['Hello', 'there', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'eat', 'cardboard', '.']
29


In [6]:
# Stop Words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Sample paragraph used as input for the stop-word filtering below.
example_text = "Hello there, how are you doing today? The weather is great and python is awesome. The sky is pinkish-blue. You should not eat cardboard."

In [7]:
# Load the English stop-word list as a set for fast membership tests.
# The corpus file id is lower-case ("english"); a capitalised "English" only
# resolves on case-insensitive filesystems and fails elsewhere.
stop_words = set(stopwords.words("english"))
print(type(stop_words))

<class 'set'>


In [8]:
# Tokenize the sample paragraph; these tokens are filtered against stop_words below.
words = word_tokenize(example_text)

In [9]:
# Holds the tokens that are not stop words; populated in the next cell.
filtered_sentence = []

In [10]:
# Keep only the tokens that are not in the stop-word set.
# Rebinding the list (instead of appending to one created in another cell)
# makes this cell idempotent: re-running it no longer duplicates tokens.
# NOTE: the comparison is case-sensitive, so capitalised tokens such as
# "The" and "You" are kept even though their lower-case forms are stop words.
filtered_sentence = [w for w in words if w not in stop_words]

In [11]:
# Show the tokens that survived stop-word filtering.
print(filtered_sentence)

['Hello', ',', 'today', '?', 'The', 'weather', 'great', 'python', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', 'eat', 'cardboard', '.']


In [12]:
# Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Single PorterStemmer instance reused by the stemming cells below.
ps = PorterStemmer()

In [None]:
# Variants of the same base word, used to show what the stemmer does to each.
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

In [None]:
# Print the stem of each example word, one per line.
for stemmed in map(ps.stem, example_words):
    print(stemmed)

In [None]:
# Sentence-level input for the stemmer; tokenized and stemmed in the next cell.
new_text = "it is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [None]:
# Tokenize the sentence and print the stem of every token, one per line.
words = word_tokenize(new_text)
for token in words:
    stem = ps.stem(token)
    print(stem)

In [None]:
# POS Tagging
# Unsupervised Machine Learning Tokenizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize

In [None]:
# Raw text of two State of the Union addresses from the NLTK corpus:
# train_text is passed to the Punkt tokenizer, sample_text is the text that
# gets split into sentences and analysed.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [None]:
# Build our own sentence tokenizer by giving Punkt the training text,
# then use it to split the sample text into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [None]:
def process_content():
    """Print the part-of-speech tags of every sentence in ``tokenized``.

    Reads the module-level ``tokenized`` sentences, word-tokenizes each one
    and prints the ``(token, tag)`` pairs returned by ``nltk.pos_tag``.
    Returns ``None``. Any exception is caught and its message printed, which
    also ends processing of the remaining sentences.
    """
    try:
        for sentence in tokenized:
            # POS tagging is a single call on the token list.
            print(nltk.pos_tag(word_tokenize(sentence)))
    except Exception as err:
        print(str(err))

In [None]:
# Run the process_content defined in the cell above (POS tagging when cells run top to bottom).
process_content()

In [None]:
# chunking
def process_content():
    """Chunk every sentence in ``tokenized`` and print the resulting tree.

    Each sentence is word-tokenized, POS-tagged with ``nltk.pos_tag`` and
    parsed with a ``RegexpParser``. Its single rule, ``Chunk``, groups any
    number of ``RB.?`` and ``VB.?`` tags followed by one or more ``NNP`` tags
    and an optional ``NN`` tag.

    Reads the module-level ``tokenized`` sentences and returns ``None``. Any
    exception is caught and its message printed, which also ends processing
    of the remaining sentences.
    """
    try:
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

        # The grammar is the same for every sentence, so build the parser
        # once here instead of once per loop iteration.
        chunkParser = nltk.RegexpParser(chunkGram)

        for i in tokenized:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunked = chunkParser.parse(tagged)

            print(chunked)
            #chunked.draw()

    except Exception as e:
        print(str(e))

In [None]:
# Run the process_content defined in the cell above (chunking when cells run top to bottom).
process_content()

In [None]:
# Chinking
# we chink something from a chunk
def process_content():
    """Chunk every sentence in ``tokenized``, chink out some tags, and print the tree.

    Each sentence is word-tokenized, POS-tagged with ``nltk.pos_tag`` and
    parsed with a ``RegexpParser`` whose ``Chunk`` rule first chunks
    everything and then removes (chinks) runs of verb, preposition and
    determiner tags from the chunks.

    Reads the module-level ``tokenized`` sentences and returns ``None``. Any
    exception is caught and its message printed, which also ends processing
    of the remaining sentences.
    """
    try:
        # The stray trailing "|" inside the chink tag pattern (an empty
        # alternative) has been removed; the listed tags are unchanged.
        chunkGram = r"""Chunk: {<.*>+}
                            }<VB.?|IN|DT>+{"""
        # <.*>+ = one or more of anything
        # <VB.?|IN|DT>+ = one or more of Verb, Preposition or Determiner will be CHINKED OUT !!!

        # The grammar is the same for every sentence, so build the parser
        # once here instead of once per loop iteration.
        chunkParser = nltk.RegexpParser(chunkGram)

        for i in tokenized:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunked = chunkParser.parse(tagged)

            print(chunked)
            #chunked.draw()

    except Exception as e:
        print(str(e))

In [None]:
# Named Entity Recognition

from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize

# Load the two State of the Union addresses: train_text is passed to the
# Punkt tokenizer, sample_text is the text that gets analysed.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Build the custom sentence tokenizer and split the sample text into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


In [None]:
def process_content():
    """Run named-entity chunking on every sentence in ``tokenized`` and draw it.

    Each sentence is word-tokenized and POS-tagged, the tagged tokens are
    passed to ``nltk.ne_chunk``, and ``draw()`` is called on the result.
    Returns ``None``. Any exception is caught and its message printed, which
    also ends processing of the remaining sentences.

    NOTE(review): ``draw()`` is invoked once per sentence; it presumably opens
    a separate GUI window each time - confirm this is intended for a full
    speech.
    """
    try:
        for sentence in tokenized:
            pos_tags = nltk.pos_tag(word_tokenize(sentence))

            # Named Entity
            nltk.ne_chunk(pos_tags).draw()
    except Exception as err:
        print(str(err))

In [None]:
# Run the process_content defined in the cell above (named entities when cells run top to bottom).
process_content()

In [None]:
# Lemmatizing
from nltk.stem import WordNetLemmatizer

In [None]:
# Single WordNetLemmatizer instance reused by the lemmatizing cells below.
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize a handful of words without giving a part of speech and print
# each result on its own line.
for word in ("cats", "cacti", "geese", "rocks", "python"):
    print(lemmatizer.lemmatize(word))

In [None]:
# The word's part-of-speech tag can be passed along with the word itself.
print(lemmatizer.lemmatize("better", pos="a"))
# pos="a" means adjective