In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CG0002\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [10]:
#---------------------------- Tokenizing Text -----------------------------------------------#
# nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize

example_text = "I want to be a certified artificial intelligence professional"

print(sent_tokenize(example_text))

print(word_tokenize(example_text))

for i in word_tokenize(example_text):print(i)

['I want to be a certified artificial intelligence professional']
['I', 'want', 'to', 'be', 'a', 'certified', 'artificial', 'intelligence', 'professional']
I
want
to
be
a
certified
artificial
intelligence
professional


In [11]:
# ------------ bigrams ----------------#

word_data = 'I want to be a certified artificial intelligence professional'
nltk_tokens = nltk.word_tokenize(word_data)
print(list(nltk.bigrams(nltk_tokens)))

[('I', 'want'), ('want', 'to'), ('to', 'be'), ('be', 'a'), ('a', 'certified'), ('certified', 'artificial'), ('artificial', 'intelligence'), ('intelligence', 'professional')]


In [12]:
# --------------- n-gram --------------#

from nltk.util import ngrams

def word_grams(words, min=1, max=5):
    s = []
    for n in range(min, max):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s

print(word_grams(nltk_tokens))

['I', 'want', 'to', 'be', 'a', 'certified', 'artificial', 'intelligence', 'professional', 'I want', 'want to', 'to be', 'be a', 'a certified', 'certified artificial', 'artificial intelligence', 'intelligence professional', 'I want to', 'want to be', 'to be a', 'be a certified', 'a certified artificial', 'certified artificial intelligence', 'artificial intelligence professional', 'I want to be', 'want to be a', 'to be a certified', 'be a certified artificial', 'a certified artificial intelligence', 'certified artificial intelligence professional']


In [13]:
# ------------- Removing Stop Words ------------------------ #

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#example_text = "This is an example showing off stop word filtration."
example_text = 'I want to be a certified artificial intelligence professional'

stop_words = set(stopwords.words("english"))

print("List of the Stop words")
print(stop_words)

words = word_tokenize(example_text)

filtered_sentence = []

for w in words:
	if w not in stop_words:
		filtered_sentence.append(w)

print(filtered_sentence)

List of the Stop words
{'isn', 'had', 'aren', 'can', 'hers', 'itself', 'what', 'haven', 'did', 'through', 'to', 'if', 'under', 'most', "isn't", "doesn't", 'ma', 'be', 'doesn', 'y', 'm', 't', 'couldn', 'more', 'needn', 'they', "won't", 'yourself', "you're", 'we', 'very', 'again', 'have', 'until', 'who', "should've", 'from', 'do', "aren't", 'his', 'was', 'then', 'themselves', "you'd", 'between', 'below', 'does', 'up', 'further', 'own', 'herself', 'weren', "you've", 'about', 'no', 'too', 'here', 'ours', 'd', 'is', "shouldn't", 'down', 'being', 'shouldn', 's', "weren't", "didn't", 'these', 'their', 'o', 'its', 'those', "wouldn't", 'it', 'myself', 'during', "wasn't", 'me', 'yourselves', 'this', 'only', 'in', 'our', 'them', 'her', 'some', 'wasn', 'won', 'were', 'ain', "she's", 'or', 'so', 'shan', 'whom', 'off', 'should', 'you', 'he', 'wouldn', 'having', 'yours', 'above', 'on', 'just', 'i', "don't", 'll', 'once', 'which', 'any', 'by', "needn't", 'him', 'nor', "hadn't", 'how', 'hasn', 'an', 'm

In [14]:
#------------- Normalization: Stemming and Lemmatization ---------------# 

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

for w in example_words:
	print(ps.stem(w))


python
python
python
python
pythonli


In [15]:
#------------- Stemming - Another Example ---------------- #
new_text = "It is very important to be pythonly while you are pythoning with python.Python name is derived from the pythons"

words=word_tokenize(new_text)

for w in words:
	print(ps.stem(w))


It
is
veri
import
to
be
pythonli
while
you
are
python
with
python.python
name
is
deriv
from
the
python


In [16]:
#----------------- Lemmatization Example --------------------#
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer() 
  
print("rocks when lemmatized :", lemmatizer.lemmatize("rocks")) 
print("corpora when lemmatized :", lemmatizer.lemmatize("corpora"))

#ps = PorterStemmer()
print("rocks when Stemmed :", ps.stem("rocks")) 
print("corpora when Stemmed :", ps.stem("corpora"))

# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a")) 

rocks when lemmatized : rock
corpora when lemmatized : corpus
rocks when Stemmed : rock
corpora when Stemmed : corpora
better : good


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CG0002\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
#---------- Parts of Speech Tagging ----------# 
nltk.download('averaged_perceptron_tagger')
example_text = "The training is going great and the day is very fine.The code is working and all are happy about it"
token = nltk.word_tokenize(example_text)

nltk.pos_tag(token)

nltk.download('tagsets')

# We can get more details about any POS tag using help funciton of NLTK as follows.
nltk.help.upenn_tagset("PRP$")
nltk.help.upenn_tagset("JJ$")
nltk.help.upenn_tagset("VBG")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\CG0002\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\CG0002\AppData\Roaming\nltk_data...


PRP$: pronoun, possessive
    her his mine my our ours their thy your
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...


[nltk_data]   Unzipping help\tagsets.zip.


In [1]:
#--------------------- Named Entity Recognition using Spacy -------------------------------#
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import pprint
# Run in console  python -m spacy download en_core_web_sm
nlp = en_core_web_sm.load()

doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

print([(X.text, X.label_) for X in doc.ents])

print([(X, X.ent_iob_, X.ent_type_) for X in doc])

sentences = [x for x in doc.ents]

print(sentences)

displacy.render(nlp(str(sentences)), jupyter=True, style='ent')

#displacy.serve(nlp(str(sentences)), style='ent')

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]
[(European, 'B', 'NORP'), (authorities, 'O', ''), (fined, 'O', ''), (Google, 'B', 'ORG'), (a, 'O', ''), (record, 'O', ''), ($, 'B', 'MONEY'), (5.1, 'I', 'MONEY'), (billion, 'I', 'MONEY'), (on, 'O', ''), (Wednesday, 'B', 'DATE'), (for, 'O', ''), (abusing, 'O', ''), (its, 'O', ''), (power, 'O', ''), (in, 'O', ''), (the, 'O', ''), (mobile, 'O', ''), (phone, 'O', ''), (market, 'O', ''), (and, 'O', ''), (ordered, 'O', ''), (the, 'O', ''), (company, 'O', ''), (to, 'O', ''), (alter, 'O', ''), (its, 'O', ''), (practices, 'O', '')]
[European, Google, $5.1 billion, Wednesday]


In [2]:
x1 = spacy.explain('NORP')
print(x1)

Nationalities or religious or political groups
