In [None]:
!pip install spacy



In [None]:
import nltk
import spacy


In [None]:
# NLTK data packages requested by this notebook, fetched in this order.
# `status` ends up holding the return value of the final download call,
# which is what the cell displays.
status = False
for package in (
    'punkt',
    'words',
    'wordnet',
    'stopwords',
    'maxent_ne_chunker',
    'averaged_perceptron_tagger',
):
    status = nltk.download(package)
status


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### **Tokenizing**

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize, wordpunct_tokenize, TreebankWordTokenizer

# Sample passage: two sentences, one of which contains an apostrophe ("task's"),
# so the differences between the tokenizers show up in the printed results.
text = "stopwords are common words that are often removed from text during the preprocessing phase of natural language processing task's because they are considered to carry little meaning. It is good."

# Run the same passage through each tokenizer.
tokens = word_tokenize(text)        # word-level tokens
sent = sent_tokenize(text)          # sentence-level split
wordpuc = wordpunct_tokenize(text)  # words and punctuation as separate tokens

tree = TreebankWordTokenizer()
tok = tree.tokenize(text)

# One result per line, in the order: word, sentence, wordpunct, Treebank.
print(tokens, sent, wordpuc, tok, sep='\n')

['stopwords', 'are', 'common', 'words', 'that', 'are', 'often', 'removed', 'from', 'text', 'during', 'the', 'preprocessing', 'phase', 'of', 'natural', 'language', 'processing', 'task', "'s", 'because', 'they', 'are', 'considered', 'to', 'carry', 'little', 'meaning', '.', 'It', 'is', 'good', '.']
["stopwords are common words that are often removed from text during the preprocessing phase of natural language processing task's because they are considered to carry little meaning.", 'It is good.']
['stopwords', 'are', 'common', 'words', 'that', 'are', 'often', 'removed', 'from', 'text', 'during', 'the', 'preprocessing', 'phase', 'of', 'natural', 'language', 'processing', 'task', "'", 's', 'because', 'they', 'are', 'considered', 'to', 'carry', 'little', 'meaning', '.', 'It', 'is', 'good', '.']
['stopwords', 'are', 'common', 'words', 'that', 'are', 'often', 'removed', 'from', 'text', 'during', 'the', 'preprocessing', 'phase', 'of', 'natural', 'language', 'processing', 'task', "'s", 'because',

### **Removing Stopwords**

In [None]:
from nltk.corpus import stopwords

# `sw` is kept as the plain list returned by NLTK so any other cell that uses
# it keeps working; the filter below checks membership against a frozenset so
# each token costs a hash lookup instead of a scan through the whole list.
sw = stopwords.words('english')
sw_lookup = frozenset(sw)

# Tokens are lower-cased only for the comparison; the surviving tokens keep
# their original casing.
stopwords_removed = [word for word in tokens if word.lower() not in sw_lookup]

# Filtered tokens first, then the unfiltered tokens for comparison.
print(stopwords_removed)
print(tokens)

['stopwords', 'common', 'words', 'often', 'removed', 'text', 'preprocessing', 'phase', 'natural', 'language', 'processing', 'task', "'s", 'considered', 'carry', 'little', 'meaning', '.', 'good', '.']
['stopwords', 'are', 'common', 'words', 'that', 'are', 'often', 'removed', 'from', 'text', 'during', 'the', 'preprocessing', 'phase', 'of', 'natural', 'language', 'processing', 'task', "'s", 'because', 'they', 'are', 'considered', 'to', 'carry', 'little', 'meaning', '.', 'It', 'is', 'good', '.']


### **Stemming**

In [None]:

from nltk.stem import SnowballStemmer, PorterStemmer, RegexpStemmer
from nltk import tokenize

# Sentence used by the stemming and lemmatization cells.
text = 'Stemming helps to reduce words to their base form, improving text analysis.'
tokens = word_tokenize(text)

# Snowball stemmer for English, applied to every token in order.
ss = SnowballStemmer('english')
stemmed = list(map(ss.stem, tokens))

# Original tokens on the first line, their stems on the second.
print(tokens, stemmed, sep='\n')

['Stemming', 'helps', 'to', 'reduce', 'words', 'to', 'their', 'base', 'form', ',', 'improving', 'text', 'analysis', '.']
['stem', 'help', 'to', 'reduc', 'word', 'to', 'their', 'base', 'form', ',', 'improv', 'text', 'analysi', '.']


In [None]:
# Same sentence as the Snowball cell, this time stemmed with the Porter stemmer.
tokens = word_tokenize(text)

ps = PorterStemmer()
stemmed = list(map(ps.stem, tokens))

# Original tokens on the first line, their stems on the second.
print(tokens, stemmed, sep='\n')

['Stemming', 'helps', 'to', 'reduce', 'words', 'to', 'their', 'base', 'form', ',', 'improving', 'text', 'analysis', '.']
['stem', 'help', 'to', 'reduc', 'word', 'to', 'their', 'base', 'form', ',', 'improv', 'text', 'analysi', '.']


In [None]:
# Rule-based stemmer: the pattern lists the suffixes to strip
# ("ing", "s", "e", "able"); `min=4` is passed as the minimum-length setting.
reg = RegexpStemmer('ing$|s$|e$|able$', min=4)

tokens = word_tokenize(text)
stemmed = list(map(reg.stem, tokens))

# Original tokens on the first line, their stems on the second.
print(tokens, stemmed, sep='\n')


['Stemming', 'helps', 'to', 'reduce', 'words', 'to', 'their', 'base', 'form', ',', 'improving', 'text', 'analysis', '.']
['Stemm', 'help', 'to', 'reduc', 'word', 'to', 'their', 'bas', 'form', ',', 'improv', 'text', 'analysi', '.']


### **Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer

# Part-of-speech codes for the `pos` argument of `lemmatize`:
#   v - verb
#   n - noun
#   r - adverb
#   a - adjective
lem = WordNetLemmatizer()
tokens = word_tokenize(text)

# Every token is lemmatized as a verb (pos='v').
tok = list(map(lambda word: lem.lemmatize(word, pos='v'), tokens))

# Original tokens on the first line, their lemmas on the second.
print(tokens, tok, sep='\n')


['Stemming', 'helps', 'to', 'reduce', 'words', 'to', 'their', 'base', 'form', ',', 'improving', 'text', 'analysis', '.']
['Stemming', 'help', 'to', 'reduce', 'word', 'to', 'their', 'base', 'form', ',', 'improve', 'text', 'analysis', '.']


### **Parts of speech**

In [None]:
# Tag the sentence from the Tokenizing section explicitly instead of relying on
# the global `tokens`: the stemming and lemmatization cells reassign `tokens`,
# so the result of this cell used to depend on which cell happened to run last.
# Tokenizing here keeps the cell reproducible under Restart & Run All.
pos_text = "stopwords are common words that are often removed from text during the preprocessing phase of natural language processing task's because they are considered to carry little meaning. It is good."
pos_tokens = nltk.word_tokenize(pos_text)

# List of (token, part-of-speech tag) pairs; the bare name displays it.
tags = nltk.pos_tag(pos_tokens)
tags

[('stopwords', 'NNS'),
 ('are', 'VBP'),
 ('common', 'JJ'),
 ('words', 'NNS'),
 ('that', 'WDT'),
 ('are', 'VBP'),
 ('often', 'RB'),
 ('removed', 'VBN'),
 ('from', 'IN'),
 ('text', 'NN'),
 ('during', 'IN'),
 ('the', 'DT'),
 ('preprocessing', 'VBG'),
 ('phase', 'NN'),
 ('of', 'IN'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('task', 'NN'),
 ("'s", 'POS'),
 ('because', 'IN'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('considered', 'VBN'),
 ('to', 'TO'),
 ('carry', 'VB'),
 ('little', 'JJ'),
 ('meaning', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('good', 'JJ'),
 ('.', '.')]

### **Named entity recognition**

In [None]:
# Disabled example: NLTK-based named-entity chunking of the same sentence that
# the spaCy cell below analyses.
# NOTE(review): presumably left commented out because `.draw()` opens a GUI
# window, which is not available on a headless/hosted kernel — confirm, and
# delete this cell if it is no longer needed.
#text = "In Paris, John Smith attended a conference on artificial intelligence last Friday at 9.30 pm."
#tokens = word_tokenize(text)
#tags =nltk.pos_tag(tokens)
#nltk.ne_chunk(tags).draw()

In [None]:
# Load the spaCy pipeline named "en_core_web_sm".
spc = spacy.load("en_core_web_sm")

# Sentence containing a place, a person, a date and a time.
text = "In Paris, John Smith attended a conference on artificial intelligence last Friday at 9.30 pm."
doc = spc(text)

# Collect each detected entity as a (surface text, label) pair.
entities = list(map(lambda ent: (ent.text, ent.label_), doc.ents))

print(entities)


[('Paris', 'GPE'), ('John Smith', 'PERSON'), ('last Friday', 'DATE'), ('9.30 pm', 'TIME')]
