## Tokenization


In [45]:
# Sample passage used by every tokenization / stemming / lemmatization cell.
# Adjacent string literals are concatenated by Python into one string.
text = (
    "Natural language processing (NLP) is a field of artificial intelligence "
    "that focuses on the interaction between computers and humans through "
    "natural language. The goal is to enable computers to understand, "
    "interpret, and generate human language in a way that is both meaningful "
    "and useful. Applications of NLP include machine translation, sentiment "
    "analysis, and chatbots."
)

In [46]:
# Naive tokenization: cut on single spaces only, so punctuation stays
# attached to the neighbouring word (e.g. '(NLP)', 'language.').
word_split = text.split(sep=" ")
word_split

['Natural',
 'language',
 'processing',
 '(NLP)',
 'is',
 'a',
 'field',
 'of',
 'artificial',
 'intelligence',
 'that',
 'focuses',
 'on',
 'the',
 'interaction',
 'between',
 'computers',
 'and',
 'humans',
 'through',
 'natural',
 'language.',
 'The',
 'goal',
 'is',
 'to',
 'enable',
 'computers',
 'to',
 'understand,',
 'interpret,',
 'and',
 'generate',
 'human',
 'language',
 'in',
 'a',
 'way',
 'that',
 'is',
 'both',
 'meaningful',
 'and',
 'useful.',
 'Applications',
 'of',
 'NLP',
 'include',
 'machine',
 'translation,',
 'sentiment',
 'analysis,',
 'and',
 'chatbots.']

In [47]:
import nltk
# Fetch the 'punkt' tokenizer data used by NLTK's sentence/word tokenizers.
# The call reports "already up-to-date" when the package is present locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/saloni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
from nltk import sent_tokenize, word_tokenize

In [49]:
# Re-display the raw passage before running the NLTK tokenizers on it.
text

'Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The goal is to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful. Applications of NLP include machine translation, sentiment analysis, and chatbots.'

In [50]:
# Sentence segmentation: one list entry per sentence of the passage.
sentence_tokens = sent_tokenize(text)
sentence_tokens

['Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language.',
 'The goal is to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful.',
 'Applications of NLP include machine translation, sentiment analysis, and chatbots.']

In [51]:
# Word-level tokenization: unlike str.split, punctuation and brackets
# become separate tokens.
word_tokens = word_tokenize(text)
word_tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'artificial',
 'intelligence',
 'that',
 'focuses',
 'on',
 'the',
 'interaction',
 'between',
 'computers',
 'and',
 'humans',
 'through',
 'natural',
 'language',
 '.',
 'The',
 'goal',
 'is',
 'to',
 'enable',
 'computers',
 'to',
 'understand',
 ',',
 'interpret',
 ',',
 'and',
 'generate',
 'human',
 'language',
 'in',
 'a',
 'way',
 'that',
 'is',
 'both',
 'meaningful',
 'and',
 'useful',
 '.',
 'Applications',
 'of',
 'NLP',
 'include',
 'machine',
 'translation',
 ',',
 'sentiment',
 'analysis',
 ',',
 'and',
 'chatbots',
 '.']

## Stemming

In [52]:
# TODO: add some examples using SnowballStemmer (imported here, but not used
# in the stemming cells shown below).
from nltk.stem import PorterStemmer, SnowballStemmer
# Single Porter stemmer instance reused by the stemming cells.
porter = PorterStemmer()

In [53]:
# Stem a single inflected word to check the stemmer's behaviour.
original_word = "works"
porter.stem(original_word)

'work'

In [54]:
# Stem every token. A list comprehension is used rather than a generator
# expression: a generator is lazy, so displaying it shows only
# "<generator object ...>" instead of the stems themselves.
stem_word_tokens = [porter.stem(word) for word in word_tokens]
stem_word_tokens

['natur',
 'languag',
 'process',
 '(',
 'nlp',
 ')',
 'is',
 'a',
 'field',
 'of',
 'artifici',
 'intellig',
 'that',
 'focus',
 'on',
 'the',
 'interact',
 'between',
 'comput',
 'and',
 'human',
 'through',
 'natur',
 'languag',
 '.',
 'the',
 'goal',
 'is',
 'to',
 'enabl',
 'comput',
 'to',
 'understand',
 ',',
 'interpret',
 ',',
 'and',
 'gener',
 'human',
 'languag',
 'in',
 'a',
 'way',
 'that',
 'is',
 'both',
 'meaning',
 'and',
 'use',
 '.',
 'applic',
 'of',
 'nlp',
 'includ',
 'machin',
 'translat',
 ',',
 'sentiment',
 'analysi',
 ',',
 'and',
 'chatbot',
 '.']

In [55]:
# Stem each token and stitch the stems back into one space-separated string.
stem_word_tokens = " ".join(map(porter.stem, word_tokens))
stem_word_tokens

'natur languag process ( nlp ) is a field of artifici intellig that focus on the interact between comput and human through natur languag . the goal is to enabl comput to understand , interpret , and gener human languag in a way that is both meaning and use . applic of nlp includ machin translat , sentiment analysi , and chatbot .'

## Lemmatization

In [56]:
# Download the WordNet corpus data before creating the WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/saloni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [57]:
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance reused by all lemmatization cells below.
lem = WordNetLemmatizer()

In [58]:
# No part-of-speech argument is passed; the result is 'fighting', unchanged.
lem.lemmatize('fighting')

'fighting'

In [59]:
# Plural noun: reduced to the singular 'fighter'.
lem.lemmatize('fighters')

'fighter'

In [60]:
# Verb form without a POS argument: returned unchanged as 'eats'.
lem.lemmatize('eats')

'eats'

In [61]:
# Past participle without a POS argument: also returned unchanged.
lem.lemmatize('eaten')

'eaten'

In [62]:
# Irregular plural noun: 'feet' is mapped to 'foot'.
lem.lemmatize('feet')

'foot'

In [63]:
# Passing the verb POS lets the lemmatizer resolve the irregular past form.
lem.lemmatize("learnt", pos="v")

'learn'

In [64]:
# Same surface form, treated as a verb: lemma is 'strip'.
lem.lemmatize("stripes", pos="v")

'strip'

In [65]:
# Same surface form, treated as a noun: lemma is 'stripe'.
lem.lemmatize("stripes", pos="n")

'stripe'

In [66]:
# Lowercase and lemmatize every token (no POS argument is passed), then
# rebuild a single string. Tokens are joined with ONE space so the result has
# the same layout as the stemmed string above; the previous two-space
# separator left doubled blanks between every token.
lem_word_tokens = " ".join(lem.lemmatize(word.lower()) for word in word_tokens)
lem_word_tokens

'natural language processing ( nlp ) is a field of artificial intelligence that focus on the interaction between computer and human through natural language . the goal is to enable computer to understand , interpret , and generate human language in a way that is both meaningful and useful . application of nlp include machine translation , sentiment analysis , and chatbots .'

## POS Tagging
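NLTK's default English tagger uses the Penn Treebank tag set, which defines 36 part-of-speech tags (plus separate tags for punctuation).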

In [69]:
# Download the tagger model data (unzipped under taggers/ per the log below);
# presumably this is the model nltk.pos_tag loads — confirm for your NLTK version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/saloni/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [78]:
from nltk import pos_tag

In [79]:
# pos_tag expects a list of tokens; a one-item list tags the word with no context.
pos_tag(['countries'])

[('countries', 'NNS')]

In [80]:
# Isolated '-ed' form: tagged VBN in the output below.
pos_tag(['laughed'])

[('laughed', 'VBN')]

In [82]:
# Isolated '-ing' form: tagged VBG in the output below.
pos_tag(['walking'])

[('walking', 'VBG')]

In [83]:
# 'work' can be a noun or a verb; with no context it is tagged NN here.
pos_tag(['work'])

[('work', 'NN')]

In [85]:
# Tag the whole token sequence so each word is tagged in sentence context.
# Note the output is not perfect: 'enable' (a verb in "to enable computers")
# is tagged JJ.
pos_tag(word_tokens)

[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('field', 'NN'),
 ('of', 'IN'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('that', 'WDT'),
 ('focuses', 'VBZ'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('interaction', 'NN'),
 ('between', 'IN'),
 ('computers', 'NNS'),
 ('and', 'CC'),
 ('humans', 'NNS'),
 ('through', 'IN'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('goal', 'NN'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('enable', 'JJ'),
 ('computers', 'NNS'),
 ('to', 'TO'),
 ('understand', 'VB'),
 (',', ','),
 ('interpret', 'VB'),
 (',', ','),
 ('and', 'CC'),
 ('generate', 'VB'),
 ('human', 'JJ'),
 ('language', 'NN'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('way', 'NN'),
 ('that', 'WDT'),
 ('is', 'VBZ'),
 ('both', 'DT'),
 ('meaningful', 'JJ'),
 ('and', 'CC'),
 ('useful', 'JJ'),
 ('.', '.'),
 ('Applications', 'NNS'),
 ('of', 'IN'),
 ('NLP', 'NNP'),
 ('include', 'VBP'),
 ('machine', 'NN'),
 

## Text Preprocessing