In [1]:
!pip install nltk



In [2]:
import nltk
nltk.download('punkt') # fetch the 'punkt' model package; reports "already up-to-date" when it is present

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shangjingbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Word-level tokenization: punctuation marks come out as separate tokens.
from nltk.tokenize import word_tokenize
text = "God is Great! I won a lottery."
print(word_tokenize(text))

# Output: ['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']


In [4]:
# Harder input: contractions/possessives are split off as "'re" / "'s", and the
# emoticons at the end are broken into single-character tokens (see output).
text = "You're the best! Boy's running. The boy's car is awesome! :P :(("
print(word_tokenize(text))

['You', "'re", 'the', 'best', '!', 'Boy', "'s", 'running', '.', 'The', 'boy', "'s", 'car', 'is', 'awesome', '!', ':', 'P', ':', '(', '(']


In [5]:
# Sentence-level tokenization: each sentence keeps its terminating punctuation.
from nltk.tokenize import sent_tokenize
text = "God is Great! I won a lottery."
print(sent_tokenize(text))

# Output: ['God is Great!', 'I won a lottery.']

['God is Great!', 'I won a lottery.']


In [6]:
# Harder input: the decimal point in "$4.99" does not end a sentence, and the
# runs "??!" and "....." stay attached to their sentences (see output).
text = "The nuggests are priced at $4.99. Are you crazy??! I'm not sure what's the effect the dots....."
print(sent_tokenize(text))

['The nuggests are priced at $4.99.', 'Are you crazy??!', "I'm not sure what's the effect the dots....."]


In [7]:
# Stemming demo: every variant below is reduced to the stem "program" (see output).
from nltk.stem import PorterStemmer 

ps = PorterStemmer() 
words = ["program", "programs", "programer", "programing", "programers"] 

# Print each word next to its stem.
for w in words: 
    print(w, " : ", ps.stem(w)) 

program  :  program
programs  :  program
programer  :  program
programing  :  program
programers  :  program


In [8]:
# Setup for the NLTK pipeline defined below; repeats the imports and the
# 'punkt' download from the cells above.
import nltk
nltk.download('punkt') # fetch the 'punkt' model package; reports "already up-to-date" when it is present

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer 

# Stemmer instance read as a global by pre_processing_by_nltk.
ps = PorterStemmer() 

# Tokenize a document with NLTK; returns a flat token list or one list per sentence.
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    """Split ``doc`` into sentences and word tokens using NLTK.

    Args:
        doc: Text to process; it is handed to ``sent_tokenize``.
        stemming: When true, each token is replaced by ``ps.stem(token)``,
            where ``ps`` is the notebook-level ``PorterStemmer`` instance.
        need_sent: When true, return one token list per sentence; otherwise
            return a single flat list with the tokens of all sentences in
            their original order.

    Returns:
        A list of token lists if ``need_sent`` is true, else a flat list of
        tokens.
    """
    per_sentence = []
    for sentence in sent_tokenize(doc):
        pieces = word_tokenize(sentence)
        per_sentence.append([ps.stem(piece) for piece in pieces] if stemming else pieces)
    if need_sent:
        return per_sentence
    # Flatten sentence by sentence so the token order is preserved.
    return [token for sentence_tokens in per_sentence for token in sentence_tokens]

    
# Sample text with a price, stacked punctuation and trailing dots.
test_case1 = "The nuggests are priced at $4.99. Business Are you fuzzy dity dizzy crazy??! I'm not sure what's the effect the dots....."

# need_sent = True keeps one token list per sentence (see output).
print(pre_processing_by_nltk(test_case1, need_sent = True))

[['the', 'nuggest', 'are', 'price', 'at', '$', '4.99', '.'], ['busi', 'are', 'you', 'fuzzi', 'diti', 'dizzi', 'crazi', '?', '?', '!'], ['i', "'m", 'not', 'sure', 'what', "'s", 'the', 'effect', 'the', 'dot', '.....']]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shangjingbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
!pip install spacy
!python -m spacy download en_core_web_sm



In [10]:
import spacy
# Load the en_core_web_sm pipeline fetched by the download cell above.
nlp_processor = spacy.load("en_core_web_sm")

In [11]:
# Calling the pipeline on a string returns a spaCy Doc object (see output).
text = "God is Great! I won a lottery."
doc = nlp_processor(text)
print(type(doc))

<class 'spacy.tokens.doc.Doc'>


In [12]:
# doc.sents iterates over the sentences found in the text, one per line below.
for sent in doc.sents:
    print(sent)

God is Great!
I won a lottery.


In [13]:

# Iterating a Doc yields Token objects; .text is the token's string form.
for word in doc:
    print(type(word))
    print(word.text)

<class 'spacy.tokens.token.Token'>
God
<class 'spacy.tokens.token.Token'>
is
<class 'spacy.tokens.token.Token'>
Great
<class 'spacy.tokens.token.Token'>
!
<class 'spacy.tokens.token.Token'>
I
<class 'spacy.tokens.token.Token'>
won
<class 'spacy.tokens.token.Token'>
a
<class 'spacy.tokens.token.Token'>
lottery
<class 'spacy.tokens.token.Token'>
.


In [14]:
# Print each token next to its lemma_, e.g. 'is' -> 'be' and 'won' -> 'win' (see output).
for word in doc:
    print(word.text,  word.lemma_)

God God
is be
Great great
! !
I I
won win
a a
lottery lottery
. .


In [15]:
import en_core_web_sm
# Rebind nlp_processor by loading the pipeline through its own package
# (an earlier cell loaded "en_core_web_sm" via spacy.load).
nlp_processor = en_core_web_sm.load()

# Tokenize a document with spaCy; returns a flat token list or one list per sentence.
def pre_processing_by_spacy(doc, lemma = True, need_sent = False):
    """Split ``doc`` into sentences and tokens using the spaCy pipeline.

    Args:
        doc: Text to process; it is passed to the notebook-level
            ``nlp_processor`` pipeline.
        lemma: When true, each token contributes its ``lemma_``; otherwise
            its ``text``.
        need_sent: When true, return one list of strings per sentence;
            otherwise return a single flat list with the tokens of all
            sentences in their original order.

    Returns:
        A list of string lists if ``need_sent`` is true, else a flat list of
        strings.
    """
    # Pick the token attribute once instead of branching per sentence.
    attribute = "lemma_" if lemma else "text"
    per_sentence = [
        [getattr(token, attribute) for token in span]
        for span in nlp_processor(doc).sents
    ]
    if need_sent:
        return per_sentence
    # Flatten sentence by sentence so the token order is preserved.
    return [item for sentence_items in per_sentence for item in sentence_items]
# Same sample text as the NLTK run, for comparison: the lemmas here are
# 'business' / 'fuzzy' where the stemmer output was 'busi' / 'fuzzi'.
pre_processing_by_spacy(test_case1, need_sent = True)

[['the', 'nuggest', 'be', 'price', 'at', '$', '4.99', '.'],
 ['business', 'be', 'you', 'fuzzy', 'dity', 'dizzy', 'crazy', '?', '?', '!'],
 ['I',
  'be',
  'not',
  'sure',
  'what',
  'be',
  'the',
  'effect',
  'the',
  'dot',
  '.....']]

In [16]:
# Comparative/superlative forms: 'better' -> 'well' and 'best' -> 'good' in the recorded output.
pre_processing_by_spacy("it's good. It's better. It's best")

['it', 'be', 'good', '.', 'it', 'be', 'well', '.', 'it', 'be', 'good']

In [22]:
# Same text as the word_tokenize example: here the emoticons ':P' and ':((' stay single tokens.
pre_processing_by_spacy("You're the best! Boy's running. The boy's car is awesome! :P :((")

['you',
 'be',
 'the',
 'good',
 '!',
 'Boy',
 "'s",
 'running',
 '.',
 'the',
 'boy',
 "'s",
 'car',
 'be',
 'awesome',
 '!',
 ':P',
 ':((']

In [17]:
# Tense is normalized: 'finished' -> 'finish' (see output).
pre_processing_by_spacy("I finished the hw1. I will finish hw1.")

['I', 'finish', 'the', 'hw1', '.', 'I', 'will', 'finish', 'hw1', '.']

In [18]:
# 'was' and 'am' both map to the lemma 'be' (see output).
pre_processing_by_spacy("I was a programmer. I am a programmer.")

['I', 'be', 'a', 'programmer', '.', 'I', 'be', 'a', 'programmer', '.']

In [19]:
# The stemmer leaves 'good', 'better' and 'best' unchanged (see output).
pre_processing_by_nltk("good better best")

['good', 'better', 'best']

In [23]:
# Lemmatizing the same words gives ['good', 'well', 'good'] (see output).
pre_processing_by_spacy("good better best")

['good', 'well', 'good']

In [27]:
# With 'well' added in front, 'better' is still lemmatized to 'well' (see output).
pre_processing_by_spacy("well good better best")

['well', 'good', 'well', 'good']

In [26]:
# 'better' after 'is' also comes out as 'well' (see output).
pre_processing_by_spacy("it is better.")

['it', 'be', 'well', '.']

In [25]:
# 'better' after a verb: output is ['do', 'well'].
pre_processing_by_spacy("did better")

['do', 'well']

In [20]:
# The stemmer also reduces 'finished' to 'finish'; 'I' comes out lowercased (see output).
pre_processing_by_nltk("I finished the hw1. I will finish hw1.")

['i', 'finish', 'the', 'hw1', '.', 'i', 'will', 'finish', 'hw1', '.']

In [21]:
# The stemmer keeps 'were' and 'am' as they are and truncates 'programmer' to 'programm' (see output).
pre_processing_by_nltk("You were a programmer. I am a programmer.")

['you', 'were', 'a', 'programm', '.', 'i', 'am', 'a', 'programm', '.']