In [1]:
import nltk
nltk.download('punkt')
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Tokenization

In [8]:
# Sentence fed to both the whitespace-based and the word-level tokenizer so
# their results can be compared side by side.
rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."

# Create the three class-based tokenizers up front.
lTokenizer, sTokenizer, tTokenizer = LineTokenizer(), SpaceTokenizer(), TweetTokenizer()

# LineTokenizer: one item per line of the input text.
print("Line tokenizer out :", lTokenizer.tokenize("My name is Maximus \n and you?"))

# SpaceTokenizer: splits on spaces only, so punctuation stays attached to
# the neighbouring word.
print("Space Tokenizer 출력: ", sTokenizer.tokenize(rawText))

# word_tokenize: punctuation marks come out as separate tokens.
print("Word Tokenizer 출력: ", word_tokenize(rawText))

# TweetTokenizer: hashtags and emoticons are kept as single tokens.
print("Tweet Tokenizer 출력: ", tTokenizer.tokenize("This is a coooool #dummysmiley: :-) :-P"))

Line tokenizer out : ['My name is Maximus ', ' and you?']
Space Tokenizer 출력:  ['By', '11', "o'clock", 'on', 'Sunday,', 'the', 'doctor', 'shall', 'open', 'the', 'dispensary.']
Word Tokenizer 출력:  ['By', '11', "o'clock", 'on', 'Sunday', ',', 'the', 'doctor', 'shall', 'open', 'the', 'dispensary', '.']
Tweet Tokenizer 출력:  ['This', 'is', 'a', 'coooool', '#dummysmiley', ':', ':-)', ':-P']


# Stemming

In [10]:
from nltk import PorterStemmer, LancasterStemmer, word_tokenize

In [12]:
# Tokenize the sample sentence once; both stemmers consume the same tokens
# so the two printed lists can be compared position by position.
raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North"
tokens = word_tokenize(raw)

# Porter stemmer output.
porter = PorterStemmer()
pStems = list(map(porter.stem, tokens))
print(pStems)

# Lancaster stemmer output for the very same tokens.
lancaster = LancasterStemmer()
lStems = list(map(lancaster.stem, tokens))
print(lStems)

['My', 'name', 'is', 'maximu', 'decimu', 'meridiu', ',', 'command', 'of', 'the', 'armi', 'of', 'the', 'north']
['my', 'nam', 'is', 'maxim', 'decim', 'meridi', ',', 'command', 'of', 'the', 'army', 'of', 'the', 'nor']


# Lemmatization

In [14]:
import nltk
nltk.download('wordnet')
from nltk import word_tokenize, PorterStemmer, WordNetLemmatizer
# Sample sentence, tokenized once and reused for both stemming and
# lemmatization.
raw = "My name is Maximus Decimus Meridius, commander of the armies of the north, General of the Felix"
tokens = word_tokenize(raw)

# Stemming: printed first as the baseline to compare the lemmas against.
porter = PorterStemmer()
stems = list(map(porter.stem, tokens))
print(stems)

# Lemmatization: WordNetLemmatizer is called with its default arguments on
# each token (no part-of-speech hint is supplied).
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, tokens))
print(lemmas)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['My', 'name', 'is', 'maximu', 'decimu', 'meridiu', ',', 'command', 'of', 'the', 'armi', 'of', 'the', 'north', ',', 'gener', 'of', 'the', 'felix']
['My', 'name', 'is', 'Maximus', 'Decimus', 'Meridius', ',', 'commander', 'of', 'the', 'army', 'of', 'the', 'north', ',', 'General', 'of', 'the', 'Felix']


In [15]:
import nltk
nltk.download('gutenberg')
nltk.download('stopwords')
from nltk.corpus import gutenberg
# Number of top entries to list and to plot; change it here to widen or
# narrow every report in this cell at once.
TOP_N = 10

print(gutenberg.fileids())
gb_words = gutenberg.words('bible-kjv.txt')

# Normalize case and drop tokens shorter than three characters.
words_filtered = [e.lower() for e in gb_words if len(e) >= 3]

# `stopwords` stays a list (as before) for any later cell that uses it; the
# set copy turns each membership test into a hash lookup instead of a scan
# of the whole list for every token in the corpus.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)

# `words_filtered` is already lower-cased, so no second .lower() is needed.
words = [w for w in words_filtered if w not in stopword_set]

# fdist: filtered words only; fdist2: every raw token, for comparison.
fdist = nltk.FreqDist(words)
fdist2 = nltk.FreqDist(gb_words)

print(f'Following are the most common {TOP_N} words in the bag')
print(fdist2.most_common(TOP_N))
print(f'Following are the most common {TOP_N} words in the bag minus the stopwords')
print(fdist.most_common(TOP_N))

# Plot only the TOP_N most frequent words: calling plot() without a limit
# draws every distinct word, which is slow and leaves the x-axis unreadable.
fdist.plot(TOP_N)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Following are the most common 10 words in the bag
[(',', 70509), ('the', 62103), (':', 43766), ('and', 38847), ('of', 34480), ('.', 26160), ('to', 13396), ('And', 12846), ('that', 12576), ('in', 12331)]
Following are the most common 10 words in the bag minus the stopwords
[('shall', 9838), ('unto', 8997), ('lord', 7964), ('thou', 5474), ('thy', 4600), ('god', 4472), ('said', 3999), ('thee', 3827), ('upon', 2748), ('man', 2735)]


<Figure size 640x480 with 1 Axes>

# 편집거리 측정

In [17]:
from nltk.metrics.distance import edit_distance

def my_edit_distance(str1, str2, substitution_cost=1):
    """Return the Levenshtein edit distance between two sequences.

    The distance is the minimum total cost of insertions, deletions and
    substitutions needed to turn ``str1`` into ``str2``. Insertions and
    deletions always cost 1.

    Args:
        str1: Source sequence (typically a string).
        str2: Target sequence (typically a string).
        substitution_cost: Cost of replacing one element with a different
            one. Defaults to 1, which gives the classic Levenshtein distance.

    Returns:
        The edit distance as a number; ``len(str2)`` when ``str1`` is empty
        and ``len(str1)`` when ``str2`` is empty.

    Only the previous row of the dynamic-programming table is kept, so the
    memory use is proportional to ``len(str2)`` rather than to the product
    of both lengths.
    """
    # Row for the empty prefix of str1: reaching str2[:j] takes j insertions.
    previous = list(range(len(str2) + 1))

    for i, left in enumerate(str1, start=1):
        # Column 0: reducing str1[:i] to the empty sequence takes i deletions.
        current = [i]
        for j, right in enumerate(str2, start=1):
            cost = 0 if left == right else substitution_cost
            current.append(min(
                current[j - 1] + 1,      # insertion
                previous[j] + 1,         # deletion
                previous[j - 1] + cost,  # match or substitution
            ))
        previous = current

    # The last cell of the last computed row is the full-sequence distance;
    # this is well defined even when either input is empty.
    return previous[-1]

# Run the hand-written implementation and NLTK's reference implementation
# on the same pair of words so the two results can be compared directly.
for label, distance_fn in (
    ("Our Algorithm:", my_edit_distance),
    ("NLTK Algorithm:", edit_distance),
):
    print(label, distance_fn("hand", "and"))

Our Algorithm: 1
NLTK Algorithm: 1
