In [1]:
import nltk
nltk.download('punkt') #tokenizer
nltk.download('wordnet') #lemmatizer
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger') # Part of Speech

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jaehyungroh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jaehyungroh/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jaehyungroh/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaehyungroh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jaehyungroh/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
d1 = "He is a good guy, he is not bad"
d2 = "feet wolves cooked boys girls, !<@!"
d3 = "He is not a good guy, he is bad"

c1 = [d1, d2, d3]

In [3]:
# tokenizer 
token_d1 = nltk.word_tokenize(d1)
print(token_d1)

['He', 'is', 'a', 'good', 'guy', ',', 'he', 'is', 'not', 'bad']


In [4]:
# tokenizer w/ whitespace
tokenizer1 = nltk.tokenize.WhitespaceTokenizer()
token_d12 = tokenizer1.tokenize(d1)
print(token_d12)

['He', 'is', 'a', 'good', 'guy,', 'he', 'is', 'not', 'bad']


In [5]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
vectorizer1 = CountVectorizer()
vectorizer1.fit(c1) # find the basis for vector: bag of words
print(vectorizer1.vocabulary_) # number represent index

{'he': 7, 'is': 8, 'good': 5, 'guy': 6, 'not': 9, 'bad': 0, 'feet': 3, 'wolves': 10, 'cooked': 2, 'boys': 1, 'girls': 4}


In [7]:
v1 = vectorizer1.transform(c1)
print(v1.toarray()) # vector representation of counts list of words

[[1 0 0 0 0 1 1 2 2 1 0]
 [0 1 1 1 1 0 0 0 0 0 1]
 [1 0 0 0 0 1 1 2 2 1 0]]


In [8]:
# Stemming - the process of transforming a word into its stem(root)

stemmer = nltk.stem.PorterStemmer()
token_d2 = nltk.word_tokenize(d2)
print(token_d2)

['feet', 'wolves', 'cooked', 'boys', 'girls', ',', '!', '<', '@', '!']


In [9]:
#loop to iterate to the root
stemmered_token_d2 = [stemmer.stem(token) for token in token_d2 
                      if token.isalpha()]
print(stemmered_token_d2)

['feet', 'wolv', 'cook', 'boy', 'girl']


In [10]:
# Lemmatization - the process to use a pre-defined dictionary to lookup lemmas
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_token_d2 = []

for token in token_d2:
    if token.isalpha():
        lemmatized_token_d2.append(lemmatizer.lemmatize(token))
        # if word, change the word into rootform using lematizer and put into the list

print(lemmatized_token_d2)

# often used Stemmer & Lemmatization to cleanup

['foot', 'wolf', 'cooked', 'boy', 'girl']


In [12]:
# Stop words - articles, prepositions 
from nltk.corpus import stopwords

stopwords_removed = [token for token in token_d1 
                     if not token in stopwords.words('english')]
print(stopwords_removed)

['He', 'good', 'guy', ',', 'bad']


In [13]:
# low frequency
vectorizer2 = CountVectorizer(min_df = 2)
vectorizer2.fit(c1)
print(vectorizer2.vocabulary_)

{'he': 3, 'is': 4, 'good': 1, 'guy': 2, 'not': 5, 'bad': 0}


In [14]:
# Term Frequency - Inversed Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer3 = TfidfVectorizer()
vectorizer3.fit(c1)
v3 = vectorizer3.transform(c1)
print(v3.toarray())

[[0.28867513 0.         0.         0.         0.         0.28867513
  0.28867513 0.57735027 0.57735027 0.28867513 0.        ]
 [0.         0.4472136  0.4472136  0.4472136  0.4472136  0.
  0.         0.         0.         0.         0.4472136 ]
 [0.28867513 0.         0.         0.         0.         0.28867513
  0.28867513 0.57735027 0.57735027 0.28867513 0.        ]]


In [16]:
# ngram
vectorizer4 = TfidfVectorizer(ngram_range = (1,2), min_df=2)
vectorizer4.fit(c1)
print(vectorizer4.vocabulary_)

{'he': 5, 'is': 7, 'good': 1, 'guy': 3, 'not': 9, 'bad': 0, 'he is': 6, 'good guy': 2, 'guy he': 4, 'is not': 8}


In [17]:
# Part of Speech (POS)
d4 = "I drink water in parties"
d5 = "I grab a drink in parties"

c2 = [d4, d5]

token_d4 = nltk.word_tokenize(d4)
POS_token_d4 = nltk.pos_tag(token_d4)

print(POS_token_d4)

[('I', 'PRP'), ('drink', 'VBP'), ('water', 'NN'), ('in', 'IN'), ('parties', 'NNS')]


In [19]:
# especially for assignment
# 1. tokenize
# 2. Stemming & Lemmatizing & remove Stopwords & POS tag
# 3. Returns with list of tokens
# 4. CANNOT be accepted into vectorizer - will cause syntaxerror
# 5. Before applying vectorizer, join list of tokens back to text (sentences) for each document

POS_c2 = [] # list of re-joined docs after applying POS tag

for doc in c2:
    token_doc = nltk.word_tokenize(doc)
    POS_token_doc = nltk.pos_tag(token_doc)
    temp = [] # to store the concated token (token_POS)
    for i in POS_token_doc:
        temp.append(i[0]+"_"+i[1])
        
    # join the POS tagged taken back to a doc (sentence)
    POS_c2.append(" ".join(temp)) # shorter version
    #POS_c2.append(" ".join(j for j in temp)) - alternative way

print(POS_c2)
print(c2)

['I_PRP drink_VBP water_NN in_IN parties_NNS', 'I_PRP grab_VBP a_DT drink_NN in_IN parties_NNS']
['I drink water in parties', 'I grab a drink in parties']
