<a href="https://colab.research.google.com/github/ru81git454/textmining/blob/master/02_Text_to_Num_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# From Textual Information to Numerical Vector - Sample Code

## Getting Started

In [0]:
# Sample sentence used as the input for the tokenization cells below.
text ="I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I couldn't learn what it had to teach, and not, when I came to die, discover that I had not lived."

In [0]:
# Show the raw sentence before any processing is applied.
print(text)

I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I couldn't learn what it had to teach, and not, when I came to die, discover that I had not lived.


## Word Tokenization

In [0]:
# tokenization using wordpunct_tokenize function
import nltk  # nltk stands for Natural Language Toolkit
from nltk import wordpunct_tokenize
# Punctuation is split off from the word characters, so the contraction
# "couldn't" comes out as three tokens: 'couldn', "'", 't' (see the output).
punct_token = wordpunct_tokenize(text)
print(punct_token)

['I', 'went', 'to', 'the', 'woods', 'because', 'I', 'wished', 'to', 'live', 'deliberately', ',', 'to', 'front', 'only', 'the', 'essential', 'facts', 'of', 'life', ',', 'and', 'see', 'if', 'I', 'couldn', "'", 't', 'learn', 'what', 'it', 'had', 'to', 'teach', ',', 'and', 'not', ',', 'when', 'I', 'came', 'to', 'die', ',', 'discover', 'that', 'I', 'had', 'not', 'lived', '.']


In [0]:
## tokenization using word_tokenize function
# Download the 'punkt' tokenizer data before calling word_tokenize.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead --
# confirm against the installed NLTK version.
nltk.download('punkt')
# Here the contraction "couldn't" is split into 'could' and "n't" (see the
# output), unlike the wordpunct_tokenize result in the previous cell.
tokens = nltk.word_tokenize(text)
print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['I', 'went', 'to', 'the', 'woods', 'because', 'I', 'wished', 'to', 'live', 'deliberately', ',', 'to', 'front', 'only', 'the', 'essential', 'facts', 'of', 'life', ',', 'and', 'see', 'if', 'I', 'could', "n't", 'learn', 'what', 'it', 'had', 'to', 'teach', ',', 'and', 'not', ',', 'when', 'I', 'came', 'to', 'die', ',', 'discover', 'that', 'I', 'had', 'not', 'lived', '.']


## Stemming
The nltk package provides several stemmer implementations in its `stem` module.
Here we use `PorterStemmer` and `LancasterStemmer`.

In [0]:
#The Porter stemming algorithm
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Every call goes through the Porter stemmer `ps`. The Lancaster stemmer `ls`
# is only created in the next cell, so referencing it here would raise
# NameError on a fresh kernel (Restart & Run All).
print (ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped'))
print (ps.stem('lying'), ps.stem('strange'))

jump jump jump
lie strang


In [0]:
#The Lancaster stemming algorithm
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
# Same test words as in the Porter cell, so the two stemmers' outputs can be
# compared line by line.
print (ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped'))
print (ls.stem('lying'), ls.stem('strange'))


jump jump jump
lying strange


In [0]:
# Stem every token of the word_tokenize result with the Porter stemmer (ps).
# Punctuation tokens are passed through the stemmer as well.
token_stem = [ps.stem(w) for w in tokens]
print(token_stem)

['I', 'went', 'to', 'the', 'wood', 'becaus', 'I', 'wish', 'to', 'live', 'deliber', ',', 'to', 'front', 'onli', 'the', 'essenti', 'fact', 'of', 'life', ',', 'and', 'see', 'if', 'I', 'could', "n't", 'learn', 'what', 'it', 'had', 'to', 'teach', ',', 'and', 'not', ',', 'when', 'I', 'came', 'to', 'die', ',', 'discov', 'that', 'I', 'had', 'not', 'live', '.']


# Lemmatization

In [0]:
# Download the WordNet data before using the WordNet-based lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
# lemmatize verb/nouns
# The optional second argument is the part of speech of the word
# ("v" verb, "n" noun, 'a' adjective); the result depends on it.
print (wnl.lemmatize('lying', "v"))
print (wnl.lemmatize('lying', "n"))
print (wnl.lemmatize('lying')) # without a POS argument, noun is the default
print (wnl.lemmatize('saddest', 'a'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
lie
lying
lying
sad


## Remove stopwords

In [0]:
nltk.download('stopwords')
# Import the corpus reader under an alias: binding the set to the same name
# as the imported reader would shadow it, so `stopwords.words(...)` would stop
# working anywhere after this cell. `stopwords` stays the name of the set
# because later cells use it for membership tests.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = set(stopwords_corpus.words('english'))
print(stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
{'d', 'am', 'into', "you've", "you'd", 'during', "needn't", 'against', "couldn't", 'below', 'under', "doesn't", 'mustn', "it's", 'himself', 'whom', 'there', 'he', 'doing', "isn't", 'then', 'same', 'won', 'that', 'them', 're', 'didn', "haven't", "shan't", 'about', 'don', 'these', 'weren', 'between', 'of', 'above', 'yours', 'been', 'they', 'few', 'having', 'where', 'a', 'ma', 'when', 'the', 'what', 'shan', 'hadn', 'themselves', "mightn't", "weren't", 'hasn', 'through', 'isn', 'up', 'each', 'herself', 'to', 'from', 'has', 'his', 'she', 'their', 'this', 'did', 'her', 'no', "should've", "wouldn't", 'as', 'an', 'here', 'if', 'over', 'needn', 'my', 'are', 'ourselves', 'not', 'again', 's', 'but', 'your', 'only', 'couldn', 'being', 'we', 'be', 'most', 'than', "shouldn't", 'after', 'haven', "wasn't", 'for', 'those', 'more', "aren't", 'do', 'now', 'such', 't', 'had', 'ain', "didn't", 'my

Add punctuation into stopword set

In [0]:

import string
# Display the punctuation characters defined by the string module.
# NOTE(review): this cell only shows the characters; nothing here adds them to
# `stopwords`, and the next cell's output still contains ',' and '.'.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
# Keep only the tokens that are absent from the stop-word set.
# The membership test is case-sensitive, so the capitalised 'I' survives, and
# punctuation tokens are kept as well (see the output below).
filtered_tokens = [token for token in tokens if token not in stopwords]
print(filtered_tokens)

['I', 'went', 'woods', 'I', 'wished', 'live', 'deliberately', ',', 'front', 'essential', 'facts', 'life', ',', 'see', 'I', 'could', "n't", 'learn', 'teach', ',', ',', 'I', 'came', 'die', ',', 'discover', 'I', 'lived', '.']


# Part of Speech (POS) Tagging


In [0]:
import nltk
#averaged_perceptron_tagger
# A shorter version of the sample sentence is tokenized for tagging.
token_tag = nltk.word_tokenize("I went to the woods because I wished to live deliberately, to front only the essential facts of life")
# Download the tagger model before calling pos_tag.
# NOTE(review): recent NLTK releases may expect 'averaged_perceptron_tagger_eng'
# instead -- confirm against the installed NLTK version.
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
# Each token is paired with its tag, e.g. ('went', 'VBD') in the output below.
pos_tokens = nltk.pos_tag(token_tag)
print(pos_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[('I', 'PRP'), ('went', 'VBD'), ('to', 'TO'), ('the', 'DT'), ('woods', 'NNS'), ('because', 'IN'), ('I', 'PRP'), ('wished', 'VBD'), ('to', 'TO'), ('live', 'VB'), ('deliberately', 'RB'), (',', ','), ('to', 'TO'), ('front', 'VB'), ('only', 'RB'), ('the', 'DT'), ('essential', 'JJ'), ('facts', 'NNS'), ('of', 'IN'), ('life', 'NN')]


In [0]:
# Tag the same tokens again with the 'universal' tagset, which yields coarse
# categories such as PRON, VERB and NOUN (see the output below).
nltk.download('universal_tagset')
# Note: this rebinds pos_tokens, replacing the result of the previous cell.
pos_tokens = nltk.pos_tag(token_tag,tagset='universal')
print(pos_tokens)

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[('I', 'PRON'), ('went', 'VERB'), ('to', 'PRT'), ('the', 'DET'), ('woods', 'NOUN'), ('because', 'ADP'), ('I', 'PRON'), ('wished', 'VERB'), ('to', 'PRT'), ('live', 'VERB'), ('deliberately', 'ADV'), (',', '.'), ('to', 'PRT'), ('front', 'VERB'), ('only', 'ADV'), ('the', 'DET'), ('essential', 'ADJ'), ('facts', 'NOUN'), ('of', 'ADP'), ('life', 'NOUN')]


# Text Normalization

In [0]:
# Example string for the str-method demonstrations below; it ends with "...".
sentence="I went to the woods because I wished to live deliberately, to front only the essential facts of life..."

In [0]:
# Lower-cased copy of the sentence; `sentence` itself is not modified.
sentence.lower()

'i went to the woods because i wished to live deliberately, to front only the essential facts of life...'

In [0]:
# Upper-cased copy of the sentence.
sentence.upper()

In [0]:
# Copy with the first character upper-cased and all other characters lower-cased.
sentence.capitalize()

In [0]:
# First character of the sentence (index 0).
sentence[0]

In [0]:
# Last character of the sentence (negative indices count from the end).
sentence[-1]

In [0]:
# Slice with the characters at indices 1 through 9 (the end index is exclusive).
sentence[1:10]

In [0]:
"A".isupper()

In [0]:
# True when the string is non-empty and every character is alphabetic.
'woods'.isalpha()

In [0]:
# True when the string is non-empty and every character is a digit.
'20'.isdigit()

In [0]:
# True when the string is non-empty and every character is a decimal character.
'20'.isdecimal()

In [0]:
# True when the string is non-empty and every character is a letter or a digit.
'12ab'.isalnum() 

In [0]:
import re
# Replace every character that is not a word character (letter, digit or
# underscore) with one space, so "deliberately," and "life..." lose their
# punctuation. Each removed character leaves a space behind.
sentence="I went to the woods because I wished to live deliberately, to front only the essential facts of life..."
text_removed_pun = re.sub(pattern=r'[^\w]', repl=' ', string=sentence)
print(text_removed_pun)

I went to the woods because I wished to live deliberately  to front only the essential facts of life   


In [0]:
#tokenize using split()
# Lower-case the punctuation-free text first, then split it on whitespace;
# split() without arguments also drops the runs of spaces left by the
# punctuation removal.
tokens_split = text_removed_pun.lower().split()
print (tokens_split )

['i', 'went', 'to', 'the', 'woods', 'because', 'i', 'wished', 'to', 'live', 'deliberately', 'to', 'front', 'only', 'the', 'essential', 'facts', 'of', 'life']


# Term Frequency

In [0]:
import nltk

In [0]:
from collections import Counter
# Count how many times each lower-cased token occurs in tokens_split.
counter_text = Counter(tokens_split)
print(counter_text)

Counter({'to': 3, 'i': 2, 'the': 2, 'went': 1, 'woods': 1, 'because': 1, 'wished': 1, 'live': 1, 'deliberately': 1, 'front': 1, 'only': 1, 'essential': 1, 'facts': 1, 'of': 1, 'life': 1})


In [0]:
# calculate the word frequency
# nltk.FreqDist is built from the same token list as the Counter in the
# previous cell and reports the same counts (compare the two outputs).
fdist = nltk.FreqDist(tokens_split)
fdist

FreqDist({'because': 1,
          'deliberately': 1,
          'essential': 1,
          'facts': 1,
          'front': 1,
          'i': 2,
          'life': 1,
          'live': 1,
          'of': 1,
          'only': 1,
          'the': 2,
          'to': 3,
          'went': 1,
          'wished': 1,
          'woods': 1})

In [0]:
from tabulate import tabulate
# Print the 10 most common words of tokens_split (via fdist) as a two-column
# table with the headers "word" and "times".
print (tabulate(fdist.most_common(10), headers=["word","times"]))

word            times
------------  -------
to                  3
i                   2
the                 2
went                1
woods               1
because             1
wished              1
live                1
deliberately        1
front               1


# Tf-idf

In [0]:
# Two toy documents, tokenized on whitespace, used by the tf / idf examples.
doc1 = "this is a good sample"
doc2 = "this is another example another example example"
doc1_tokens, doc2_tokens = doc1.split(), doc2.split()
print("doc1_token:", doc1_tokens)
print("doc2_token:", doc2_tokens)

doc1_token: ['this', 'is', 'a', 'good', 'sample']
doc2_token: ['this', 'is', 'another', 'example', 'another', 'example', 'example']


In [0]:
# create tf function
def tf(term, token_doc):
    """Return the term frequency of `term` in a tokenized document.

    Args:
        term: the token to look for.
        token_doc: the document as a list of tokens.

    Returns:
        The number of occurrences of `term` divided by the number of tokens
        in `token_doc`, or 0.0 when the document is empty.
    """
    total_tokens = len(token_doc)
    # An empty document contains no terms; return 0.0 instead of letting the
    # division raise ZeroDivisionError.
    if total_tokens == 0:
        return 0.0
    return token_doc.count(term) / total_tokens

In [0]:
# 'this' is 1 of the 5 tokens of doc1; 'example' is 3 of the 7 tokens of doc2.
print("tf(this,doc1)=",tf("this",doc1_tokens))
print("tf(example,doc2)=",tf("example",doc2_tokens))

tf(this,doc1)= 0.2
tf(example,doc2)= 0.42857142857142855


In [0]:
# The document collection: a list holding the token list of each document.
# It is the input for the document-frequency and idf functions below.
token_doclist =[doc1_tokens, doc2_tokens]
print ("Doc list: ", token_doclist)

Doc list:  [['this', 'is', 'a', 'good', 'sample'], ['this', 'is', 'another', 'example', 'another', 'example', 'example']]


In [0]:
# create function to calculate how many doc contain the term
def numDocsContaining(word, token_doclist):
    """Return the number of documents in which `word` occurs at least once.

    Args:
        word: the token to look for.
        token_doclist: a list of documents, each given as a list of tokens.
    """
    return sum(1 for tokens_in_doc in token_doclist if tokens_in_doc.count(word) > 0)

In [0]:
# 'this' appears in both documents, 'example' only in doc2.
print ("numDocsContaining 'this':",numDocsContaining("this",token_doclist))
print ("numDocsContaining 'example':",numDocsContaining("example",token_doclist))

numDocsContaining 'this': 2
numDocsContaining 'example': 1


In [0]:
import math
# create function to calculate  Inverse Document Frequency in doclist
def idf(word, token_doclist):
    """Return the inverse document frequency of `word` in a document list.

    Args:
        word: the token to look up.
        token_doclist: a list of documents, each given as a list of tokens.

    Returns:
        log10(n / df), where n is the number of documents and df is the
        number of documents containing `word`. Returns 0.0 when no document
        contains `word` (this includes an empty document list).
    """
    n = len(token_doclist)
    df = numDocsContaining(word, token_doclist)
    # A word found in no document would make n/df divide by zero. Its term
    # frequency is 0 in every document anyway, so 0.0 keeps tf*idf at 0.
    if df == 0:
        return 0.0
    return math.log10(n/df)

In [0]:
# 'this' occurs in both documents, so its idf is log10(2/2) = 0;
# 'example' occurs in one of the two, so its idf is log10(2/1).
print ("idf of 'this':",idf("this",token_doclist))
print ("idf of 'example':",idf("example",token_doclist))

idf of 'this': 0.0
idf of 'example': 0.3010299956639812


In [0]:
# tf-idf is the product tf * idf. It is 0 when the term is in every document
# (idf = 0, 'this') or absent from the given document (tf = 0, 'example' in doc1).
print ("tfidf of 'this' in doc1:", tf("this",doc1_tokens)*idf("this",token_doclist))
print ("tfidf of 'example' in doc1:", tf("example",doc1_tokens)*idf("example",token_doclist))
print ("tfidf of 'example' in doc2:", tf("example",doc2_tokens)*idf("example",token_doclist))
                                                                    

tfidf of 'this' in doc1: 0.0
tfidf of 'example' in doc1: 0.0
tfidf of 'example' in doc2: 0.12901285528456335


# Count vectorizer

In [0]:
# Same two toy documents as in the Tf-idf section, tokenized on whitespace.
doc1, doc2 = "this is a good sample", "this is another example another example example"
doc1_tokens, doc2_tokens = (document.split() for document in (doc1, doc2))

In [0]:
from collections import Counter
# A Counter over a document's tokens gives that document's raw term counts.
print("Doc 1 counter:", Counter(doc1_tokens))
print("Doc 2 counter:", Counter(doc2_tokens))

Doc 1 counter: Counter({'this': 1, 'is': 1, 'a': 1, 'good': 1, 'sample': 1})
Doc 2 counter: Counter({'example': 3, 'another': 2, 'this': 1, 'is': 1})


# TFIDF vectorizer

In [0]:
# define function to calculate tfidf
def tfidf(word, doc, token_doclist):
  """Return tf(word, doc) multiplied by idf(word, token_doclist).

  Args:
    word: the token to score.
    doc: the document (list of tokens) passed to tf.
    token_doclist: the list of tokenized documents passed to idf.
  """
  term_frequency = tf(word, doc)
  inverse_doc_frequency = idf(word, token_doclist)
  return term_frequency * inverse_doc_frequency

In [0]:
# Print the tf-idf score (rounded to 4 decimals) of every distinct term of doc1.
# set() removes duplicate tokens; a set has no guaranteed iteration order, so
# the rows may be printed in a different order on another run.
for term in set(doc1_tokens):
   term_tfidf= tfidf(term,doc1_tokens,token_doclist)
   print(term,":", round(term_tfidf,4))
  

this : 0.0
is : 0.0
a : 0.0602
good : 0.0602
sample : 0.0602


In [0]:
# Same report for doc2: one line per distinct term with its rounded tf-idf.
# Terms that occur in both documents ('this', 'is') score 0 because their idf is 0.
for term in set(doc2_tokens):
   term_tfidf= tfidf(term,doc2_tokens,token_doclist)
   print(term,":", round(term_tfidf,4))
  

example : 0.129
another : 0.086
is : 0.0
this : 0.0


# Exercise

Download the "shakespeare-macbeth" text, a Project Gutenberg sample that is included in the NLTK corpus collection.

Compute the word frequencies in "shakespeare-macbeth.txt":
- read the sentences from the file "shakespeare-macbeth.txt"
- split the sentences into words (using split() or nltk word_tokenize)
- filter out symbols (using the string methods isalpha, isdigit, isalnum) and stop words
- normalize the words before counting ('Word' and 'word' are considered the same word)
- count the occurrences of each word.


In [0]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
# List the texts available in the Gutenberg sample.
print(gutenberg.fileids())
# Read the play through the corpus reader rather than open() on a hard-coded
# '/root/nltk_data/...' path. The reader locates the file wherever nltk_data
# is installed and leaves no file handle open.
macbeth_text = gutenberg.raw('shakespeare-macbeth.txt')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [0]:
# Print only the first 500 characters as a sanity check; printing the whole
# play would flood the notebook output.
print(macbeth_text[:500])