# Vectorization of Tokenized Texts

In [1]:
import nltk
import string

In [2]:
def tokenize(text):
    """Yield the lower-cased, Snowball-stemmed word tokens of ``text``.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens
    that consist solely of punctuation characters are dropped; every other
    token is stemmed with the English Snowball stemmer.

    Args:
        text: The raw document string.

    Yields:
        One stemmed token (str) per non-punctuation token, in input order.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # `token in string.punctuation` would be a substring test, which lets
        # multi-character punctuation tokens such as "..." or "--" through.
        # Check every character instead (an empty token is skipped as well).
        if all(char in string.punctuation for char in token):
            continue
        yield stem.stem(token)

In [3]:
# Sample corpus of three short English sentences.
# NOTE(review): `corpus` is reassigned in the next cell before anything reads
# it, so these sentences never reach the vectorizers below.
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

In [4]:
# Toy corpus read by the vectorization and term-frequency cells below:
# three short documents that share several repeated words.
corpus = [
    "The faster kishore got to the store and store got", 
    "the faster kishore got out got out again",
    "the faster kishore got home faster home home home",
]

In [5]:
from collections import defaultdict

def vectorize(doc):
    """Return a frequency vector for a single document.

    Args:
        doc: The raw document string; it is split into stems by ``tokenize``.

    Returns:
        A ``defaultdict(int)`` mapping each stem to the number of times it
        occurs in ``doc``, keyed in order of first appearance.
    """
    counts = defaultdict(int)
    for stem in tokenize(doc):
        # Missing stems start at 0 thanks to the int default factory.
        counts[stem] = counts[stem] + 1
    return counts

# Build the count vectors eagerly. A bare map() is a one-shot iterator: the
# first cell that consumes it leaves it empty, so re-running a later cell
# would silently display [].
vectors = [vectorize(doc) for doc in corpus]

In [6]:
# Display the per-document token counts.
list(vectors)

[defaultdict(int,
             {'the': 2,
              'faster': 1,
              'kishor': 1,
              'got': 2,
              'to': 1,
              'store': 2,
              'and': 1}),
 defaultdict(int,
             {'the': 1,
              'faster': 1,
              'kishor': 1,
              'got': 2,
              'out': 2,
              'again': 1}),
 defaultdict(int, {'the': 1, 'faster': 2, 'kishor': 1, 'got': 1, 'home': 4})]

# Using sklearn

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the corpus; fit_transform returns a sparse matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print("Shape: ", X.shape)
# toarray() gives a plain ndarray; todense() would return the legacy
# np.matrix type. The printed text is the same either way.
print(X.toarray())

Shape:  (3, 10)
[[0.         0.32464326 0.19173954 0.38347908 0.         0.19173954
  0.         0.64928652 0.38347908 0.32464326]
 [0.36657365 0.         0.2165043  0.43300861 0.         0.2165043
  0.7331473  0.         0.2165043  0.        ]
 [0.         0.         0.27506398 0.13753199 0.93144761 0.13753199
  0.         0.         0.13753199 0.        ]]


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts with scikit-learn's CountVectorizer, fitted on the corpus.
# NOTE(review): this rebinds `vectorizer` (previously the TfidfVectorizer) and
# `vectors` (previously the hand-rolled count vectors) to different kinds of
# objects; distinct names would make out-of-order re-runs safer.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

In [9]:
# Show the fitted counts as a dense array.
vectors.toarray()

array([[0, 1, 1, 2, 0, 1, 0, 2, 2, 1],
       [1, 0, 1, 2, 0, 1, 2, 0, 1, 0],
       [0, 0, 2, 1, 4, 1, 0, 0, 1, 0]], dtype=int64)

# Term Frequency Computation

In [10]:
def tf(corpus):
    """Compute corpus-wide term frequencies.

    Each document is split on whitespace; words are case-sensitive and keep
    any attached punctuation. The raw count of every word is printed, then
    replaced by its share of all tokens in the corpus.

    Args:
        corpus: Iterable of document strings. It is traversed exactly once,
            so one-shot iterables (e.g. generators) are accepted.

    Returns:
        dict mapping each distinct word to ``count / total_tokens``, keyed in
        order of first appearance. An empty corpus yields ``{}``.
    """
    counts = {}
    total_tokens = 0
    for document in corpus:
        tokens = document.split()
        # Accumulate the denominator here rather than re-splitting the whole
        # corpus once per distinct word later on.
        total_tokens += len(tokens)
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
    # Only existing keys are reassigned, so mutating during iteration is safe.
    for word, freq in counts.items():
        print(word, freq)
        counts[word] = freq / total_tokens
    return counts
# Corpus-wide term frequencies; the call also prints each word's raw count.
tf(corpus)

The 1
faster 4
kishore 3
got 5
to 1
the 3
store 2
and 1
out 2
again 1
home 4


{'The': 0.037037037037037035,
 'faster': 0.14814814814814814,
 'kishore': 0.1111111111111111,
 'got': 0.18518518518518517,
 'to': 0.037037037037037035,
 'the': 0.1111111111111111,
 'store': 0.07407407407407407,
 'and': 0.037037037037037035,
 'out': 0.07407407407407407,
 'again': 0.037037037037037035,
 'home': 0.14814814814814814}

# TF on each document in the corpus

In [11]:
def tf(corpus):
    """Compute term frequencies separately for every document.

    Each document is split on whitespace; words are case-sensitive and keep
    any attached punctuation. For every document the raw count of each word
    is printed, then replaced by its share of that document's tokens.

    NOTE(review): this definition rebinds the name ``tf`` used by the
    corpus-wide version defined earlier in the notebook.

    Args:
        corpus: Iterable of document strings.

    Returns:
        list with one dict per document, in corpus order, mapping each word
        to ``count / number_of_tokens_in_that_document``. A document without
        tokens yields an empty dict.
    """
    tfs = []
    for document in corpus:
        # Split once per document instead of once per distinct word.
        tokens = document.split()
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        # Only existing keys are reassigned, so mutating during iteration is safe.
        for word, freq in counts.items():
            print(word, freq)
            counts[word] = freq / len(tokens)
        tfs.append(counts)
    return tfs

In [12]:
# Per-document term frequencies, one dict per document; raw counts are printed too.
tf(corpus)

The 1
faster 1
kishore 1
got 2
to 1
the 1
store 2
and 1
the 1
faster 1
kishore 1
got 2
out 2
again 1
the 1
faster 2
kishore 1
got 1
home 4


[{'The': 0.1,
  'faster': 0.1,
  'kishore': 0.1,
  'got': 0.2,
  'to': 0.1,
  'the': 0.1,
  'store': 0.2,
  'and': 0.1},
 {'the': 0.125,
  'faster': 0.125,
  'kishore': 0.125,
  'got': 0.25,
  'out': 0.25,
  'again': 0.125},
 {'the': 0.1111111111111111,
  'faster': 0.2222222222222222,
  'kishore': 0.1111111111111111,
  'got': 0.1111111111111111,
  'home': 0.4444444444444444}]