### Count Vectorization
- Create a vocab of all the words in the document
- Count how many times a word shows up in a document
- Forms a document Term Matrix (DTM); Stored as a sparse matrix to save space on the machine
- don't store common filer words ("a", "the", etc). or words common to a corpus of documents
### Term Frequency - Inverse Document Frequency (TF-IDF)
- TF - count how often a word shows up in a document
- IDF - Counting how unique some words are to a document; diminshs the weight of words which shows up very often
- IDF: log((docs containing the word)/N)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
with open("One.txt") as text:
    words_one = text.read().lower().split()
    uni_words_one = set(words_one)

In [13]:
uni_words_one

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'dogs',
 'furry',
 'is',
 'our',
 'pets',
 'story',
 'this'}

In [14]:
with open("Two.txt") as text:
    words_two = text.read().lower().split()
    uni_words_two = set(words_two)

In [15]:
uni_words_two

{'a',
 'about',
 'catching',
 'fun',
 'is',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [16]:
all_unique_words = uni_words_one

In [17]:
all_unique_words.update(uni_words_two)

In [18]:
all_unique_words

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'catching',
 'dogs',
 'fun',
 'furry',
 'is',
 'our',
 'pets',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [19]:
full_vocab = dict()
i = 0

for word in all_unique_words:
    full_vocab[word] = i
    i = i+1
    
full_vocab

{'animals': 0,
 'dogs': 1,
 'waves': 2,
 'sport': 3,
 'are': 4,
 'pets': 5,
 'our': 6,
 'canine': 7,
 'surfing': 8,
 'furry': 9,
 'about': 10,
 'catching': 11,
 'this': 12,
 'story': 13,
 'a': 14,
 'popular': 15,
 'fun': 16,
 'water': 17,
 'is': 18}

In [20]:
one_frequency = [0]*len(full_vocab)
two_frequency = [0]*len(full_vocab)
all_words = [0]*len(full_vocab)

In [21]:
with open("One.txt") as f:
    one_text = f.read().lower().split()

In [26]:
one_frequency = [0]*len(full_vocab)

for word in one_text:
    word_ind = full_vocab[word]
    one_frequency[word_ind] += 1

In [23]:
one_frequency

[1, 2, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1]

In [24]:
with open("Two.txt") as f:
    two_text = f.read().lower().split()

In [27]:
for word in two_text:
    word_ind = full_vocab[word]
    two_frequency[word_ind] += 1

In [28]:
two_frequency

[0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 3]

In [29]:
for word in full_vocab:
    word_ind = full_vocab[word]
    all_words[word_ind] = word

In [30]:
all_words

['animals',
 'dogs',
 'waves',
 'sport',
 'are',
 'pets',
 'our',
 'canine',
 'surfing',
 'furry',
 'about',
 'catching',
 'this',
 'story',
 'a',
 'popular',
 'fun',
 'water',
 'is']

In [33]:
bow = pd.DataFrame(data=[one_frequency, two_frequency], columns=all_words)

In [34]:
bow

Unnamed: 0,animals,dogs,waves,sport,are,pets,our,canine,surfing,furry,about,catching,this,story,a,popular,fun,water,is
0,1,2,0,0,1,1,1,1,0,1,1,0,1,1,1,0,0,0,1
1,0,0,1,1,0,0,0,0,2,0,1,1,1,1,1,1,1,1,3


In [36]:
text = ["This is a line",
       "this is another line", 
       "Completely different line"]

In [53]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [68]:
cv = CountVectorizer(stop_words="english")

In [69]:
sparse_text = cv.fit_transform(text)

In [70]:
sparse_text.todense()

matrix([[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1]])

In [71]:
cv.vocabulary_

{'line': 2, 'completely': 0, 'different': 1}

In [72]:
tfidf = TfidfTransformer()

In [73]:
results = tfidf.fit_transform(sparse_text)

In [74]:
results

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [75]:
results.todense()

matrix([[0.        , 0.        , 1.        ],
        [0.        , 0.        , 1.        ],
        [0.65249088, 0.65249088, 0.38537163]])

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [77]:
tv = TfidfVectorizer()

In [78]:
res = tv.fit_transform(text)

In [79]:
res.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])