# TF-IDF Example Day 21
#### Credit goes to https://www.youtube.com/watch?v=hXNbFNCgPfY

In [1]:
# Let's start with a brief corpus of documents. A corpus is a collection of documents.
docA = "the cat sat on my face"
docB = "the dog sat on my bed"


### Tokenizing: bag of words
Splitting a document into its component words, as below, is called 'tokenizing'.



In [2]:
# Tokenize each document by splitting on single spaces; each result is a list of words (a "bag of words").
bowA = docA.split(" ")
bowB = docB.split(" ")

In [3]:
# Inspect the tokens produced for document B.
bowB

['the', 'dog', 'sat', 'on', 'my', 'bed']

How do we convert a tokenized BOW into numbers?
Two steps:
1. Create a vector of all possible words.
2. Count how many times each word appears in each document.

In [4]:
# The vocabulary: every distinct word that occurs in either bag of words.
wordSet = set(bowA) | set(bowB)

In [6]:
# Show the vocabulary: all distinct words across all bags/documents.
wordSet

{'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat', 'the'}

In [7]:
# One count dictionary per document, with every vocabulary word starting at zero.
wordDictA = {vocabWord: 0 for vocabWord in wordSet}
wordDictB = {vocabWord: 0 for vocabWord in wordSet}

In [8]:
# Before counting, every vocabulary word maps to 0.
wordDictA

{'bed': 0, 'cat': 0, 'dog': 0, 'face': 0, 'my': 0, 'on': 0, 'sat': 0, 'the': 0}

In [11]:
# Count how many times each word occurs in each bag of words.
# NOTE: these loops increment the existing dictionaries in place, so re-running
# this cell without first re-creating wordDictA / wordDictB doubles the counts.
for word in bowA:
    wordDictA[word] +=1
    
for word in bowB:
    wordDictB[word] +=1

In [12]:
# After counting, words that occur in document A have non-zero counts.
wordDictA

{'bed': 0, 'cat': 1, 'dog': 0, 'face': 1, 'my': 1, 'on': 1, 'sat': 1, 'the': 1}

In [14]:
# Lastly, stack the two count dictionaries into a matrix:
# one row per document, one column per vocabulary word.
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,bed,cat,dog,face,my,on,sat,the
0,0,1,0,1,1,1,1,1
1,1,0,1,0,1,1,1,1


Boom! We just converted words into a linear algebra problem!

In [17]:
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every word for one document.

    The term frequency of a word is its count in the document divided by the
    total number of tokens in that document.

    Parameters
    ----------
    wordDict : dict
        Maps each vocabulary word to its raw count in the document.
    bow : list
        The document's tokens (bag of words); only its length is used.

    Returns
    -------
    dict
        Maps each word in ``wordDict`` to ``count / len(bow)`` as a float.
        An empty ``bow`` yields 0.0 for every word instead of raising
        ``ZeroDivisionError``.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document contains no occurrences of any word.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [18]:
# Term frequencies per document: each word's count divided by that document's token count.
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

In [24]:
def computerIDF(docList):
    """Compute the inverse document frequency (IDF) of every word in a corpus.

    For a word that occurs in ``df`` of the ``N`` documents, the IDF is
    ``log(N / df)`` (natural logarithm).

    Parameters
    ----------
    docList : list of dict
        One dictionary per document, mapping each word to its count in that
        document. A word "occurs" in a document when its count is > 0.

    Returns
    -------
    dict
        Maps every word seen in any document's dictionary to its IDF.
        A word that occurs in no document gets 0.0 rather than triggering a
        division by zero. An empty ``docList`` returns an empty dict.
    """
    import math

    N = len(docList)
    if N == 0:
        return {}

    # Count the number of documents that contain each word. Keys are gathered
    # from every document, so a word missing from the first dictionary does
    # not raise a KeyError.
    docCounts = {}
    for doc in docList:
        for word, val in doc.items():
            docCounts.setdefault(word, 0)
            if val > 0:
                docCounts[word] += 1

    # Build a fresh result dict instead of overwriting counts in place.
    idfDict = {}
    for word, docCount in docCounts.items():
        if docCount > 0:
            idfDict[word] = math.log(N / float(docCount))
        else:
            # The word never occurs; its TF is 0 everywhere, so any finite
            # IDF gives a TF-IDF of 0. Use 0.0 to avoid dividing by zero.
            idfDict[word] = 0.0
    return idfDict

In [25]:
# Inverse document frequencies computed over the whole corpus (both documents).
idfs = computerIDF([wordDictA, wordDictB])

In [28]:
def computeTFIDF(tfBow, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Parameters
    ----------
    tfBow : dict
        Maps each word to its term frequency in one document.
    idfs : dict
        Maps each word to its inverse document frequency; must contain every
        word present in ``tfBow``.

    Returns
    -------
    dict
        Maps each word in ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [29]:
# Weight each document's term frequencies by the corpus-wide IDF values.
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [30]:
# One row per document, one column per word. Words that occur in both
# documents have IDF log(2/2) = 0 and therefore score 0.
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,bed,cat,dog,face,my,on,sat,the
0,0.0,0.115525,0.0,0.115525,0.0,0.0,0.0,0.0
1,0.115525,0.0,0.115525,0.0,0.0,0.0,0.0,0.0
