In [1]:
# The two toy documents that make up the corpus.
docA, docB = (
    "The car is driven on the road.",
    "The truck is driven on the highway.",
)

In [2]:
# Naive tokenisation on single spaces: case and punctuation are kept as-is,
# so "The" / "the" and "road." are treated as distinct tokens.
bowA, bowB = [text.split(" ") for text in (docA, docB)]

In [3]:
print(bowA)

['The', 'car', 'is', 'driven', 'on', 'the', 'road.']


In [4]:
print(bowB)

['The', 'truck', 'is', 'driven', 'on', 'the', 'highway.']


In [5]:
# Vocabulary: every distinct token that occurs in either document.
wordSet = set(bowA) | set(bowB)

In [6]:
print(wordSet)

{'road.', 'on', 'driven', 'truck', 'is', 'highway.', 'The', 'car', 'the'}


In [7]:
len(wordSet)

9

In [8]:
# Per-document counter for document A: one zeroed slot per vocabulary word.
wordDictA = {term: 0 for term in wordSet}
print(wordDictA)

{'road.': 0, 'on': 0, 'driven': 0, 'truck': 0, 'is': 0, 'highway.': 0, 'The': 0, 'car': 0, 'the': 0}


In [9]:
# Per-document counter for document B: one zeroed slot per vocabulary word.
wordDictB = {term: 0 for term in wordSet}
print(wordDictB)

{'road.': 0, 'on': 0, 'driven': 0, 'truck': 0, 'is': 0, 'highway.': 0, 'The': 0, 'car': 0, 'the': 0}


In [10]:
# Count each word's occurrences in each document.
# The counters are zeroed first so that re-running this cell gives the same
# result instead of adding the counts a second time.
for bow, counts in ((bowA, wordDictA), (bowB, wordDictB)):
    for word in counts:
        counts[word] = 0
    for word in bow:
        counts[word] += 1

In [11]:
print(wordDictA)

{'road.': 1, 'on': 1, 'driven': 1, 'truck': 0, 'is': 1, 'highway.': 0, 'The': 1, 'car': 1, 'the': 1}


In [12]:
print(wordDictB)

{'road.': 0, 'on': 1, 'driven': 1, 'truck': 1, 'is': 1, 'highway.': 1, 'The': 1, 'car': 0, 'the': 1}


In [13]:
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,The,car,driven,highway.,is,on,road.,the,truck
0,1,1,1,0,1,1,1,1,0
1,1,0,1,1,1,1,0,1,1


In [14]:
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every vocabulary word in a document.

    TF is the number of times a word occurs in the document divided by the
    total number of tokens in that document.

    Args:
        wordDict: Mapping of word -> raw occurrence count in the document.
        bow: The document's list of tokens; only its length is used.

    Returns:
        A dict mapping each key of ``wordDict`` to ``count / len(bow)``.
        For an empty ``bow`` every frequency is 0.0 rather than raising
        ZeroDivisionError.
    """
    bowCount = len(bow)  # total number of tokens in the document
    if bowCount == 0:
        # An empty document contains no occurrences of any word.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [15]:
tfBowA = computeTF(wordDictA, bowA)

In [16]:
print(tfBowA)

{'road.': 0.14285714285714285, 'on': 0.14285714285714285, 'driven': 0.14285714285714285, 'truck': 0.0, 'is': 0.14285714285714285, 'highway.': 0.0, 'The': 0.14285714285714285, 'car': 0.14285714285714285, 'the': 0.14285714285714285}


In [17]:
tfBowB = computeTF(wordDictB, bowB)
print(tfBowB)

{'road.': 0.0, 'on': 0.14285714285714285, 'driven': 0.14285714285714285, 'truck': 0.14285714285714285, 'is': 0.14285714285714285, 'highway.': 0.14285714285714285, 'The': 0.14285714285714285, 'car': 0.0, 'the': 0.14285714285714285}


In [18]:
def computeIDF(docList):
    """Compute the inverse document frequency (IDF) of every word in a corpus.

    IDF(word) = log10(N / df(word)), where N is the number of documents and
    df(word) is the number of documents in which the word's count is > 0.

    Args:
        docList: List of dicts, one per document, mapping word -> count.

    Returns:
        A dict mapping every word that is a key of any document dict to its
        IDF. A word that occurs in no document (df == 0) gets 0.0 instead of
        triggering a division by zero. An empty ``docList`` yields ``{}``.
    """
    import math

    N = len(docList)

    # Document frequency per word. Keys are collected from every document,
    # not only the first, so documents need not share an identical key set.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    return {
        word: math.log10(N / float(df)) if df > 0 else 0.0
        for word, df in docFreq.items()
    }

In [19]:
idfs = computeIDF([wordDictA, wordDictB])

In [20]:
def computeTFIDF(tfBow, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every key of ``tfBow``.

    Returns:
        A dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [21]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
print(tfidfBowA)

{'road.': 0.043004285094854454, 'on': 0.0, 'driven': 0.0, 'truck': 0.0, 'is': 0.0, 'highway.': 0.0, 'The': 0.0, 'car': 0.043004285094854454, 'the': 0.0}


In [22]:
tfidfBowB = computeTFIDF(tfBowB, idfs)
print(tfidfBowB)

{'road.': 0.0, 'on': 0.0, 'driven': 0.0, 'truck': 0.043004285094854454, 'is': 0.0, 'highway.': 0.043004285094854454, 'The': 0.0, 'car': 0.0, 'the': 0.0}


In [23]:
import pandas as pd
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,The,car,driven,highway.,is,on,road.,the,truck
0,0.0,0.043004,0.0,0.0,0.0,0.0,0.043004,0.0,0.0
1,0.0,0.0,0.0,0.043004,0.0,0.0,0.0,0.0,0.043004


## Using sklearn

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
vectorizer = TfidfVectorizer()
response = vectorizer.fit_transform([docA, docB])

In [28]:
print(response)

  (0, 6)	0.6043795515372431
  (0, 0)	0.42471718586982765
  (0, 3)	0.30218977576862155
  (0, 1)	0.30218977576862155
  (0, 4)	0.30218977576862155
  (0, 5)	0.42471718586982765
  (1, 6)	0.6043795515372431
  (1, 3)	0.30218977576862155
  (1, 1)	0.30218977576862155
  (1, 4)	0.30218977576862155
  (1, 7)	0.42471718586982765
  (1, 2)	0.42471718586982765


In [29]:
# List the vocabulary terms in the column order of the tf-idf matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns an array, converted here to a plain list.
vectorizer.get_feature_names_out().tolist()

['car', 'driven', 'highway', 'is', 'on', 'road', 'the', 'truck']

In [30]:
# Dense tf-idf matrix as a data frame: one row per document, one column per term.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
pd.DataFrame(response.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,car,driven,highway,is,on,road,the,truck
0,0.424717,0.30219,0.0,0.30219,0.30219,0.424717,0.60438,0.0
1,0.0,0.30219,0.424717,0.30219,0.30219,0.0,0.60438,0.424717


## Example

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
import operator

corpus = [
    "this car got the excellence award",
    "good car gives good mileage",
    "this car is very expensive",
    "the company is growing with very high production",
    "this company is financially good",
]

# Vocabulary = every distinct whitespace-separated token in the corpus.
# Sorting fixes the column order of the vectorizer; a bare set would be
# iterated in an order that can change from one kernel run to the next.
vocabulary = sorted({word for doc in corpus for word in doc.split()})

tfidf = TfidfVectorizer(vocabulary=vocabulary)

# Fit the TfIdf model (learns the IDF weights from the corpus)
tfidf.fit(corpus)

# For each document, print its words ranked by descending tf-idf score
for doc in corpus:
    score = {}
    print(doc)
    # Transform a document into TfIdf coordinates
    X = tfidf.transform([doc])
    for word in doc.split():
        score[word] = X[0, tfidf.vocabulary_[word]]
    sortedscore = sorted(score.items(), key=operator.itemgetter(1), reverse=True)
    print(sortedscore)

this car got the excellence award
[('got', 0.4689132131547637), ('excellence', 0.4689132131547637), ('award', 0.4689132131547637), ('the', 0.3783162278555838), ('this', 0.3140366438234139), ('car', 0.3140366438234139)]
good car gives good mileage
[('good', 0.7178821805115433), ('gives', 0.4448982295027494), ('mileage', 0.4448982295027494), ('car', 0.2979535293877717)]
this car is very expensive
[('expensive', 0.5776914793752232), ('very', 0.4660778481185906), ('this', 0.38688671647327205), ('car', 0.38688671647327205), ('is', 0.38688671647327205)]
the company is growing with very high production
[('growing', 0.39524574252810757), ('with', 0.39524574252810757), ('high', 0.39524574252810757), ('production', 0.39524574252810757), ('the', 0.31888177640211135), ('company', 0.31888177640211135), ('very', 0.31888177640211135), ('is', 0.26470068018333703)]
this company is financially good
[('financially', 0.5591166343026757), ('company', 0.4510917800707943), ('good', 0.4510917800707943), ('thi