# TF-IDF (term frequency–inverse document frequency)

- TF(t,d) = $\frac{\mbox{Number of times term t appears in document d}} {\mbox{Total number of terms in the document d}}$



- IDF(t) = $\log\left(\frac{\mbox{Number of documents in the corpus}} {\mbox{Number of documents where the term t appears}}\right)$

==============

- read set of documents
- count words per document
- compute TF-IDF
- assign TF-IDF to words 


In [4]:
import nltk

# import nltk books
#from nltk.book import *

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag

def preprocessSent(sent, tag = True):
    """Tokenize a string with NLTK and optionally part-of-speech tag it.

    Args:
        sent: Text to tokenize; it is handed to ``nltk.word_tokenize`` as is.
        tag: Tagging is applied only when this compares equal to ``True``.

    Returns:
        The output of ``nltk.pos_tag`` on the tokens when tagging is
        requested, otherwise the token list from ``nltk.word_tokenize``.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens) if tag == True else tokens

def preprocessText(text, tag = True):
    """Split a text into sentences, tokenize each one and optionally tag it.

    Args:
        text: Text to segment with ``sent_tokenize``.
        tag: Tagging is applied only when this compares equal to ``True``.

    Returns:
        One entry per sentence: the output of ``nltk.pos_tag`` on the
        sentence's tokens when tagging is requested, otherwise the token
        list from ``nltk.word_tokenize``.
    """
    # Sentence segmentation first, then word tokenization per sentence.
    sentences = sent_tokenize(text)
    tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]

    if tag == True:
        # Part-of-speech tagging, one sentence at a time.
        return [nltk.pos_tag(tokens) for tokens in tokenized]
    return tokenized


In [2]:
from pathlib import Path
import os.path

# Read n files from directory and preprocess
# D[doc][sent][word]

def ReadSourceTok(dic, n=100,  tag = False) :
    """Read up to `n` files from a directory and tokenize each of them.

    Directory entries are visited in sorted order. Every file that is read
    has its resolved path printed as a progress message, and its whole
    content is passed to ``preprocessSent`` in one piece.

    Args:
        dic: Path of the directory to read from.
        n: Maximum number of files to read.
        tag: Forwarded to ``preprocessSent`` as its ``tag`` argument.

    Returns:
        A dict mapping each file's base name to a list that holds the
        ``preprocessSent`` result for that file.
    """
    D = {}
    i = 0
    for f in sorted(Path(dic).iterdir()):
        # Check the quota before reporting the path, so that only files
        # which are actually read show up in the progress output.
        if (i == n): break
        # Sub-directories cannot be opened as text; skip them without
        # counting them against the quota.
        if not f.is_file(): continue
        print(f.resolve())
        i += 1
        with f.open('r', encoding='utf-8') as fhin:
            data = fhin.read()
        D.setdefault(f.name, []).append(preprocessSent(data, tag = tag))
    return D

In [5]:
# Load ten reviews; every document is stored as a list of token lists
D = ReadSourceTok("/data/critt/shared/resources/aclImdb/test/pos/", n=10,  tag = False)
# Summary line: number of documents and total number of tokens over all of them
print(f"#Docs:{len(D)} #words:{sum(len(tokens) for doc in D.values() for tokens in doc)}")
print(f"Docs:{D.keys()}")


/data/critt/shared/resources/aclImdb/test/pos/0_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10000_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10001_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10002_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10003_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10004_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10005_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10006_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10007_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10008_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10009_10.txt
#Docs:10 #words:3619
Docs:dict_keys(['0_10.txt', '10000_7.txt', '10001_9.txt', '10002_8.txt', '10003_8.txt', '10004_9.txt', '10005_8.txt', '10006_7.txt', '10007_10.txt', '10008_8.txt'])


# TF
- 

In [6]:
import numpy as np

# count words in a collection of documents
# structure D[doc][sent][word]
# return word/document dictionary:
#   T['///--///'][doc] = number_words_in_doc
#   T[word]['D'] = number_of_documents_containing_word
#   T[word]['d'][doc]['f'] = occurrences_word_in_doc
#   T[word]['d'][doc]['tf'] = term frequency : T[word]['d'][doc]['f'] / T['///--///'][doc]


def CountWords(D, T=None):
    """Count terms in a document collection and compute TF, IDF and TF-IDF.

    Args:
        D: Document collection indexed as ``D[doc][sent][word]``: a mapping
            from document id to a list of sentences, each a list of terms.
        T: Optional term table from an earlier call to accumulate into.
            When omitted, a new empty table is created for this call, so
            separate calls never share state.

    Returns:
        The term table ``T`` (the same object that was passed in, if any):

        * ``T["///--///"][doc]``       number of terms in ``doc``
        * ``T[word]['D']``             number of documents containing ``word``
        * ``T[word]['idf']``           ``log(number of documents / T[word]['D'])``
        * ``T[word]['d'][doc]['f']``   occurrences of ``word`` in ``doc``
        * ``T[word]['d'][doc]['tf']``  ``f`` divided by the terms in ``doc``
        * ``T[word]['d'][doc]['tfidf']``  ``tf * idf``

    Note:
        The key ``"///--///"`` is reserved for the per-document totals; a
        term spelled exactly like it would collide with that entry.
    """
    # A dict literal as default would be created once and shared by every
    # call, silently merging the statistics of unrelated collections.
    if T is None:
        T = {}
    totals_key = "///--///"

    for d, sentences in D.items():  # d: document
        td = 0  # number of terms (words) in d
        for s in sentences:  # s: sentence
            for w in s:  # w: word
                entry = T.setdefault(w, {})
                per_doc = entry.setdefault('d', {}).setdefault(d, {})
                per_doc.setdefault('f', 0)

                # Document frequency: counted once, at the first occurrence
                # of w in d.
                entry.setdefault('D', 0)
                if per_doc['f'] == 0:
                    entry['D'] += 1

                # Occurrences of w in d, and running length of d.
                per_doc['f'] += 1
                td += 1

        T.setdefault(totals_key, {})[d] = td

    # Number of documents known to the table. For a fresh table this equals
    # len(D); when accumulating into an existing table it also covers the
    # documents counted earlier, keeping it consistent with T[w]['D'].
    n = len(T.get(totals_key, {}))

    for w, entry in T.items():
        # The totals entry is bookkeeping, not a term.
        if w == totals_key:
            continue
        # idf depends on the term only, so compute it once per term.
        idf = np.log(n / entry['D'])
        entry['idf'] = idf
        for d, per_doc in entry['d'].items():
            tf = per_doc['f'] / T[totals_key][d]
            per_doc['tf'] = tf
            per_doc['tfidf'] = tf * idf

    return T

def TfIdf(w, T) :
    """Print the stored TF-IDF statistics of one term.

    A header line shows the term and its idf; one further line per document
    containing the term shows the count in that document, the number of
    documents containing the term, the tf and the tf-idf.

    Args:
        w: The term to report; it must be a key of ``T``.
        T: Term table as returned by ``CountWords``.
    """
    entry = T[w]
    print(f"{w:<6}\t ifd:{entry['idf']:4.4}")
    for doc, stats in entry['d'].items():
        count = stats['f']
        tf = stats['tf']
        tfidf = stats['tfidf']
        print(f"{doc}\tcnt:{count}\t#d:{entry['D']}\ttf:{tf:4.4}\ttfidf:{tfidf:4.4}")
        
             

In [36]:
# Maximum number of documents to load
n = 1000

# Document collection D: file name -> list of token lists
D = ReadSourceTok("/data/critt/shared/resources/aclImdb/test/pos/", n=n,  tag = False)

# Term table T for this collection. A fresh dict is passed explicitly so that
# counts left over from an earlier CountWords call cannot be merged into it.
T = CountWords(D, T={})

# Print the stored statistics of the term 'the' for every document containing it
TfIdf('the', T)


/data/critt/shared/resources/aclImdb/test/pos/0_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10000_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10001_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10002_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10003_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10004_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10005_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10006_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10007_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10008_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10009_10.txt
/data/critt/shared/resources/aclImdb/test/pos/1000_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10010_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10011_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10012_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10013_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10014_7.txt
/data/critt/shar

/data/critt/shared/resources/aclImdb/test/pos/10161_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10162_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10163_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10164_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10165_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10166_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10167_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10168_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10169_8.txt
/data/critt/shared/resources/aclImdb/test/pos/1016_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10170_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10171_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10172_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10173_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10174_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10175_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10176_9.txt
/data/critt/sh

/data/critt/shared/resources/aclImdb/test/pos/10335_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10336_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10337_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10338_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10339_8.txt
/data/critt/shared/resources/aclImdb/test/pos/1033_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10340_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10341_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10342_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10343_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10344_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10345_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10346_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10347_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10348_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10349_8.txt
/data/critt/shared/resources/aclImdb/test/pos/1034_10.txt
/data/critt/

/data/critt/shared/resources/aclImdb/test/pos/10510_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10511_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10512_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10513_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10514_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10515_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10516_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10517_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10518_7.txt
/data/critt/shared/resources/aclImdb/test/pos/10519_8.txt
/data/critt/shared/resources/aclImdb/test/pos/1051_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10520_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10521_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10522_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10523_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10524_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10525_7.txt
/data/c

/data/critt/shared/resources/aclImdb/test/pos/10694_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10695_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10696_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10697_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10698_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10699_8.txt
/data/critt/shared/resources/aclImdb/test/pos/1069_10.txt
/data/critt/shared/resources/aclImdb/test/pos/106_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10700_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10701_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10702_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10703_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10704_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10705_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10706_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10707_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10708_9.txt
/data/

/data/critt/shared/resources/aclImdb/test/pos/10845_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10846_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10847_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10848_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10849_10.txt
/data/critt/shared/resources/aclImdb/test/pos/1084_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10850_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10851_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10852_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10853_8.txt
/data/critt/shared/resources/aclImdb/test/pos/10854_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10855_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10856_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10857_9.txt
/data/critt/shared/resources/aclImdb/test/pos/10858_10.txt
/data/critt/shared/resources/aclImdb/test/pos/10859_9.txt
/data/critt/shared/resources/aclImdb/test/pos/1085_9.txt
/data/c

### Toy example

In [44]:
# Toy corpus: five short one-sentence documents
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

# C maps a 1-based document number to a list holding that document's tokens
C = {}
for i, d in enumerate(docs, start=1):
    C.setdefault(i, []).append(preprocessSent(d, tag = False))

# Term table for the toy corpus, built into a fresh dict
Toy = CountWords(C, T={})

# Report every token in reading order; a repeated token is reported again
for d, sents in C.items():
    for s in sents:
        for w in s:
            TfIdf(w, Toy)
        
    
    

the   	 ifd: 0.0
1	cnt:1	#d:5	tf:0.1429	tfidf: 0.0
2	cnt:2	#d:5	tf: 0.4	tfidf: 0.0
3	cnt:2	#d:5	tf:0.2857	tfidf: 0.0
4	cnt:2	#d:5	tf:0.3333	tfidf: 0.0
5	cnt:2	#d:5	tf:0.3333	tfidf: 0.0
house 	 ifd:0.9163
1	cnt:1	#d:2	tf:0.1429	tfidf:0.1309
3	cnt:1	#d:2	tf:0.1429	tfidf:0.1309
had   	 ifd:1.609
1	cnt:1	#d:1	tf:0.1429	tfidf:0.2299
a     	 ifd:1.609
1	cnt:1	#d:1	tf:0.1429	tfidf:0.2299
tiny  	 ifd:1.609
1	cnt:1	#d:1	tf:0.1429	tfidf:0.2299
little	 ifd:1.609
1	cnt:1	#d:1	tf:0.1429	tfidf:0.2299
mouse 	 ifd: 0.0
1	cnt:1	#d:5	tf:0.1429	tfidf: 0.0
2	cnt:1	#d:5	tf: 0.2	tfidf: 0.0
3	cnt:1	#d:5	tf:0.1429	tfidf: 0.0
4	cnt:1	#d:5	tf:0.1667	tfidf: 0.0
5	cnt:1	#d:5	tf:0.1667	tfidf: 0.0
the   	 ifd: 0.0
1	cnt:1	#d:5	tf:0.1429	tfidf: 0.0
2	cnt:2	#d:5	tf: 0.4	tfidf: 0.0
3	cnt:2	#d:5	tf:0.2857	tfidf: 0.0
4	cnt:2	#d:5	tf:0.3333	tfidf: 0.0
5	cnt:2	#d:5	tf:0.3333	tfidf: 0.0
cat   	 ifd:0.9163
2	cnt:1	#d:2	tf: 0.2	tfidf:0.1833
4	cnt:1	#d:2	tf:0.1667	tfidf:0.1527
saw   	 ifd:1.609
2	cnt:1	#d:1	tf: 0.2	tfidf:0.32

In [43]:
# Task:
#   - sort the TF-IDF values
#   - extract the top 5 words

# Load up to four files from the given directory as untagged token lists.
D = ReadSourceTok("/users/kent/slee122/code/data/text_file/", n=4,  tag = False)
# Bare expression: shows D as the result of this notebook cell.
D

/users/kent/slee122/code/data/text_file/0_10.txt
/users/kent/slee122/code/data/text_file/10000_7.txt
/users/kent/slee122/code/data/text_file/10001_9.txt
/users/kent/slee122/code/data/text_file/10002_8.txt


{'0_10.txt': [['I',
   'went',
   'and',
   'saw',
   'this',
   'movie',
   'last',
   'night',
   'after',
   'being',
   'coaxed',
   'to',
   'by',
   'a',
   'few',
   'friends',
   'of',
   'mine',
   '.',
   'I',
   "'ll",
   'admit',
   'that',
   'I',
   'was',
   'reluctant',
   'to',
   'see',
   'it',
   'because',
   'from',
   'what',
   'I',
   'knew',
   'of',
   'Ashton',
   'Kutcher',
   'he',
   'was',
   'only',
   'able',
   'to',
   'do',
   'comedy',
   '.',
   'I',
   'was',
   'wrong',
   '.',
   'Kutcher',
   'played',
   'the',
   'character',
   'of',
   'Jake',
   'Fischer',
   'very',
   'well',
   ',',
   'and',
   'Kevin',
   'Costner',
   'played',
   'Ben',
   'Randall',
   'with',
   'such',
   'professionalism',
   '.',
   'The',
   'sign',
   'of',
   'a',
   'good',
   'movie',
   'is',
   'that',
   'it',
   'can',
   'toy',
   'with',
   'our',
   'emotions',
   '.',
   'This',
   'one',
   'did',
   'exactly',
   'that',
   '.',
   'The',
   'en

In [35]:
# Build the term table into a fresh dict: relying on CountWords' default
# argument would merge in the counts of whatever collection was processed
# by an earlier call.
T = CountWords(D, T={})
def TfIdf(w, T) :
    """Print the stored TF-IDF statistics of one term.

    A header line shows the term and its idf; one further line per document
    containing the term shows the count in that document, the number of
    documents containing the term, the tf and the tf-idf.

    Args:
        w: The term to report; it must be a key of ``T``.
        T: Term table as returned by ``CountWords``.
    """
    entry = T[w]
    print(f"{w:<6}\t ifd:{entry['idf']:4.4}")
    for doc, stats in entry['d'].items():
        count = stats['f']
        tf = stats['tf']
        tfidf = stats['tfidf']
        print(f"{doc}\tcnt:{count}\t#d:{entry['D']}\ttf:{tf:4.4}\ttfidf:{tfidf:4.4}")

In [None]:
key = list(D.keys())
#print(key[0])
result_1 = []
result_2 = []
result_3 = []
result_4 = []

# Route each known document name to the list collecting its [token, score] pairs;
# documents with any other name are ignored.
buckets = {
    '0_10.txt': result_1,
    '10000_7.txt': result_2,
    '10001_9.txt': result_3,
    '10002_8.txt': result_4,
}

for i in key:
    target = buckets.get(i)
    for j in D[i][0]:
        if target is not None:
            # NOTE(review): `tfidf` is not defined in the visible part of this
            # file (only `TfIdf` and `CountWords` are) - confirm where it comes from.
            # Arguments: t - str, d - list, D - 2d list
            target.append([j, tfidf(j, D[i][0], D[i])])

#result = [result_1, result_2, result_3, result_4]
print(result_4)
#sorted_result_0 = sorted(result_1, key=lambda x: x[1], reverse=True)
#print(sorted_result_0)
