### Basic TF-IDF Implementation

#### Author: Yifan Wang

In [522]:
import numpy as np
from collections import Counter
from nltk.corpus import stopwords


# TFIDF Algorithm
class TFIDF(object):
    """Minimal TF-IDF scorer over whitespace-tokenized documents.

    Typical use: call ``fit`` with an iterable of document strings, then
    ``tfidf_id2word`` to obtain one ``{word: score}`` dict per document.

    The score of a word in a document is ``tf * idf`` where ``tf`` is the
    word's count divided by the number of non-stopword tokens in that
    document, and ``idf = log10(1 + N / n_d)`` with ``N`` the number of
    documents and ``n_d`` the number of documents containing the word.
    """

    def __init__(self, stopwords):
        """Remember the stopwords that ``tokenizer`` drops.

        Args:
            stopwords: Collection of words to ignore. Tokens are lowercased
                before the comparison, so entries should be lowercase.
        """
        self.stopwords = stopwords

    def tokenizer(self, data):
        """Lowercase, whitespace-split and stopword-filter every document.

        Args:
            data: Iterable of document strings. A single string is treated
                as one document rather than being iterated per character.

        The token lists are stored in ``self.data``.
        """
        if isinstance(data, str):
            data = [data]

        # Build the lookup set once: membership tests against a list would
        # scan the whole stopword list for every token.
        stop = set(self.stopwords)
        self.data = [
            [word for word in doc.lower().split() if word not in stop]
            for doc in data
        ]

    def tfidf_word2id(self):
        """Replace every token in ``self.data`` by a string id.

        Ids are ``'1'``, ``'2'``, ... in order of first appearance. The
        mapping is stored in ``self.word2id`` and ``self.data`` is
        overwritten with the id-encoded documents.
        """
        word2id = {}
        encoded = []
        for doc in self.data:
            ids = []
            for tok in doc:
                # The default is only inserted for unseen tokens, so the next
                # free id is always the current vocabulary size plus one.
                ids.append(word2id.setdefault(tok, str(len(word2id) + 1)))
            encoded.append(ids)

        self.word2id = word2id
        self.data = encoded

    def term_freq(self):
        """Compute per-document relative term frequencies into ``self.tf``.

        A document with no tokens left after stopword removal yields an
        empty dict (no division takes place for it).
        """
        tf = []
        for doc in self.data:
            total = len(doc)
            tf.append({idx: n / total for idx, n in Counter(doc).items()})
        self.tf = tf

    def inv_doc_freq(self):
        """Compute ``log10(1 + N / n_d)`` for every id into ``self.idf``."""
        n_docs = len(self.data)
        # Count each id at most once per document, in a single pass over the
        # corpus instead of rescanning every document for every id.
        doc_freq = Counter(idx for doc in self.data for idx in set(doc))
        self.idf = {
            idx: np.log10(1 + (n_docs / nd)) for idx, nd in doc_freq.items()
        }

    def tfidf_raw(self):
        """Multiply tf by idf per document into ``self.tfidf_raw_results``.

        The resulting dicts are keyed by id, not by word.
        """
        self.tfidf_raw_results = [
            {idx: freq * self.idf[idx] for idx, freq in doc_tf.items()}
            for doc_tf in self.tf
        ]

    def tfidf_id2word(self):
        """Return the TF-IDF scores keyed by the original words.

        Also stores the inverse vocabulary in ``self.id2word``. Requires
        ``fit`` to have been called first.

        Returns:
            A list with one ``{word: score}`` dict per fitted document.
        """
        self.id2word = {v: k for k, v in self.word2id.items()}
        return [
            {self.id2word[idx]: score for idx, score in doc.items()}
            for doc in self.tfidf_raw_results
        ]

    def fit(self, X):
        """Run the full pipeline on ``X`` (an iterable of document strings).

        Results are kept on the instance; call ``tfidf_id2word`` to read
        them back keyed by word.
        """
        self.tokenizer(X)
        self.tfidf_word2id()
        self.term_freq()
        self.inv_doc_freq()
        self.tfidf_raw()
        


        
        
        

In [523]:
# Sentences adapted from:
# http://www.home-speech-home.com/speech-therapy-sentences.html
# Some of them were edited slightly so the TF-IDF ranking is easier to check.
data = [
    "My mom drove me to school after she talk with me",
    "I found a gold coin on the gold school after school today",
    "The church was white and brown and look  like a church",
    "Your mom is so nice she gave me a ride home today",
    "Are you going to have a blue birthday cake for your next birthday",
    "My mom made a milkshake with frozen bananas and chocolate sauce",
    "I got my haircut today and they did it way too short",
    "Your sister is my best friend because she always shares her treats with me",
    "The gum was stuck under the desk",
    "The flowers smelled beautiful and made the room so happy",
    "The dog chased the cat around the block",
]

### TFIDF:

In [524]:
# De-duplicated English stopwords from NLTK, kept as a list.
sw = [*set(stopwords.words("english"))]

In [525]:
# Fit the scorer on the sample sentences, then read the scores back
# as one {word: score} dict per document.
tfidf = TFIDF(sw)
tfidf.fit(data)
res = tfidf.tfidf_id2word()

### Now let's get the top 3 keywords in each document for validation purposes:

In [526]:
# Print the three highest-scoring words of every document.
for i, doc_scores in enumerate(res):
    ranked = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
    print(f'Doc {i}: \n')
    print(f"{ranked[:3]}\n")
    print('======================================================================================================')

Doc 0: 

[('drove', 0.2697953115119062), ('talk', 0.2697953115119062), ('school', 0.2032283391607139)]

Doc 1: 

[('gold', 0.30833749887074996), ('school', 0.23226095904081587), ('found', 0.15416874943537498)]

Doc 2: 

[('church', 0.35972708201587494), ('white', 0.17986354100793747), ('brown', 0.17986354100793747)]

Doc 3: 

[('nice', 0.17986354100793747), ('gave', 0.17986354100793747), ('ride', 0.17986354100793747)]

Doc 4: 

[('birthday', 0.35972708201587494), ('going', 0.17986354100793747), ('blue', 0.17986354100793747)]

Doc 5: 

[('milkshake', 0.15416874943537498), ('frozen', 0.15416874943537498), ('bananas', 0.15416874943537498)]

Doc 6: 

[('got', 0.215836249209525), ('haircut', 0.215836249209525), ('way', 0.215836249209525)]

Doc 7: 

[('sister', 0.17986354100793747), ('best', 0.17986354100793747), ('friend', 0.17986354100793747)]

Doc 8: 

[('gum', 0.35972708201587494), ('stuck', 0.35972708201587494), ('desk', 0.35972708201587494)]

Doc 9: 

[('flowers', 0.17986354100793747),

### Reference: https://en.wikipedia.org/wiki/Tf%E2%80%93idf