In [15]:
import math
from operator import itemgetter

In [16]:
class TFIDF():
    """Tiny TF-IDF index over a list of text documents.

    Documents are tokenised with ``str.split()`` only, so tokens are
    case-sensitive and keep any punctuation attached to them.

    Attributes:
        doc: list of token lists, one per input document.
        wordSet: set of every distinct token in the corpus.
        wordDict: one ``{token: raw count}`` dict per document, keyed by the
            whole vocabulary (absent tokens have count 0).
        idf: ``{token: log10(N / document_frequency)}`` for the vocabulary.
        tfidf: result of the most recent ``getTFIDF()`` call (``[]`` before).
    """

    def __init__(self, doc):
        """Build the vocabulary, term counts and IDF table.

        Args:
            doc: iterable of document strings.
        """
        self.doc = [x.split() for x in doc]
        self.wordSet = self.getWordset(self.doc)
        self.wordDict = self.getWorddict(self.doc)
        self.idf = self.computeIDF(self.wordDict)
        self.tfidf = []

    def getWordset(self, doc):
        """Return the set of distinct tokens found in ``doc``.

        Args:
            doc: list of token lists.
        """
        # A single set comprehension avoids the quadratic cost of repeatedly
        # concatenating lists before de-duplicating.
        return {word for tokens in doc for word in tokens}

    def getWorddict(self, doc):
        """Count token occurrences per document over the full vocabulary.

        Args:
            doc: list of token lists.

        Returns:
            A list with one ``{token: count}`` dict per document; every dict
            has a key for each token in ``self.wordSet``.

        Raises:
            KeyError: if a document holds a token missing from ``self.wordSet``.
        """
        wordDict = []
        for tokens in doc:
            counts = dict.fromkeys(self.wordSet, 0)
            for word in tokens:
                counts[word] += 1
            wordDict.append(counts)
        return wordDict

    def computeIDF(self, docList):
        """Compute ``log10(N / df)`` for every token.

        Args:
            docList: list of ``{token: count}`` dicts sharing the same keys.

        Returns:
            ``{token: idf}``. An empty ``docList`` yields ``{}``; a token that
            occurs in no document gets ``0.0`` instead of dividing by zero.
        """
        if not docList:
            return {}

        N = len(docList)
        docFreq = dict.fromkeys(docList[0], 0)
        for doc in docList:
            for word, val in doc.items():
                if val > 0:
                    docFreq[word] += 1

        return {
            word: math.log10(N / float(df)) if df else 0.0
            for word, df in docFreq.items()
        }

    def computeTFIDF(self, i, idfs):
        """Return ``{token: raw count * idf}`` for the tokens of document ``i``.

        Args:
            i: index of the document in ``self.doc``.
            idfs: ``{token: idf}`` mapping.
        """
        counts = self.wordDict[i]
        return {word: counts[word] * idfs[word] for word in self.doc[i]}

    def getTFIDF(self):
        """Compute TF-IDF weights for every document.

        The result is cached on ``self.tfidf``; a shallow copy of the list is
        returned.
        """
        self.tfidf = [self.computeTFIDF(i, self.idf) for i in range(len(self.doc))]
        return list(self.tfidf)

    def getSCORE(self):
        """Return, per document, the sum of its TF-IDF weights.

        The weights are computed on demand, so this no longer returns an empty
        list when called before ``getTFIDF()``.
        """
        tfidf = self.tfidf if self.tfidf else self.getTFIDF()
        return [sum(weights.values()) for weights in tfidf]

    def getIDF(self):
        """Return the ``{token: idf}`` table."""
        return self.idf

    def getWORDSET(self):
        """Return the vocabulary set."""
        return self.wordSet

    def getWORDDICT(self):
        """Return the per-document ``{token: count}`` dicts."""
        return self.wordDict

    def search(self, query, case_sensitive=True):
        """Rank documents against a query.

        A document's score is (sum of TF-IDF weights of its tokens that occur
        in the query) multiplied by (sum of all its TF-IDF weights).

        Args:
            query: query string (split on whitespace, like the documents) or
                an iterable of query tokens.
            case_sensitive: when False, tokens are compared after
                ``str.casefold()``. Defaults to True.

        Returns:
            A list of ``[doc_index, score]`` pairs, highest score first.
            Documents with equal scores appear with the higher index first.
        """
        def norm(token):
            return token if case_sensitive else token.casefold()

        # Compare whole tokens: testing `token in query_string` would match
        # substrings (e.g. the token "of" inside the query "proof").
        raw_terms = query.split() if isinstance(query, str) else query
        terms = {norm(term) for term in raw_terms}

        matched = [
            sum((w for token, w in weights.items() if norm(token) in terms), 0.0)
            for weights in self.getTFIDF()
        ]

        hasil = [
            [i, a * b]
            for i, (a, b) in enumerate(zip(matched, self.getSCORE()))
        ]
        # Stable ascending sort followed by a reversal keeps the tie order of
        # the earlier reversed(sorted(...)) formulation.
        hasil.sort(key=itemgetter(1))
        hasil.reverse()
        return hasil
        
# Toy corpus: two titles, one document per list entry.
dataset = [
    "Herbal Formulation Against Dental Caries Causing Microorganisms Using Extracts of Stevia Rebaudiana Leaves (A Natural Sweetner)",
    "MicroRNAs and cancer resistance: A new molecular plot",
]
# Index the corpus once; the cells below query and inspect this instance.
tfidf = TFIDF(dataset)

In [23]:
# Rank the indexed documents against a free-text query; search() returns
# [doc_index, score] pairs ordered from highest to lowest score.
# NOTE(review): this cell's execution count (23) is higher than that of the
# cells below it (18-22), so it was run out of order; re-run the notebook top
# to bottom to confirm the recorded output.
query = "molecular pathogenesis micrornas"
tfidf.search(query)

[[1, 0.7249524663156524], [0, 0.0]]

In [18]:
# Per-document {token: term count * idf} weights, one dict per document.
tfidf.getTFIDF()

[{'Herbal': 0.3010299956639812,
  'Formulation': 0.3010299956639812,
  'Against': 0.3010299956639812,
  'Dental': 0.3010299956639812,
  'Caries': 0.3010299956639812,
  'Causing': 0.3010299956639812,
  'Microorganisms': 0.3010299956639812,
  'Using': 0.3010299956639812,
  'Extracts': 0.3010299956639812,
  'of': 0.3010299956639812,
  'Stevia': 0.3010299956639812,
  'Rebaudiana': 0.3010299956639812,
  'Leaves': 0.3010299956639812,
  '(A': 0.3010299956639812,
  'Natural': 0.3010299956639812,
  'Sweetner)': 0.3010299956639812},
 {'MicroRNAs': 0.3010299956639812,
  'and': 0.3010299956639812,
  'cancer': 0.3010299956639812,
  'resistance:': 0.3010299956639812,
  'A': 0.3010299956639812,
  'new': 0.3010299956639812,
  'molecular': 0.3010299956639812,
  'plot': 0.3010299956639812}]

In [19]:
# One value per document: the sum of that document's TF-IDF weights.
tfidf.getSCORE()

[4.816479930623699, 2.4082399653118496]

In [20]:
# IDF table: log10(number of documents / document frequency) for each token.
tfidf.getIDF()

{'new': 0.3010299956639812,
 'A': 0.3010299956639812,
 'Using': 0.3010299956639812,
 'and': 0.3010299956639812,
 'cancer': 0.3010299956639812,
 'Leaves': 0.3010299956639812,
 'Herbal': 0.3010299956639812,
 'Sweetner)': 0.3010299956639812,
 '(A': 0.3010299956639812,
 'resistance:': 0.3010299956639812,
 'Caries': 0.3010299956639812,
 'Causing': 0.3010299956639812,
 'molecular': 0.3010299956639812,
 'Natural': 0.3010299956639812,
 'Formulation': 0.3010299956639812,
 'Extracts': 0.3010299956639812,
 'Microorganisms': 0.3010299956639812,
 'plot': 0.3010299956639812,
 'Dental': 0.3010299956639812,
 'of': 0.3010299956639812,
 'Against': 0.3010299956639812,
 'Stevia': 0.3010299956639812,
 'MicroRNAs': 0.3010299956639812,
 'Rebaudiana': 0.3010299956639812}

In [21]:
# Vocabulary: the distinct whitespace-separated tokens across all documents
# (case and attached punctuation are kept as-is).
tfidf.getWORDSET()

{'(A',
 'A',
 'Against',
 'Caries',
 'Causing',
 'Dental',
 'Extracts',
 'Formulation',
 'Herbal',
 'Leaves',
 'MicroRNAs',
 'Microorganisms',
 'Natural',
 'Rebaudiana',
 'Stevia',
 'Sweetner)',
 'Using',
 'and',
 'cancer',
 'molecular',
 'new',
 'of',
 'plot',
 'resistance:'}

In [22]:
# Raw term counts per document, keyed by the full vocabulary (0 when absent).
tfidf.getWORDDICT()

[{'new': 0,
  'A': 0,
  'Using': 1,
  'and': 0,
  'cancer': 0,
  'Leaves': 1,
  'Herbal': 1,
  'Sweetner)': 1,
  '(A': 1,
  'resistance:': 0,
  'Caries': 1,
  'Causing': 1,
  'molecular': 0,
  'Natural': 1,
  'Formulation': 1,
  'Extracts': 1,
  'Microorganisms': 1,
  'plot': 0,
  'Dental': 1,
  'of': 1,
  'Against': 1,
  'Stevia': 1,
  'MicroRNAs': 0,
  'Rebaudiana': 1},
 {'new': 1,
  'A': 1,
  'Using': 0,
  'and': 1,
  'cancer': 1,
  'Leaves': 0,
  'Herbal': 0,
  'Sweetner)': 0,
  '(A': 0,
  'resistance:': 1,
  'Caries': 0,
  'Causing': 0,
  'molecular': 1,
  'Natural': 0,
  'Formulation': 0,
  'Extracts': 0,
  'Microorganisms': 0,
  'plot': 1,
  'Dental': 0,
  'of': 0,
  'Against': 0,
  'Stevia': 0,
  'MicroRNAs': 1,
  'Rebaudiana': 0}]