In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
          'KDnuggets Collection of data science Projects',
          '3 Free Statistics Courses for data science data',
          'Parallel Processing Large File in Python',
          '15 You  Python Coding Interview Questions You Must Know For data science',
 ]

vectorizer = TfidfVectorizer()

# TD-IDF Matrix
X = vectorizer.fit_transform(corpus)

# extracting feature names
tfidf_tokens = vectorizer.get_feature_names_out()

In [23]:
import pandas as pd

result = pd.DataFrame(
    data=X.toarray(), 
    index=["Doc1", "Doc2", "Doc3", "Doc4"], 
    columns=tfidf_tokens
)

result

Unnamed: 0,15,coding,collection,courses,data,file,for,free,in,interview,...,must,of,parallel,processing,projects,python,questions,science,statistics,you
Doc1,0.0,0.0,0.455732,0.0,0.290888,0.0,0.0,0.0,0.0,0.0,...,0.0,0.455732,0.0,0.0,0.455732,0.0,0.0,0.290888,0.0,0.0
Doc2,0.0,0.0,0.0,0.420382,0.536648,0.0,0.331434,0.420382,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.268324,0.420382,0.0
Doc3,0.0,0.0,0.0,0.0,0.0,0.421765,0.0,0.0,0.421765,0.0,...,0.0,0.0,0.421765,0.421765,0.0,0.332524,0.0,0.0,0.0,0.0
Doc4,0.28798,0.28798,0.0,0.0,0.183814,0.0,0.227047,0.0,0.0,0.28798,...,0.28798,0.0,0.0,0.0,0.0,0.227047,0.28798,0.183814,0.0,0.57596


In [24]:
dt = pd.DataFrame(corpus, columns=["data"])

dt.head()

Unnamed: 0,data
0,KDnuggets Collection of data science Projects
1,3 Free Statistics Courses for data science data
2,Parallel Processing Large File in Python
3,15 You Python Coding Interview Questions You ...


# Menghitung TF, 

tf(i, j) = f_d(i) / max f_d(j)

Term Frequency merupakan frekuensi kemunculan term i pada dokumen j dibagi dengan total term pada dokumen j.

In [25]:
def calc_TF(document):
    # Counts the number of times the word appears in review
    TF_dict = {}
    for term in document:
        if term in TF_dict:
            TF_dict[term] += 1
        else:
            TF_dict[term] = 1
    # Computes tf for each word
    for term in TF_dict:
        TF_dict[term] = TF_dict[term] / len(document)
    return TF_dict

test = dt["data"].apply(lambda x: x.split()).apply(calc_TF)
for e in test:
    print(e)

{'KDnuggets': 0.16666666666666666, 'Collection': 0.16666666666666666, 'of': 0.16666666666666666, 'data': 0.16666666666666666, 'science': 0.16666666666666666, 'Projects': 0.16666666666666666}
{'3': 0.125, 'Free': 0.125, 'Statistics': 0.125, 'Courses': 0.125, 'for': 0.125, 'data': 0.25, 'science': 0.125}
{'Parallel': 0.16666666666666666, 'Processing': 0.16666666666666666, 'Large': 0.16666666666666666, 'File': 0.16666666666666666, 'in': 0.16666666666666666, 'Python': 0.16666666666666666}
{'15': 0.08333333333333333, 'You': 0.16666666666666666, 'Python': 0.08333333333333333, 'Coding': 0.08333333333333333, 'Interview': 0.08333333333333333, 'Questions': 0.08333333333333333, 'Must': 0.08333333333333333, 'Know': 0.08333333333333333, 'For': 0.08333333333333333, 'data': 0.08333333333333333, 'science': 0.08333333333333333}


# Menghitung IDF,

N adalah jumlah total document dalam corpus, N =|D|.

|{d ∈ D : t ∈ d}| = df(t) , adalah jumlah dokumen yang mengandung term t. IDF juga dapat dituliskan dalam bentuk Penambahan 1 untuk menghindari pembagian terhadap 0 jika df(t) tidak ditemukan pada corpus.

In [26]:
import numpy as np


def calc_DF(tfDict):
    count_DF = {}
    # Run through each document's tf dictionary and increment countDict's (term, doc) pair
    for document in tfDict:
        for term in document:
            if term in count_DF:
                count_DF[term] += 1
            else:
                count_DF[term] = 1
    return count_DF

n_document = len(dt["data"])

def calc_IDF(__n_document, __DF):
    IDF_Dict = {}
    for term in __DF:
        IDF_Dict[term] = np.log(__n_document / (__DF[term] + 1))
    return IDF_Dict

DF = calc_DF(dt["data"].apply(lambda x: x.split()))
IDF = calc_IDF(n_document, DF)

DF

{'KDnuggets': 1,
 'Collection': 1,
 'of': 1,
 'data': 4,
 'science': 3,
 'Projects': 1,
 '3': 1,
 'Free': 1,
 'Statistics': 1,
 'Courses': 1,
 'for': 1,
 'Parallel': 1,
 'Processing': 1,
 'Large': 1,
 'File': 1,
 'in': 1,
 'Python': 2,
 '15': 1,
 'You': 2,
 'Coding': 1,
 'Interview': 1,
 'Questions': 1,
 'Must': 1,
 'Know': 1,
 'For': 1}

In [29]:
IDF
# n_document

{'KDnuggets': 0.6931471805599453,
 'Collection': 0.6931471805599453,
 'of': 0.6931471805599453,
 'data': -0.2231435513142097,
 'science': 0.0,
 'Projects': 0.6931471805599453,
 '3': 0.6931471805599453,
 'Free': 0.6931471805599453,
 'Statistics': 0.6931471805599453,
 'Courses': 0.6931471805599453,
 'for': 0.6931471805599453,
 'Parallel': 0.6931471805599453,
 'Processing': 0.6931471805599453,
 'Large': 0.6931471805599453,
 'File': 0.6931471805599453,
 'in': 0.6931471805599453,
 'Python': 0.28768207245178085,
 '15': 0.6931471805599453,
 'You': 0.28768207245178085,
 'Coding': 0.6931471805599453,
 'Interview': 0.6931471805599453,
 'Questions': 0.6931471805599453,
 'Must': 0.6931471805599453,
 'Know': 0.6931471805599453,
 'For': 0.6931471805599453}