In [1]:
import numpy as np
import pandas as pd

In [2]:
def bag_of_words(texts: list[str]):
    """ For each word, save the number of word usage in each sentence (word-count) """

    words = []
    for sentence in texts:
        words.extend(sentence.split())

    matrix = pd.DataFrame(columns=words, index=range(len(texts)))
    for i, sentence in enumerate(texts):
        sentence_words = sentence.split()

        for word in words:
            matrix.loc[i, word] = sentence_words.count(word)

    return matrix

In [3]:
def term_frequency(texts: list[str]):
    """ For each word, save the density of word (word-usage / all-words) """

    words = []
    for sentence in texts:
        words.extend(sentence.split())

    matrix = pd.DataFrame(columns=words, index=range(len(texts)))
    for i, sentence in enumerate(texts):
        sentence_words = sentence.split()

        for word in words:
            matrix.loc[i, word] = sentence_words.count(word) / len(sentence_words)

    return matrix

In [4]:
def invert_document_frequency(texts: list[str]):
    """ For each word, save (all-sentences / consist-sentences) """

    def count_in_texts(clause):
        return sum(clause in sentence.split() for sentence in texts)

    words = []
    for sentence in texts:
        words.extend(sentence.split())

    matrix = pd.DataFrame(columns=words, index=range(len(texts)))
    for i, sentence in enumerate(texts):
        sentence_words = sentence.split()

        for word in words:
            matrix.loc[i, word] = len(texts) / count_in_texts(word)

    return matrix

In [5]:
def TF_IDF(texts):
    return term_frequency(texts) * invert_document_frequency(texts)

In [6]:
texts = [
    'Learning is my best entertainment',
    'Sorry , I could not do this better',
    'I am not really depressed',
    'I am depressed , as always'
]

In [7]:
TF_IDF(texts)

Unnamed: 0,Learning,is,my,best,entertainment,Sorry,",",I,could,not,...,am,not.1,really,depressed,I.1,am.1,depressed.1,",.1",as,always
0,0.8,0.8,0.8,0.8,0.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.5,0.25,0.166667,0.5,0.25,...,0.0,0.25,0.0,0.0,0.166667,0.0,0.0,0.25,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266667,0.0,0.4,...,0.4,0.4,0.8,0.4,0.266667,0.4,0.4,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.222222,0.0,0.0,...,0.333333,0.0,0.0,0.333333,0.222222,0.333333,0.333333,0.333333,0.666667,0.666667
