## Testing for TF-IDF from Scratch

In [None]:
import pandas as pd
import os

In [9]:
synth_df = pd.read_csv(os.path.join(os.getcwd(), "../data/synthetic_data.csv"), index_col = 0)
synth_df


Unnamed: 0,id,category,text
0,1,Product Description,Experience unparalleled sound quality with the...
1,2,Movie Synopsis,"In a world ravaged by climate change, a group ..."
2,3,News Article,The city council approved the new public trans...
3,4,Recipe,"Preheat the oven to 375°F. Mix flour, sugar, a..."
4,5,Travel Guide,"Discover the hidden gems of Kyoto, from tranqu..."
5,6,Scientific Abstract,This study investigates the effects of micropl...
6,7,Book Review,"An evocative tale of love and loss, 'The Silen..."
7,8,Job Posting,Looking for a skilled software engineer profic...
8,9,User Manual,"To reset your device, hold the power button fo..."
9,10,Historical Event,"The Berlin Wall, constructed in 1961, symboliz..."


## Bag of words

In [10]:
import re
from typing import List

def tokenize(text:str) -> List[str]:
    # Remove punctuation using regex, keeping words and numbers
    cleaned_text = re.sub(r"[^\w\s]", "", text)
    # Split the cleaned text into words
    tokens = cleaned_text.lower().split()
    return tokens

In [86]:
texts = synth_df["text"].tolist()

cleaned_texts_list = []

for text in texts:
    #print(text)
    cleaned_text = tokenize(text)
    #print(cleaned_text)
    cleaned_texts_list.append(cleaned_text)
    
cleaned_texts_list

cleaned_sentences_list = []
for text_list in cleaned_texts_list:
    sentence_list = " ".join(text_list)
    cleaned_sentences_list.append(sentence_list)
cleaned_sentences_list

cleaned_texts_list_flattened = [word for sentence in cleaned_texts_list for word in sentence]

texts_corpus = set(cleaned_texts_list_flattened)

print(f"Cleaned text data length: {len(cleaned_texts_list)} \nTexts_corpus length: {len(texts_corpus)}")



Cleaned text data length: 15 
Texts_corpus length: 227


In [108]:
word_counts_texts = {}
for sentence in cleaned_sentences_list:
    word_counts_texts[sentence] = dict.fromkeys(texts_corpus,0)
    
for sentence in cleaned_texts_list:
    key = " ".join(sentence)
    #print(sentence)
    for word in sentence:
        word_counts_texts[key][word]+=1     
    
word_frequencies = pd.DataFrame(word_counts_texts)

## TF

In [110]:
def tf(word_counts: dict, document: list[str]) -> dict:
    """Calculate term frequency of each word in a document."""

    tf_dict = {}
    document_corpus_count = len(document)

    for word, count in word_counts.items():
        tf_dict[word] = count / float(document_corpus_count)

    return tf_dict


tf(word_frequencies[cleaned_sentences_list[0]], cleaned_sentences_list[0])

{'yesterday': 0.0,
 'it': 0.0,
 'producing': 0.0,
 'eggs': 0.0,
 'waterproof': 0.0,
 'plan': 0.0,
 'aiming': 0.0,
 'does': 0.0,
 'b': 0.0,
 'transportation': 0.0,
 'power': 0.0,
 'coral': 0.0,
 'my': 0.0,
 'from': 0.0,
 'new': 0.0,
 'process': 0.0,
 'battery': 0.005952380952380952,
 'climate': 0.0,
 'symbolized': 0.0,
 'conditions': 0.0,
 'blender': 0.0,
 'fresh': 0.0,
 'button': 0.0,
 'with': 0.005952380952380952,
 'division': 0.0,
 'a': 0.0,
 'study': 0.0,
 'horizon': 0.0,
 'life': 0.005952380952380952,
 'public': 0.0,
 'human': 0.0,
 'exceeded': 0.0,
 'terms': 0.0,
 'only': 0.0,
 'sound': 0.005952380952380952,
 'revealing': 0.0,
 'before': 0.0,
 'breathable': 0.0,
 'in': 0.0,
 'your': 0.0,
 'soups': 0.0,
 'reunification': 0.0,
 'japanese': 0.0,
 'device': 0.0,
 'software': 0.0,
 '12hour': 0.005952380952380952,
 'experience': 0.005952380952380952,
 'significant': 0.0,
 'wait': 0.0,
 'chemical': 0.0,
 'fold': 0.0,
 'cancellation': 0.005952380952380952,
 'preheat': 0.0,
 'evocative': 0

In [122]:
import numpy as np
word_counts_texts_dict_list = []
for key, value in word_counts_texts.items():
    word_counts_texts_dict_list.append(value)
    
N = len(word_counts_texts_dict_list)

idf_dict = dict.fromkeys(word_counts_texts_dict_list[0].keys(), 0)

for word in idf_dict:
    idf_dict[word] = sum(doc[word] > 0 for doc in word_counts_texts_dict_list)

for word, df in idf_dict.items():
    idf_dict[word] = np.log10((N + 1.0) / (df + 1.0))
    
pd.DataFrame([idf_dict])
    


Unnamed: 0,yesterday,it,producing,eggs,waterproof,plan,aiming,does,b,transportation,...,reef,save,release,capabilities,ravaged,golden,is,25,love,through
0,0.90309,0.90309,0.90309,0.90309,0.90309,0.90309,0.90309,0.90309,0.90309,0.90309,...,0.90309,0.90309,0.90309,0.90309,0.90309,0.90309,0.60206,0.90309,0.90309,0.90309


In [None]:
import numpy as np


def idf(word_counts: list[dict[str, int]]) -> dict:
    """Given N documents, no. of documents in which the the term appears for each term"""
    idf_dict = {}
    N = len(word_counts)

    idf_dict = dict.fromkeys(word_counts[0].keys(), 0)

    for word in idf_dict.keys():
        idf_dict[word] = sum(doc[word] > 0 for doc in word_counts)

    for word, df in idf_dict.items():
        idf_dict[word] = np.log10((N + 1.0) / (df + 1.0))

    return idf_dict


idfs = idf([word_count_a, word_count_b])
idfs