In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer # Import TF IDF 
from typing import List 
import math
import re

In [4]:
df = pd.read_csv('mail_data.csv')
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

In [6]:
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
print(df['Message'][1])

Ok lar... Joking wif u oni...


Implement `TF IDF`

In [32]:
import re
import math

def compute_tfidf(documents):
    # 1. Tokenize and clean a document
    def tokenize(doc):
        doc = re.sub(r'[\t\n\r]', ' ', doc)                 # Replace tabs/newlines
        doc = re.sub(r'[^\w\s]', '', doc.lower())           # Remove punctuation and lowercase
        doc = re.sub(r'\s+', ' ', doc).strip()              # Normalize whitespace
        return doc.split()                                  # Tokenize by splitting words

    # 2. Tokenize all documents
    tokenized_docs = [tokenize(doc) for doc in documents]

    # 3. Build vocabulary
    vocab = sorted(set(word for doc in tokenized_docs for word in doc))

    # 4. Compute Term Frequency (TF)
    def compute_tf(doc_tokens):
        tf = {}
        total_terms = len(doc_tokens)
        if total_terms == 0:
            return {word: 0 for word in vocab}
        for word in vocab:
            tf[word] = doc_tokens.count(word) / total_terms
        return tf

    tf_list = [compute_tf(doc) for doc in tokenized_docs]

    # 5. Compute Document Frequency (DF)
    def compute_df(docs):
        df = {}
        for word in vocab:
            df[word] = sum(1 for doc in docs if word in doc)
        return df

    df = compute_df(tokenized_docs)

    # 6. Compute Inverse Document Frequency (IDF)
    def compute_idf(df, N):
        idf = {}
        for word, doc_count in df.items():
            idf[word] = math.log(N / (1 + doc_count))
        return idf

    idf = compute_idf(df, len(documents))

    # 7. Compute TF-IDF for each document
    def compute_tfidf_vector(tf, idf):
        tfidf = {}
        for word in vocab:
            tfidf[word] = tf.get(word, 0) * idf.get(word, 0)
        return tfidf

    tfidf_list = [compute_tfidf_vector(tf, idf) for tf in tf_list]

    # 8. Round and clean up result
    result = []
    for tfidf in tfidf_list:
        result.append({word: round(score, 4) for word, score in tfidf.items() if score > 0})

    return result


In [35]:
dataset = compute_tfidf(df['Message'])

In [38]:
print(dataset[1])

{'joking': 1.1133, 'lar': 0.8313, 'ok': 0.5033, 'oni': 1.1693, 'u': 0.3224, 'wif': 0.8883}


# Implement TF-IDF using scikit-learn
1. Using data(a list of sentence)
2. Convert these text documents into numerical vectors
    - Represent how important each word is in that document
    - Based on its frequency(TF) and inverse document frequency(IDF)
3. `fit()` the data
    - Scans the text data and creates a dictionary of all word `TfidfVectorizer()`
    - Calculate how rare or common each word is across all documents(IDF) `.fit(documents)`
4. `transform()` the data(application step)
    - Convert the input documents into a numerical matrix base on the vocabulary and  IDF that were learned during `fit()`

In [48]:
scikitlearn_data = df['Message'].dropna().astype(str) # Remove missing row
# Create the vectorizer
scikitlearn_data = scikitlearn_data.str.lower().replace(r'[^\w\s]','',regex=True)
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform text
tfidf_matrix = vectorizer.fit_transform(scikitlearn_data)

In [52]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

tfidf_matrix.head()

AttributeError: 'csr_matrix' object has no attribute 'head'