In [25]:
import numpy as np 
import pandas as pd 
from collections import Counter
import re
import string
import random
import math


In [6]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    df.columns = ['text', 'spam']
    df.dropna(inplace=True)
    return df


In [9]:
df = load_data(r'F:\EmailDetectionSpam\emails.csv\emails.csv')
print(df.head())
print("Dataset shape: ", df.shape)

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1
Dataset shape:  (5728, 2)


Clean text (Lowercase, Remove Puntuation, Numbers, and Emails )

In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove numbers
    text = re.sub(r'\S*@\S*\s?', '', text) # remove emails
    text = re.sub(r"htpp\S+","",text) # remove urls
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    return text.strip()

In [33]:
sample_text = "Hello! This is a test email. Visit http://example.com for more info."
print("Original text: ", sample_text)
print("Cleaned text: ", clean_text(sample_text))


Original text:  Hello! This is a test email. Visit http://example.com for more info.
Cleaned text:  hello this is a test email visit httpexamplecom for more info


Tokenization 

In [15]:
def tokenize(text):
    return text.split()

In [17]:
print(tokenize("This a sample email."))
sample_text = "Hello! This is a test email. Visit http://example.com for more info."
print(tokenize(clean_text(sample_text)))


['This', 'a', 'sample', 'email.']
['hello', 'this', 'is', 'a', 'test', 'email', 'visit', 'httpexamplecom', 'for', 'more', 'info']


In [18]:
STOPWORDS = set([
    'the','is','in','it','and', 'to', 'a', 'for', 'of', 'on','this','that','with', 'at','as','but',
    'by', 'from', 'or','so','if','than','then', 'be', 'was', 'were', 'has', 'have','had', 'he', 'she','they',
    'we', 'you', 'not', 'an', 'are', 'i', 'me', 'my', 'our', 'us', 'your', 'can', 'do', 'does', 'did', 'will', 'shall'  
])

In [19]:
def remove_stopwords(tokens):
    return [word for word in tokens if word not in STOPWORDS]

In [20]:
tokens = ["this", "is", "a", "sample", "email"]
print(remove_stopwords(tokens))


['sample', 'email']


In [21]:
LEMMATIZATION = {
    'running': 'run', 'jumps': 'jump', 'easily' : 'easy', 'better': 'good', 'worst': 'bad', 'bigger': 'big',
    'smaller':'small', 'cars': 'car', 'dogs': 'dog', 'cats': 'cat'
}

In [23]:
def lemmatize(tokens):
    return [LEMMATIZATION.get(word, word) for word in tokens]

In [24]:
print(lemmatize(["running", "jumps", "better", "dogs"]))


['run', 'jump', 'good', 'dog']


In [28]:
def compute_tf(text_tokens):
    word_counts = Counter(text_tokens)
    total_words = len(text_tokens)
    return {word: count / total_words for word, count in word_counts.items()}

In [29]:
tokens = ['spam', 'email', 'spam', 'free']
print(compute_tf(tokens))

{'spam': 0.5, 'email': 0.25, 'free': 0.25}


In [26]:
def compute_idf(dataset):
    doc_count = len(dataset)
    word_doc_count = Counter()
    
    for text in dataset:
        words = set(tokenize(clean_text(text)))
        for word in words:
            word_doc_count[word] += 1
    return {word: math.log(doc_count / (1 + count)) for word, count in word_doc_count.items()}


In [30]:
idf_values = compute_idf(df['text'])
print("Sample IDF Values:", list(idf_values.items())[:10])

Sample IDF Values: [('specially', 5.069602770184372), ('distinctive', 5.357284842636153), ('image', 3.970990481516262), ('unlimited', 4.448429089249515), ('iogo', 5.517627492711332), ('no', 1.7265446754177567), ('catchy', 5.434245883772281), ('hand', 3.6491754026950223), ('creativeness', 5.434245883772281), ('hard', 3.4060976364799953)]


In [31]:
def compute_tf_idf(text_tokens, idf_values):
    """Computes TF-IDF score for each word."""
    tf = compute_tf(text_tokens)
    return {word: tf[word] * idf_values.get(word, 0) for word in tf}

In [32]:
tokens = ["spam", "email", "spam", "free"]
print(compute_tf_idf(tokens, idf_values))


{'spam': 2.4821211272632726, 'email': 0.42130312670964937, 'free': 0.5320230126992549}
