In [8]:
# Assume spam_words.txt in the same folder
import os, re

def get_keywords(file):
    """read keywords from file to list
    text file structure: one word one line
    (exception handling ommited here)
    """
    with open(file, 'r') as f:
      return f.read().split('\n')
  
def remove_stopwords(text, stopword=[]):
    """Not implemented by empty stopword list"""
    return text

def clean(text):
    """"prepare string data
    remove leading & ending whitespace, 
      line breaks, punctuations
      from text in string literal
    do uppercase transformation
    """
    text = text.lower().strip().replace('\n', '')
    return re.sub(r'[^a-z0-9\w\s]', '', text)

# pipe
def calculate_score(arg, display=True): 
    """Calculate spam score based on the occurrence of spam keywords."""
    message, keywords = clean(arg[0]), arg[1]
    spam_points = 0
    spam_words_found = set()

    for keyword in keywords:
        count = message.count(keyword)
        if count > 0:
            spam_points += count
            spam_words_found.add(keyword)

    if display: 
        print("Spam keywords found:", spam_words_found)
    
    # Extract non-spam words
    pattern = '|'.join(spam_words_found)
    ham_words = re.sub(pattern, '', message).split()
    if display: 
        print("Non-spam words:", ham_words)

    ham_points = len(ham_words)
    score = (spam_points, ham_points, spam_points / max(ham_points, 1))  # Avoid division by zero
    return score

# pipe
def classify_email(score):
    """score: tuple(int, int, float)
    score[0]: spam points
    score[1]: ham points
    score[2]: ratio of score[0] to score[1]
    Classification intervals are user-specific
    """
    likelihood = score[2]
    if likelihood <= 0.2:
        return ("Likely not spam")
    elif likelihood <= 0.4:
        return ("Possible spam")
    else:
        return ("Likely spam")

# pipe
def display(data):
    if isinstance(data, tuple):
      for element in data: print(element)
    print(data)
    return data

# Define a pipeline
def pipeline(data, *kwargs): 
    end = data
    # unpack each function name argument in for-loop
    for fun in kwargs: 
      end = fun(end)
    return end

if __name__ == '__main__':
    # read spam words from file into list
    filename = 'spam_words.txt'
    spam_words = get_keywords(filename)
    # prompt raw message
    email = """Urgent! \nPlease verify your bank account by
    clicking the link: ACTION REQUIRED. Please verify your
    Bank of America account information to avoid a hold on
    your account. Click here to confirm: [Link]"""
    keywords = spam_words
    # group in tuple
    raw = (email, keywords)
    # run pipeline to display spam score and likelihood
    pipeline(raw, calculate_score, classify_email, display)

Spam keywords found: {''}
Non-spam words: ['urgent', 'please', 'verify', 'your', 'bank', 'account', 'by', 'clicking', 'the', 'link', 'action', 'required', 'please', 'verify', 'your', 'bank', 'of', 'america', 'account', 'information', 'to', 'avoid', 'a', 'hold', 'on', 'your', 'account', 'click', 'here', 'to', 'confirm', 'link']
Likely spam


In [6]:
# Assume spam_words.txt in the same folder
import os, re

def get_keywords(file):
    """read keywords from file to list
    text file structure: one word one line
    (exception handling ommited here)
    """
    with open(file, 'r') as f:
      return f.read().split('\n')
  
def remove_stopwords(text, stopword=[]):
    """Not implemented by empty stopword list"""
    return text

def clean(text):
    """prepare string data
    remove leading & ending whitespace, 
      line breaks, punctuations
      from text in string literal
    do uppercase transformation
    """
    text = text.lower().strip().replace('\n', '')
    return re.sub(r'[^\w\s]', '', text)

# pipe
def calculate_score(arg, display=True): 
    """arg: (str, List[str])
    arg[0]: email text in string
    arg[1]: List[str]: spam keywords
    """
    message, keywords = clean(arg[0]), arg[1]
    message = remove_stopwords(message) # not-implemented
    # calculate spam points; 
    # collect occurred spam words
    spam_points = 0
    spam = set()
    for keyword in keywords:
        if message.count(keyword) != 0:
            spam_points += message.count(keyword)
            spam.add(keyword)
    if display: print(spam)
    # extract ham words
    pattern = '|'.join(spam)
    ham = re.sub(pattern, '', message)
    # ham points
    ham = ham.split()
    if display: print(ham)
    ham_points = len(ham)
    # spamicity (likelihood)
    score = spam_points/ham_points
    return (spam_points, ham_points, score)

# pipe
def classify_email(score):
    """score: tuple(int, int, float)
    score[0]: spam points
    score[1]: ham points
    score[2]: ratio of score[0] to score[1]
    Classification intervals are user-specific
    """
    likelihood = score[2]
    if likelihood <= 0.2:
        return ("Likely not spam")
    elif likelihood <= 0.4:
        return ("Possible spam")
    else:
        return ("Likely spam")

# pipe
def display(data):
    if isinstance(data, tuple):
      for element in data: print(element)
    print(data)
    return data

# Define a pipeline
def pipeline(data, *kwargs): 
    end = data
    # unpack each function name argument in for-loop
    for fun in kwargs: 
      end = fun(end)
    return end

if __name__ == '__main__':
    # read spam words from file into list
    filename = 'spam_words.txt'
    spam_words = get_keywords(filename)
    # prompt raw message
    email = """Urgent! \nPlease verify your bank account by
    clicking the link: ACTION REQUIRED. Please verify your
    Bank of America account information to avoid a hold on
    your account. Click here to confirm: [Link]"""
    keywords = spam_words
    # group in tuple
    raw = (email, keywords)
    # run pipeline to display spam score and likelihood
    pipeline(raw, calculate_score, classify_email, display)

{''}
['urgent', 'please', 'verify', 'your', 'bank', 'account', 'by', 'clicking', 'the', 'link', 'action', 'required', 'please', 'verify', 'your', 'bank', 'of', 'america', 'account', 'information', 'to', 'avoid', 'a', 'hold', 'on', 'your', 'account', 'click', 'here', 'to', 'confirm', 'link']
Likely spam


In [7]:
import os, re

def get_keywords(file):
    """read keywords from file to list
    text file structure: one word one line
    (exception handling added)
    """
    try:
        with open(file, 'r') as f:
            return f.read().splitlines()
    except Exception as e:
        print(f"Error reading file: {e}")
        return []

def remove_stopwords(text, stopword=[]):
    """Remove stopwords from the text"""
    return ' '.join(word for word in text.split() if word not in stopword)
  
def clean(text):
    """prepare string data
    remove leading & ending whitespace, 
    line breaks, punctuations
    from text in string literal
    do lowercase transformation
    """
    text = text.lower().strip().replace('\n', '')
    return re.sub(r'[^\w\s]', '', text)

def calculate_score(arg, display=True): 
    """arg: (str, List[str])
    arg[0]: email text in string
    arg[1]: List[str]: spam keywords
    """
    message, keywords = clean(arg[0]), arg[1]
    message = remove_stopwords(message) # not-implemented
    spam_points = 0
    spam = set()
    for keyword in keywords:
        count = message.count(keyword)
        if count != 0:
            spam_points += count
            spam.add(keyword)
    if display: print(spam)
    
    # escape keywords for regex
    pattern = '|'.join(re.escape(keyword) for keyword in spam)
    ham = re.sub(pattern, '', message) if pattern else message

    ham = ham.split()
    if display: print(ham)
    ham_points = len(ham) or 1  # Avoid division by zero
    score = spam_points / float(ham_points)  # Use float division
    return (spam_points, ham_points, score)

def classify_email(score):
    likelihood = score[2]
    if likelihood <= 0.2:
        return "Likely not spam"
    elif likelihood <= 0.4:
        return "Possible spam"
    else:
        return "Likely spam"

def display(data):
    if isinstance(data, tuple):
        for element in data: print(element)
    else:
        print(data)
    return data

def pipeline(data, *kwargs): 
    end = data
    for fun in kwargs: 
        end = fun(end)
    return end

if __name__ == '__main__':
    filename = 'spam_words.txt'
    spam_words = get_keywords(filename)
    email = """Urgent! \nPlease verify your bank account by
    clicking the link: ACTION REQUIRED. Please verify your
    Bank of America account information to avoid a hold on
    your account. Click here to confirm: [Link]"""
    keywords = spam_words
    raw = (email, keywords)
    pipeline(raw, calculate_score, classify_email, display)


set()
['urgent', 'please', 'verify', 'your', 'bank', 'account', 'by', 'clicking', 'the', 'link', 'action', 'required', 'please', 'verify', 'your', 'bank', 'of', 'america', 'account', 'information', 'to', 'avoid', 'a', 'hold', 'on', 'your', 'account', 'click', 'here', 'to', 'confirm', 'link']
Likely not spam
