In [None]:
import os
import re

class SpamDetector:
    """Keyword-based spam classifier.

    Every word of a message is looked up in a list of known spam keywords;
    the ratio of spam words to non-spam ("ham") words decides the label.
    """

    def __init__(self, spam_file):
        """Load the spam keywords from ``spam_file`` (one keyword per line)."""
        self.spam_words = self.get_keywords(spam_file)
        # Optional: Initialize stop_words

    def get_keywords(self, file):
        """Return the keywords stored in ``file`` as a list, in file order.

        Each line is stripped of surrounding whitespace and lowercased so that
        it can match the lowercased tokens produced by ``clean_text``; blank
        lines are skipped (the original kept ``''`` as a keyword).

        Raises:
            OSError: if ``file`` cannot be opened (e.g. it does not exist).
        """
        with open(file, 'r', encoding='utf-8') as f:
            stripped = (line.strip().lower() for line in f)
            return [word for word in stripped if word]

    def clean_text(self, text):
        """Lowercase ``text``, drop punctuation and normalise whitespace.

        Word characters (``\\w``: letters, digits, underscore), apostrophes
        and whitespace are kept; everything else is removed. Words are then
        re-joined with single spaces, so a newline separates words instead of
        gluing them together.
        """
        # ``\w`` already covers a-z and 0-9, so this is the same character
        # class the original pattern spelled out redundantly.
        without_punctuation = re.sub(r"[^\w'\s]", '', text.lower())
        return ' '.join(without_punctuation.split())

    def calculate_spam_score(self, text):
        """Count spam and ham words in ``text``.

        Returns:
            tuple: ``(spam_points, ham_points)`` where ``spam_points`` is the
            number of cleaned words found in ``self.spam_words`` and
            ``ham_points`` is the number of remaining words.
        """
        words = self.clean_text(text).split()
        # Optional: words = self.remove_stop_words(words)
        # Build the lookup set once so each membership test is O(1).
        spam_lookup = set(self.spam_words)
        spam_points = sum(1 for word in words if word in spam_lookup)
        return spam_points, len(words) - spam_points

    def classify_email(self, email_text):
        """Label ``email_text`` by its spam-to-ham word ratio.

        Returns:
            str: ``"Likely not spam"`` for a ratio up to 0.2,
            ``"Possible spam"`` up to 0.4, otherwise ``"Likely spam"``.
        """
        spam_points, ham_points = self.calculate_spam_score(email_text)
        score = spam_points / max(ham_points, 1)  # Avoid division by zero
        if score <= 0.2:
            return "Likely not spam"
        if score <= 0.4:
            return "Possible spam"
        return "Likely spam"

    # Additional methods like remove_stop_words can be added here.

if __name__ == '__main__':
    # Demo run: classify one sample message using the keyword list on disk.
    sample_email = """Urgent! \nPlease verify your bank account by
    clicking the link: ACTION REQUIRED. Please verify your
    Bank of America account information to avoid a hold on
    your account. Click here to confirm: [Link]"""
    spam_detector = SpamDetector('spam_words.txt')
    verdict = spam_detector.classify_email(sample_email)
    print(verdict)


In [None]:
import os
import re

class SpamDetector:
    """Keyword-based spam classifier with optional NLTK preprocessing.

    Scoring counts how many words of a message appear in a spam keyword set.
    Two cleaning pipelines are available:

    * ``clean_text`` - standard library only; also drops the words listed in
      the optional stop-word file.
    * ``preprocess_text`` - the NLTK pipeline (basic clean, tokenize,
      lemmatize, remove English stopwords). NLTK and its ``wordnet`` /
      ``stopwords`` corpora are imported lazily, so they are only required
      when one of the NLTK-backed helpers is actually called.
    """

    # Extra tokens that ``clean`` treats as stopwords on top of NLTK's list.
    ADDITIONAL_STOPWORDS = ['r', 'u', '2', '4', 'ltgt']

    def __init__(self, spam_file, stop_words_file=None):
        """Load spam keywords and, optionally, stop words (one per line)."""
        self.spam_words = self.get_keywords(spam_file)
        # get_keywords returns an empty set for ``None``, so ``stop_words`` is
        # always a set (the original mixed ``set`` and ``list``).
        self.stop_words = self.get_keywords(stop_words_file)

    # ------------------------------------------------------------------------------------
    def get_keywords(self, file):
        """Return the set of keywords stored in ``file``, one per line.

        Lines are stripped and lowercased so they can match the lowercased
        tokens produced by the cleaning methods; blank lines are skipped.
        A falsy ``file`` yields an empty set. A path that does not exist also
        yields an empty set, but emits a warning so that a mistyped path does
        not silently produce a detector that never matches anything.
        """
        if not file:
            return set()
        try:
            with open(file, 'r', encoding='utf-8') as f:
                stripped = (line.strip().lower() for line in f)
                return {word for word in stripped if word}
        except FileNotFoundError:
            import warnings
            warnings.warn(
                f"Keyword file not found: {file!r}; using an empty keyword set",
                stacklevel=2,
            )
            return set()

    # ------------------------------------------------------------------------------------
    def clean_text(self, text):
        """Lowercase ``text``, drop punctuation and remove ``self.stop_words``.

        Word characters (``\\w``), apostrophes and whitespace are kept. The
        result is the remaining words joined by single spaces; newlines count
        as word separators rather than being deleted.
        """
        without_punctuation = re.sub(r"[^\w'\s]", '', text.lower())
        return ' '.join(
            word for word in without_punctuation.split()
            if word not in self.stop_words
        )

    # ------------------------------------------------------------------------------------
    def calculate_spam_score(self, text, *, use_nltk=False):
        """Count spam and ham words in ``text``.

        Args:
            text: raw message text.
            use_nltk: when true, clean with ``preprocess_text`` (requires
                NLTK); otherwise clean with ``clean_text``.

        Returns:
            tuple: ``(spam_points, ham_points)``.
        """
        cleaned = self.preprocess_text(text) if use_nltk else self.clean_text(text)
        words = cleaned.split()
        spam_points = sum(1 for word in words if word in self.spam_words)
        return spam_points, len(words) - spam_points

    # ------------------------------------------------------------------------------------
    def classify_email(self, email_text, *, use_nltk=False):
        """Label ``email_text`` by its spam-to-ham word ratio.

        ``use_nltk`` is forwarded to ``calculate_spam_score``.

        Returns:
            str: ``"Likely not spam"`` for a ratio up to 0.2,
            ``"Possible spam"`` up to 0.4, otherwise ``"Likely spam"``.
        """
        spam_points, ham_points = self.calculate_spam_score(
            email_text, use_nltk=use_nltk
        )
        score = spam_points / max(ham_points, 1)  # Avoid division by zero
        if score <= 0.2:
            return "Likely not spam"
        if score <= 0.4:
            return "Possible spam"
        return "Likely spam"

    # *------------------*
    # |                  |
    # |     PREPARE      |
    # |                  |
    # *------------------*

    def preprocess_text(self, text):
        """Run the NLTK pipeline: clean, tokenize, lemmatize, drop stopwords."""
        text = self.basic_clean(text)
        text = self.tokenize(text)
        text = self.lemmatize(text)  # or self.stem(text)
        text = self.remove_stopwords(text)
        return text

    # ------------------------------------------------------------------------------------
    @staticmethod
    def basic_clean(string):
        """
        Lower Case:
        - setting all letters to a lowercase

        Encoding:
        - `unicodedata.normalize` removes any inconsistencies in unicode character encoding
        - `.encode` to convert the resulting string to the ASCII character set
        - `.decode` to turn the resulting bytes object back into a string

        Special characters:
        - remove anything that isn't a-z, a number, a single quote, or a whitespace
        """
        import unicodedata

        string = string.lower()
        # Decompose accented characters, then drop everything outside ASCII.
        string = (unicodedata.normalize('NFKD', string)
                  .encode('ascii', 'ignore')
                  .decode('utf-8'))
        return re.sub(r"[^a-z0-9'\s]", '', string)

    # ------------------------------------------------------------------------------------
    @staticmethod
    def tokenize(string):
        """
        Tokenization is the process of breaking something down
        into smaller, discrete units. These units are called tokens.

        It's common to tokenize the strings to break up words and punctuation
        left over into discrete units.

        Returns the tokenized text as a single string.
        """
        from nltk.tokenize import ToktokTokenizer

        tokenizer = ToktokTokenizer()
        return tokenizer.tokenize(string, return_str=True)

    # ------------------------------------------------------------------------------------
    @staticmethod
    def stem(string):
        """
        Stemming:
        - **truncates** words to their "stem"
        - algorithmic rules (non linguistic)
        - example: "calls", "called", "calling" --> "call"
        - fast and efficient

        Returns the stemmed words joined by single spaces.
        """
        from nltk.stem import PorterStemmer

        ps = PorterStemmer()
        # Stem word by word; stemming the whole string at once is meaningless.
        return ' '.join(ps.stem(word) for word in string.split())

    # ------------------------------------------------------------------------------------
    @staticmethod
    def lemmatize(string):
        """
        Lemmatize:
            - **changes** words to their "root"
            - it can conjugate to the base word
            - example: "mouse", "mice" --> "mouse"
            - slower than stemming

        Returns the lemmatized words joined by single spaces.
        """
        from nltk.stem import WordNetLemmatizer

        wnl = WordNetLemmatizer()
        return ' '.join(wnl.lemmatize(word) for word in string.split())

    # ------------------------------------------------------------------------------------
    def remove_stopwords(self, string_lemma):
        """
        Words which have little or no significance, especially when constructing
        meaningful features from text, are known as stopwords.
        - example: a, an, the, and like

        We will use a standard English language stopwords list from nltk.

        The instance itself is not used; the method is kept as an instance
        method so the original two-positional-argument call form still works.
        """
        return SpamDetector.remove_stopwords_extra_words(string_lemma)

    # ------------------------------------------------------------------------------------
    @staticmethod
    def remove_stopwords_extra_words(string_lemma, extra_words=(), exclude_words=()):
        """
        Remove stopwords from ``string_lemma`` using a customised stopword list.

        The list starts from nltk's standard English stopwords; every word in
        ``exclude_words`` is taken out of it and every word in ``extra_words``
        is added to it. Both default to empty.

        Returns the remaining words joined by single spaces.
        """
        from nltk.corpus import stopwords

        stopword_set = set(stopwords.words('english')) - set(exclude_words)
        stopword_set |= set(extra_words)
        return ' '.join(
            word for word in string_lemma.split() if word not in stopword_set
        )

    # ------------------------------------------------------------------------------------
    @classmethod
    def clean(cls, text):
        '''
        A simple function to cleanup text data.

        Args:
            text (str): The text to be cleaned.

        Returns:
            list: A list of lemmatized words after cleaning.
        '''
        import unicodedata
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer

        # Normalize text by removing diacritics, encoding to ASCII, decoding
        # to UTF-8, and converting to lowercase
        text = (unicodedata.normalize('NFKD', text)
                .encode('ascii', 'ignore')
                .decode('utf-8', 'ignore')
                .lower())

        # Remove punctuation, split text into words
        words = re.sub(r'[^\w\s]', '', text).split()

        wnl = WordNetLemmatizer()

        # Standard English stopwords plus the class-level additions; the
        # constant is a class attribute, so it must be reached through ``cls``.
        stopword_set = set(stopwords.words('english')) | set(cls.ADDITIONAL_STOPWORDS)

        # Lemmatize words and remove stopwords
        return [wnl.lemmatize(word) for word in words if word not in stopword_set]

if __name__ == '__main__':
    # Demo run: classify one sample message using the keyword and stop-word
    # lists on disk.
    sample_email = """Urgent! \nPlease verify your bank account by
    clicking the link: ACTION REQUIRED. Please verify your
    Bank of America account information to avoid a hold on
    your account. Click here to confirm: [Link]"""
    spam_detector = SpamDetector('spam_words.txt', 'stop_words.txt')
    verdict = spam_detector.classify_email(sample_email)
    print(verdict)


In [2]:
import os
import re
import unicodedata
import nltk
from nltk.corpus import stopwords

class SpamDetector:
    """Keyword-based spam classifier that reports its intermediate metrics.

    Scoring is done on the words returned by ``clean_text``. The NLTK
    preprocessing (``preprocess_text``) is only run for display, so passing
    ``verbose=False`` to the scoring methods skips it along with all printing.
    """

    def __init__(self, spam_file):
        """Load the spam keywords from ``spam_file`` (one keyword per line)."""
        self.spam_words = self.get_keywords(spam_file)

    def get_keywords(self, file):
        """Return the set of lowercased keywords stored in ``file``.

        Lines are stripped of surrounding whitespace and blank lines are
        skipped. A falsy ``file`` yields an empty set. A path that does not
        exist also yields an empty set, but emits a warning: with no keywords
        loaded every message scores zero spam points.
        """
        if not file:
            return set()
        try:
            with open(file, 'r', encoding='utf-8') as f:
                stripped = (line.strip().lower() for line in f)
                return {word for word in stripped if word}
        except FileNotFoundError:
            import warnings
            warnings.warn(
                f"Keyword file not found: {file!r}; using an empty keyword set",
                stacklevel=2,
            )
            return set()

    def clean_text(self, text):
        """Lowercase ``text``, fold it to ASCII and drop punctuation.

        Only a-z, digits, apostrophes and whitespace survive. Words are
        re-joined with single spaces, so a newline separates words instead of
        gluing them together.
        """
        text = (unicodedata.normalize('NFKD', text.lower())
                .encode('ascii', 'ignore')
                .decode('utf-8'))
        text = re.sub(r"[^a-z0-9'\s]", '', text)
        return ' '.join(text.split())

    def display_sample_spam_words(self):
        """Print up to ten of the loaded spam keywords."""
        print("Sample Spam Words:", list(self.spam_words)[:10])  # Display first 10 spam words

    def preprocess_text(self, text, *, verbose=True):
        """Clean ``text``, drop English stopwords and lemmatize what remains.

        Requires NLTK's ``stopwords`` and ``wordnet`` corpora. When
        ``verbose`` is true the lemmatized word list is printed.

        Returns:
            str: the lemmatized words joined by single spaces.
        """
        words = self.clean_text(text).split()
        wnl = nltk.stem.WordNetLemmatizer()
        stopwords_set = set(stopwords.words('english'))
        lemmatized_words = [wnl.lemmatize(word) for word in words if word not in stopwords_set]
        if verbose:
            print(lemmatized_words)
        return ' '.join(lemmatized_words)

    def calculate_spam_score(self, text, *, verbose=True):
        """Count spam and ham words in ``text`` and derive their ratios.

        Args:
            text: raw message text.
            verbose: when true (the default), also run ``preprocess_text`` and
                print the original words, the preprocessed words and every
                detected spam word. When false, nothing is printed and NLTK
                is not touched.

        Returns:
            tuple: ``(spam_points, ham_points, spam_ratio, ham_ratio)`` where
            the ratios are relative to the total number of cleaned words.
        """
        original_text = self.clean_text(text).split()

        if verbose:
            # Preprocessed words are informational only; scoring below uses
            # the un-lemmatized words so they line up with the keyword list.
            preprocessed_text = self.preprocess_text(text).split()
            print("Original Text Words:", original_text)
            print("Preprocessed Text Words:", preprocessed_text)

        spam_points, ham_points = 0, 0
        for word in original_text:
            if word in self.spam_words:
                spam_points += 1
                if verbose:
                    print(f"Spam Word Detected: {word}")
            else:
                ham_points += 1

        # Guard against an empty message when computing the ratios.
        total_words = max(len(original_text), 1)
        spam_ratio = spam_points / total_words
        ham_ratio = ham_points / total_words

        return spam_points, ham_points, spam_ratio, ham_ratio

    def classify_email(self, email_text, *, verbose=True):
        """Label ``email_text`` by its spam-to-ham word ratio.

        ``verbose`` is forwarded to ``calculate_spam_score`` and also controls
        whether the metrics are printed here.

        Returns:
            str: ``"Likely not spam"`` for a ratio up to 0.2,
            ``"Possible spam"`` up to 0.4, otherwise ``"Likely spam"``.
        """
        spam_points, ham_points, spam_ratio, ham_ratio = self.calculate_spam_score(
            email_text, verbose=verbose
        )
        score = spam_points / max(ham_points, 1)  # Avoid division by zero

        if verbose:
            # Display metrics
            print(f"Spam Points: {spam_points}")
            print(f"Ham Points: {ham_points}")
            print(f"Spam Ratio: {spam_ratio:.2f}")
            print(f"Ham Ratio: {ham_ratio:.2f}")

        # Classification
        if score <= 0.2:
            return "Likely not spam"
        if score <= 0.4:
            return "Possible spam"
        return "Likely spam"

if __name__ == '__main__':
    # Demo run: classify one sample message and print the resulting label
    # (the detector prints its own metrics along the way).
    sample_email = """Urgent! \nPlease verify your bank account by
    clicking the link: ACTION REQUIRED. Please verify your
    Bank of America account information to avoid a hold on
    your account. Click here to confirm: [Link]"""
    spam_detector = SpamDetector('spam_words.txt')
    verdict = spam_detector.classify_email(sample_email)
    print(verdict)

['urgent', 'please', 'verify', 'bank', 'account', 'clicking', 'link', 'action', 'required', 'please', 'verify', 'bank', 'america', 'account', 'information', 'avoid', 'hold', 'account', 'click', 'confirm', 'link']
Original Text Words: ['urgent', 'please', 'verify', 'your', 'bank', 'account', 'by', 'clicking', 'the', 'link', 'action', 'required', 'please', 'verify', 'your', 'bank', 'of', 'america', 'account', 'information', 'to', 'avoid', 'a', 'hold', 'on', 'your', 'account', 'click', 'here', 'to', 'confirm', 'link']
Preprocessed Text Words: ['urgent', 'please', 'verify', 'bank', 'account', 'clicking', 'link', 'action', 'required', 'please', 'verify', 'bank', 'america', 'account', 'information', 'avoid', 'hold', 'account', 'click', 'confirm', 'link']
Spam Points: 0
Ham Points: 32
Spam Ratio: 0.00
Ham Ratio: 1.00
Likely not spam
