In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

def eliminateStop(words, stop_words):
    returnList = [w for w in words if not w in stop_words]
    return returnList

def main():
    # Read In Data and Format It Better
    df = pd.read_csv('spam.csv', sep=',', encoding='ISO-8859-1')
    df = df.drop(columns=[ "Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

    # Add 2 New Columns: For binary spam/ham indicator and length of message
    df['Result']= df['Classification'].map( {'spam' : int(1), 'ham' : int(0)})
    df['Message_Size'] = df['Text'].apply(len)



    # Calculate Statistics
    totalMessages = df['Result'].count()
    numSpams = df[df['Result']==1]['Result'].count()
    numValid = df[df['Result'] == 0]['Result'].count()
   
    '''
    vocab = []
    # Get a total word count for unfiltered data
    for mes in df['Text']:
        for word in mes.split():
            if word.lower() not in vocab:
                vocab.append(word.lower())
    print(len(vocab))
    '''

    # Print Distribution
    print(f'{numSpams} of {totalMessages} messages are spam: {((numSpams/totalMessages)*100)}%')
   
    # Tokenize Each Message and add the list of tokens to the DataFrame
    df['Tokens'] = df['Text'].apply(word_tokenize)
    
    # Add Tokenize With Removed Stop Words
    stop_words = set(stopwords.words("english"))
    df['Filtered_Tokens'] = df['Tokens'].apply(eliminateStop, args=(stop_words,),)
    
    '''
    vocab2 = []
    for entry in df['Filtered_Tokens']:
        for tok in entry:
            if tok.lower() not in vocab2:
                vocab2.append(tok.lower())
    print(len(vocab2))
    '''
    
    # Create the Naive Bayes Classifier Object
    classifier = MultinomialNB()
    
    targs = df['Result'].values
    # "Vectorize" The Messages
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(df['Text'].values)
    
    print(f'Shape of counts vector: {counts.get_shape()}')

    # Train the data
    classifier.fit(counts, targs)
    
    # Test With Example Data
    examples = ['Free things !!! Get it to win big prize!', 'Hello, Susan. Going to game']
    examples_counts = vectorizer.transform(examples)
    predictions = classifier.predict(examples_counts)
    # Should print [1, 0] for (spam, ham)
    print(f' Prediction: {predictions}')
    
    # Test with the new set :)
    testDF = pd.read_csv('testset.csv', sep=',', encoding='ISO-8859-1')
    testDF['Result']= testDF['type'].map( {'spam' : int(1), 'ham' : int(0)})

    predictionsFull = classifier.predict(vectorizer.transform(testDF['text']))
    
    
    countRight = 0
    for i in range(len(predictionsFull)):
        if (predictionsFull[i] == testDF['Result'][i]):
            countRight+=1
            
    size = df['Result'].size
    print(f'Success Rate: {countRight / size * 100}%')
    
    
    # Confusion matrix
    print("Confusion Matrix: ")
    cm = confusion_matrix(testDF['Result'], predictionsFull)
    print(cm)
    
    #Accuracy Score 
    print("Accuracy: ")
    print(metrics.accuracy_score(testDF['Result'], predictionsFull))
    # Display the confusion matrix
   
    
    
if __name__ =='__main__':
    main()

747 of 5572 messages are spam: 13.406317300789663%
Shape of counts vector: (5572, 8672)
 Prediction: [1 0]
Success Rate: 99.03086862885858%
