In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Stop-word sets already loaded from NLTK, keyed by language, so the corpus
# is read at most once per language.
_STOP_WORD_CACHE = {}


def _load_stop_words(language="english"):
    """Return NLTK's stop-word list for *language* as a cached frozenset."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def eliminateStop(words, stop_words=None):
    """Return the tokens in *words* that are not stop words.

    Args:
        words: Iterable of string tokens (e.g. the output of ``word_tokenize``).
        stop_words: Container of words to drop. When omitted, NLTK's English
            stop-word list is used. Previously this function read a
            module-level ``stop_words`` name that ``main`` only ever bound as
            a local, so calling it from a fresh interpreter raised NameError.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    if stop_words is None:
        stop_words = _load_stop_words("english")
    # NLTK's list is lowercase, so also test the lowercased token; otherwise
    # capitalised stop words such as "The" or "I" would slip through.
    return [
        w for w in words
        if w not in stop_words and w.lower() not in stop_words
    ]


def main():
    """Load ``spam.csv``, print the spam ratio, and tokenize every message.

    Adds these columns to the DataFrame: ``Result`` (1 for spam, 0 for ham),
    ``Message_Size`` (``len`` of the text), ``Tokens`` (``word_tokenize``
    output) and ``Filtered_Tokens`` (tokens with stop words removed). Prints
    the spam distribution and the first rows of the resulting DataFrame.
    """
    # Read in the data and discard the three unnamed columns.
    df = pd.read_csv('spam.csv', sep=',', encoding='ISO-8859-1')
    df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

    # Add 2 new columns: binary spam/ham indicator and length of message.
    df['Result'] = df['Classification'].map({'spam': 1, 'ham': 0})
    df['Message_Size'] = df['Text'].apply(len)

    # Calculate statistics (count() ignores rows whose label did not map).
    totalMessages = df['Result'].count()
    numSpams = df[df['Result'] == 1]['Result'].count()

    # Print distribution
    print(f'{numSpams} of {totalMessages} messages are spam: {((numSpams/totalMessages)*100)}%')

    # Tokenize each message and add the list of tokens to the DataFrame.
    df['Tokens'] = df['Text'].apply(word_tokenize)

    # Load the stop words once and hand them to eliminateStop explicitly;
    # Series.apply forwards extra keyword arguments to the function.
    stop_words = _load_stop_words("english")
    df['Filtered_Tokens'] = df['Tokens'].apply(eliminateStop, stop_words=stop_words)

    print(df.head())
    
# Run the analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

747 of 5572 messages are spam: 13.406317300789663%
  Classification                                               Text  Result  \
0            ham  Go until jurong point, crazy.. Available only ...       0   
1            ham                      Ok lar... Joking wif u oni...       0   
2           spam  Free entry in 2 a wkly comp to win FA Cup fina...       1   
3            ham  U dun say so early hor... U c already then say...       0   
4            ham  Nah I don't think he goes to usf, he lives aro...       0   

   Message_Size                                             Tokens  \
0           111  [Go, until, jurong, point, ,, crazy, .., Avail...   
1            29           [Ok, lar, ..., Joking, wif, u, oni, ...]   
2           155  [Free, entry, in, 2, a, wkly, comp, to, win, F...   
3            49  [U, dun, say, so, early, hor, ..., U, c, alrea...   
4            61  [Nah, I, do, n't, think, he, goes, to, usf, ,,...   

                                     Filtered_Tokens 