In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB


In [2]:
spam = pd.read_csv('spam.csv', encoding="ISO-8859-1")
spam

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# check class distribution
classes = spam[spam.columns[0]]
print(classes.value_counts())

ham     4825
spam     747
Name: v1, dtype: int64


#### Data Preprocessing


In [4]:
# convert class labels to binary values, 0 = ham  1 = spam

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

# quick check
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: v1, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [5]:
# store SMS message data
text_messages = spam[spam.columns[1]]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: v2, dtype: object


In [6]:
# expressions can be found at https://regexlib.com/
# use regular expressions to replace email addresses, urls, phone numbers, etc.

# replace email addresses with 'emailaddr
processed = text_messages.str.replace(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$', 'emailaddr', regex=True)

# replace urls with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress', regex=True)
# replace money symbols with 'moneysymb'
processed = text_messages.str.replace(r'£|\$', 'moneysymb', regex=True)

# replace 10 digit phone numbers with 'phonenum'
processed = text_messages.str.replace(r'^[2-9]\d{2}-\d{3}-\d{4}$', 'phonenum', regex=True)

# replace normal numbers with 'num'
processed = text_messages.str.replace(r'\d+(\.\d+)?', 'num', regex=True)


In [7]:
# remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

  processed = processed.str.replace(r'[^\w\d\s]', ' ')
  processed = processed.str.replace(r'\s+', ' ')
  processed = processed.str.replace(r'^\s+|\s+?$', '')


In [8]:
# change all words to lower case
processed = processed.str.lower()
processed.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in num a wkly comp to win fa cup fi...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
Name: v2, dtype: object

In [9]:
nltk.download('stopwords')

# remove stop words from text messages
# stop words are basically a set of commonly used words in any language such as i, me, to, it, etc.

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

processed.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry num wkly comp win fa cup final tkts...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: v2, dtype: object

In [10]:
%pip install pickle


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle

[notice] A new release of pip available: 22.2.2 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


#### Tokenization

In [14]:
from nltk.tokenize import word_tokenize
nltk.download('punkt', 'english')
# create bag-of-words
all_words = []

for message in processed:
    words = nltk.word_tokenize(message)
    for w in words:
        all_words.append(w)

# FreqDist class is used to encode “frequency distributions”, which count the number of times that each outcome of an experiment occurs

all_words = nltk.FreqDist(all_words)

[nltk_data] Downloading package punkt to english...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\HP/nltk_data'
    - 'c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python39\\nltk_data'
    - 'c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python39\\share\\nltk_data'
    - 'c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python39\\lib\\nltk_data'
    - 'C:\\Users\\HP\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [13]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(10)))

Number of words: 0


AttributeError: 'list' object has no attribute 'most_common'

In [None]:
# use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [None]:
# find_features function will determine which of the 1500 word features are contained in the email/message
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# example
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)


The above words are key words that were saved as apart of the features (aka most common words) list that were found in the very first message.



In [None]:
# do it for all the messages
messages = list(zip(processed, Y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

Split data into testing and training sets.




# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [None]:
print('Training:',len(training))
print('Testing:',len(testing))

#### Scikit-Learn Classifier with NLTK

In [None]:
model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

In [None]:
# define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression()
]

models = zip(names, classifiers)
for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))


In [None]:
# ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression()
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

#### Naive Bayes Classifier Algorithm


In [None]:
model = MultinomialNB()
model.fit(x_train,ytrain)

x_test = cv.fit_transform(xtest)



In [None]:
x_test.toarray()


In [None]:
model.score(x_train,ytrain)
