In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
df=pd.read_csv('SMSSpamCollection',header=None,encoding='utf-8',sep='\t')

In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [5]:
df[0].value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [6]:
#converting categorical data in numeric labels
# 0=ham,1=spam
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
classes=df[0]
Y=encoder.fit_transform(classes)

In [7]:
text_messages=df[1]
text_messages.sample(2)

3467    Actually fuck that, just do whatever, do find ...
4077    87077: Kick off a new season with 2wks FREE go...
Name: 1, dtype: object

In [8]:
# using regular expressions to replace email addresses, URLs, phone numbers, other numbers

# Replacing email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replacing URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replacing money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymbol')
    
# Replacing 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumber')
    
# Replacing numbers with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')

# Removing punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replacing whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Removing leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [9]:
# changing words to lower case - Ex-Prashant,prashAnt -> prashant
processed = processed.str.lower()
processed.sample(5)

4738    nt only for driving even for many reasons she ...
4353                     hows the pain dear y r u smiling
4902    taka lor wat time u wan number come n look num...
1672                               glad to see your reply
4056    when is school starting where will you stay wh...
Name: 1, dtype: object

In [10]:
from nltk.corpus import stopwords

In [11]:
# removing stop words from text messages

stop_words = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [12]:
# Removing word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
# creating bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [15]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6574
Most common words: [('number', 2759), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbolnumb', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [16]:
# using the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [17]:
# The find_features function will determine which of the 1500 word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Lets see an example!
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [18]:
processed[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [19]:
# features # run this command to see all the features

In [20]:
# doing it for all the messages
messages = list(zip(processed, Y))

# defining a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)


# calling find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [21]:
# spliting the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# spliting the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [22]:
print(len(training))
print(len(testing))

4179
1393


In [23]:
# using sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.63603732950466


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 93.3237616654702
Decision Tree Accuracy: 98.1335247666906
Random Forest Accuracy: 98.49246231155779
Logistic Regression Accuracy: 98.7078248384781
SGD Classifier Accuracy: 98.42067480258436
Naive Bayes Accuracy: 98.27709978463747
SVM Linear Accuracy: 98.63603732950466


In [25]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 98.63603732950466


In [26]:
# making class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)