## This notebook involves using text messages from UC Irvine's spam SMS dataset, to classify text messages as either spam, or not spam

#### Inspired by Redwan Huq's post on In Machines We Trust

In [1]:
#Imports
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, svm
from sklearn.model_selection import (
    train_test_split, learning_curve, StratifiedShuffleSplit, GridSearchCV,
    cross_val_score)

In [2]:
# Load up data
dataSet = pd.read_csv('./spamdata/spamSet.txt', header=None, sep='\t')
dataSet.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
output = dataSet[0]
#Checking out our outputs
output.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [4]:
# Encode the class labels as numbers - it's a binary classification problem
labelEncoder = LabelEncoder()
output_enc = labelEncoder.fit_transform(output)

#Store the raw text separately
raw_text = dataSet[1]

In [5]:
# Access stop words
stop_words = nltk.corpus.stopwords.words('english')
# Acess porter stemmer
porter = nltk.PorterStemmer()
#Use both these tools, and some regex to preprocess the text messages
def processRaw(messy_string):
    assert(type(messy_string) == str)
    cleaned = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', messy_string)
    cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr',
                     cleaned)
    cleaned = re.sub(r'£|\$', 'moneysymb', cleaned)
    cleaned = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
        'phonenumbr', cleaned)
    cleaned = re.sub(r'\d+(\.\d+)?', 'numbr', cleaned)
    cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'^\s+|\s+?$', '', cleaned.lower())
    #Eliminate all the stop words after the message is put through the Porter Stemmer - this helps remove noise
    return ' '.join(
        porter.stem(term) 
        for term in cleaned.split()
        if term not in set(stop_words)
    )

In [6]:
example = """  ***** CONGRATlations **** You won 2 tIckETs to Hamilton in 
NYC http://www.hamiltonbroadway.com/J?NaIOl/event   wORtH over $500.00...CALL 
555-477-8914 or send message to: hamilton@freetix.com to get ticket !! !  """
processRaw(example)

'congratl numbr ticket hamilton nyc httpaddr worth moneysymbnumbr call phonenumbr send messag emailaddr get ticket'

In [7]:
#Apply this to our raw text data
processedText = raw_text.apply(processRaw)

In [8]:
# Construct a matrix using an n-gram model and implementing a tf-idf statistic - sklearn has a built in vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2)) #Let's look only at unigrams and bigrams
X_ngrams = vectorizer.fit_transform(processedText)
X_ngrams.shape

(5572, 36348)

### From this we see that we have a matrix with 30,000+ features, which means we have to use the right type of classifier to handle this much data

In [9]:
# Prepare the training and test sets using an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X_ngrams,
    output_enc,
    test_size=0.2,
    stratify=output_enc
)

# Training an SVM with a linear kernel on the training set
clf = svm.LinearSVC(loss='hinge')
clf.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = clf.predict(X_test)

# Compute the F1 score
metrics.f1_score(y_test, y_pred)

0.9436619718309859

In [10]:
# Display a confusion matrix
pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred),
    index=[['actual', 'actual'], ['spam', 'ham']],
    columns=[['predicted', 'predicted'], ['spam', 'ham']]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,spam,ham
actual,spam,965,1
actual,ham,15,134


### We now have a classifier which can classify text messages at a 90% accuracy rate

In [11]:
def isSpam(message):
    if clf.predict(vectorizer.transform([processRaw(message)])):
        return 'spam'
    else:
        return 'not spam'

In [12]:
isSpam('Hello World! How are you?')

'not spam'

In [13]:
isSpam('BUY FREE APARTMENT IN MIDTOWM CALL 9173823273')

'spam'

### However, 90% still leaves some margin for error - let's try and up this accuracy rate a bit