In [None]:
# Load the required packages
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, svm
from sklearn.model_selection import (
    train_test_split, learning_curve, StratifiedShuffleSplit, GridSearchCV,
    cross_val_score)
 
# Improve the readability of figures
sns.set_context('notebook', font_scale=1.4)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [None]:
# Read the tab-separated SMS corpus. The file has no header row, so the
# columns are labelled 0 (class) and 1 (message text).
df = pd.read_csv('data/SMSSpamCollection.txt', sep='\t', header=None)

# Preview the first five rows
df.head(5)

  


Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Summarize the column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [None]:
# Keep the label column (ham / spam) as the target variable
y = df.loc[:, 0]

# Show how many messages fall into each class
y.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [None]:
# Convert the string class labels into integer codes
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)

In [None]:
# Keep the message column as the raw, unprocessed text
raw_text = df.loc[:, 1]

In [None]:
# A made-up promotional message with irregular casing and spacing that
# contains a URL, a price, a phone number and an email address
example = """  ***** CONGRATlations **** You won 2 tIckETs to Hamilton in 
NYC http://www.hamiltonbroadway.com/J?NaIOl/event   wORtH over $500.00...CALL 
555-477-8914 or send message to: hamilton@freetix.com to get ticket !! !  """

- Regex for email addresses (based upon `@` and `.`): `\b[\w\-.]+?@\w+?\.\w{2,4}\b`
- Regex for http(s) addresses: `(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)`
- Regex for money symbols (pounds or dollars): `£|\$`
- Regex for phone numbers: `\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b`
- Regex for numbers: `\d+(\.\d+)?`
- Regex to remove punctuation: `[^\w\d\s]`
- Regex to replace runs of whitespace with a single space: `\s+`
- Regex to remove leading and trailing whitespace: `^\s+|\s+?$`

In [None]:
# Every pattern below is a regular expression, so `regex=True` is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave the text unchanged.
# The order matters: emails are replaced before URLs and phone numbers before
# plain numbers, because the later patterns would also match parts of them.

# Replace email addresses with 'emailaddr'
processed = raw_text.str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b',
                                 'emailaddr', regex=True)

# Replace URLs with 'httpaddr'
processed = processed.str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)',
                                  'httpaddr', regex=True)

# Replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$', 'moneysymb', regex=True)

# Replace phone numbers with 'phonenumbr'
processed = processed.str.replace(
    r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
    'phonenumbr', regex=True)

# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr', regex=True)

In [None]:
# The patterns are regular expressions; `regex=True` is required on
# pandas >= 2.0, where the default is a literal (non-regex) replacement.

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ', regex=True)

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ', regex=True)

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '', regex=True)

In [None]:
# Lowercase every message in the corpus
processed = processed.str.lower()

In [None]:
# Access the English stop-word list. NLTK raises LookupError when the corpus
# has not been downloaded yet, so fetch it on first use; this lets the
# notebook run from a fresh environment.
try:
    stop_words = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = nltk.corpus.stopwords.words('english')

In [None]:
# Remove all stop words. The lookup set is built once up front instead of
# being rebuilt for every single term.
stop_word_set = set(stop_words)
processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_word_set)
)

In [None]:
# Reduce every term to its stem using a Porter stemmer
porter = nltk.PorterStemmer()
processed = processed.apply(
    lambda message: ' '.join(map(porter.stem, message.split()))
)

In [None]:
def preprocess_text(messy_string):
    """Normalize a single raw message into a cleaned, stemmed string.

    The steps are applied in this order: replace email addresses, URLs,
    money symbols, phone numbers and remaining numbers with placeholder
    tokens; replace punctuation with spaces; collapse whitespace; lowercase
    and trim; drop stop words; stem the remaining terms.

    Relies on the notebook-level names `stop_words` and `porter`.

    Parameters
    ----------
    messy_string : str
        The raw message text.

    Returns
    -------
    str
        The remaining stemmed terms joined by single spaces.

    Raises
    ------
    AssertionError
        If `messy_string` is not a string.
    """
    assert isinstance(messy_string, str), 'messy_string must be a str'
    # Placeholder substitutions: emails go before URLs and phone numbers before
    # plain numbers, since the later patterns would also match parts of them.
    cleaned = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', messy_string)
    cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr',
                     cleaned)
    cleaned = re.sub(r'£|\$', 'moneysymb', cleaned)
    cleaned = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
        'phonenumbr', cleaned)
    cleaned = re.sub(r'\d+(\.\d+)?', 'numbr', cleaned)
    # Punctuation becomes a space, then whitespace is collapsed and trimmed
    cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'^\s+|\s+?$', '', cleaned.lower())
    # Build the stop-word lookup once per call rather than once per term
    stop_word_set = set(stop_words)
    return ' '.join(
        porter.stem(term)
        for term in cleaned.split()
        if term not in stop_word_set
    )

In [None]:
# Confirm that the function reproduces the step-by-step pipeline on every message
raw_text.apply(preprocess_text).eq(processed).all()

True

Additionally, let's test `preprocess_text()` on the hypothetical SMS message from earlier.

In [None]:
# Run the example message through the preprocessing function
preprocess_text(example)

'congratl numbr ticket hamilton nyc httpaddr worth moneysymbnumbr call phonenumbr send messag emailaddr get ticket'

In [None]:
# Build the design matrix from tf-idf weighted unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(processed)
X_ngrams = vectorizer.transform(processed)

In [None]:
# Inspect the dimensions of the design matrix (messages x n-gram features)
X_ngrams.shape

(5572, 36348)

In [None]:
# Prepare the training and test sets using an 80/20 split. The split is made
# on the preprocessed text rather than on `X_ngrams`, so the held-out messages
# play no part in fitting the tf-idf vocabulary or the IDF weights.
text_train, text_test, y_train, y_test = train_test_split(
    processed,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc
)

# Fit the n-gram tf-idf features on the training messages only, then apply
# the fitted transformation to the test messages
split_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = split_vectorizer.fit_transform(text_train)
X_test = split_vectorizer.transform(text_test)

# Train SVM with a linear kernel on the training set (seeded so the score is
# reproducible across runs)
clf = svm.LinearSVC(loss='hinge', random_state=42)
clf.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = clf.predict(X_test)

# Compute the F1 score
metrics.f1_score(y_test, y_pred)

0.9285714285714286

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
# Candidate values of the regularization parameter C for the grid search
param_grid = dict(C=[0.1, 1, 10, 1000, 10000, 100000])

In [None]:
# Linear-kernel SVM with balanced class weights, used as the grid-search estimator
classifier_svm = svm.SVC(
    kernel="linear",
    class_weight="balanced",
    random_state=42,
)

In [None]:
# Grid search over `param_grid` with 5-fold stratified cross-validation,
# scored on recall
cv_folds = StratifiedKFold(5)
grid_search = GridSearchCV(
    estimator=classifier_svm,
    param_grid=param_grid,
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
    scoring="recall",
)

In [None]:
# Identify the optimal regularization hyperparameter
grid_search.fit(X_ngrams, y_enc)

# Train the classifier on the entire dataset using the optimal hyperparameter.
# The seed makes the fitted weights reproducible from run to run.
# NOTE(review): C is tuned on `classifier_svm` (SVC with balanced class
# weights) but applied here to a LinearSVC without class weights -- confirm
# that this mismatch is intended.
final_clf = svm.LinearSVC(
    loss='hinge',
    C=grid_search.best_params_['C'],
    random_state=42,
)
final_clf.fit(X_ngrams, y_enc);

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   19.2s finished


In [None]:
# Show the estimator selected by the grid search
grid_search.best_estimator_

SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)

In [None]:
# Display the features with the highest weights in the SVM model.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`, so use the new accessor when it exists and fall
# back to the old one on earlier releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

pd.Series(
    final_clf.coef_.T.ravel(),
    index=feature_names
).sort_values(ascending=False).head(20)

phonenumbr         4.877622
numbrp             2.811299
txt                2.697428
moneysymbnumbr     2.557909
call phonenumbr    2.339830
numbr              2.094059
mobil              2.064123
servic             1.996360
rington            1.951037
tone               1.792950
claim              1.599671
repli              1.592311
free               1.514682
text               1.513630
stop               1.347235
wap                1.252615
video              1.244379
credit             1.184765
uk                 1.164509
club               1.140393
dtype: float64

In [None]:
def spam_filter(message):
    """Classify a raw message as 'spam' or 'not spam'.

    The message is cleaned with `preprocess_text`, vectorized with the fitted
    `vectorizer` and scored by `final_clf`; a truthy prediction is reported
    as 'spam'.
    """
    features = vectorizer.transform([preprocess_text(message)])
    prediction = final_clf.predict(features)
    return 'spam' if prediction else 'not spam'

In [None]:
# Classify the example promotional message defined earlier
spam_filter(example)

'spam'

In [None]:
# Classify an ordinary conversational message
spam_filter('Ohhh, but those are the best kind of foods')

'not spam'