# SMS Spam Detection using NLTK and Scikit-Learn

In [1]:
import pandas as pd
import csv
import nltk

In [2]:
messages = [line.rstrip() for line in open('../data/SMSSpamCollection.csv')]

In [3]:
print(len(messages))

5574


In [4]:
messages = pd.read_csv('../data/SMSSpamCollection.csv', sep='\t', quoting=csv.QUOTE_NONE, names=["label", "message"])

### Basic Data Analysis

In [5]:
data_size=messages.shape
print(data_size)

(5574, 2)


In [6]:
messages_col_names=list(messages.columns)
print(messages_col_names)

['label', 'message']


In [7]:
print(messages.groupby('label').describe())

      message                                                               
        count unique                                                top freq
label                                                                       
ham      4827   4518                             Sorry, I'll call later   30
spam      747    653  Please call our customer service representativ...    4


In [8]:
print(messages.head(3))

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...


In [9]:
message_target=messages['label'] 
print(message_target)

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5569    spam
5570     ham
5571     ham
5572     ham
5573     ham
Name: label, Length: 5574, dtype: object


#### Tokenization is a method to split a sentence/string into substrings. These substrings are called tokens.

In Natural Language Processing (NLP), tokenization is the initial step in preprocessing. Splitting a sentence into tokens helps to remove unwanted information in the raw text such as white spaces, line breaks and so on.

In [10]:
# nltk.download('all')
from nltk.tokenize import word_tokenize
def split_tokens(message):
    message = message.lower()
#     message = message.decode('utf-8') #convert bytes into proper unicode
    word_tokens = word_tokenize(message)
    return word_tokens

In [11]:
messages['tokenized_message'] = messages.apply(lambda row: split_tokens(row['message']), axis=1)

#### Lemmatization is a method to convert a word into its base/root form.

Lemmatizer removes affixes of the words present in its dictionary.

In [12]:
from nltk.stem.wordnet import WordNetLemmatizer
def split_into_lemmas(message):
    lemma = []
    lemmatizer = WordNetLemmatizer()
    for word in message:
        a=lemmatizer.lemmatize(word)
        lemma.append(a)
    return lemma

In [13]:
messages['lemmatized_message'] = messages.apply(lambda row: split_into_lemmas(row['tokenized_message']),axis=1)
print('Tokenized message:',messages['tokenized_message'][11])
print('Lemmatized message:',messages['lemmatized_message'][11])

Tokenized message: ['six', 'chances', 'to', 'win', 'cash', '!', 'from', '100', 'to', '20,000', 'pounds', 'txt', '>', 'csh11', 'and', 'send', 'to', '87575.', 'cost', '150p/day', ',', '6days', ',', '16+', 'tsandcs', 'apply', 'reply', 'hl', '4', 'info']
Lemmatized message: ['six', 'chance', 'to', 'win', 'cash', '!', 'from', '100', 'to', '20,000', 'pound', 'txt', '>', 'csh11', 'and', 'send', 'to', '87575.', 'cost', '150p/day', ',', '6days', ',', '16+', 'tsandcs', 'apply', 'reply', 'hl', '4', 'info']


#### Stop words are commons words that do not add any relevance for classification (For eg. “the”, “a”, “an”, “in” etc.). Hence, it is essential to remove these words.

In [14]:
from nltk.corpus import stopwords
def stopword_removal(message):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = []
    filtered_sentence = ' '.join([word for word in message if word not in stop_words])
    return filtered_sentence

In [15]:
messages['preprocessed_message'] = messages.apply(lambda row: stopword_removal(row['lemmatized_message']),axis=1)
Training_data=pd.Series(list(messages['preprocessed_message']))
Training_label=pd.Series(list(messages['label']))

 #### Bag Of Words(BOW)

 - Bag of Words (BOW) is one of the most widely used methods for generating features in Natural Language Processing.
 - Representing/ Transforming a text into a bag of words helps to identify various measures to characterize the text.
 - Predominantly used for calculating the term (word) frequency or the number of times a term occurs in a document/sentence.
 - It can be used as a feature for training the classifier.
 
#### Term Document Matrix

 - The Term Document Matrix (TDM) is a matrix that contains the frequency of occurrence of terms in a collection of documents.
 - In this matrix, the rows represent terms and columns represent the documents.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tf_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df = (1/len(Training_label)), max_df = 0.7)
Total_Dictionary_TDM = tf_vectorizer.fit(Training_data)
message_data_TDM = Total_Dictionary_TDM.transform(Training_data)

#### Term Frequency Inverse Document Frequency (TFIDF)

 - In a Term Frequency Inverse Document Frequency (TFIDF) matrix, the term importance is expressed by Inverse Document Frequency (IDF).
 - IDF diminishes the weight of the most commonly occurring words and increases the weightage of rare words.



In [17]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),min_df = (1/len(Training_label)), max_df = 0.7)
Total_Dictionary_TFIDF = tfidf_vectorizer.fit(Training_data)
message_data_TFIDF = Total_Dictionary_TFIDF.transform(Training_data)

#### Below example will cover classification using Term Document Matrix Dataset

In [18]:
seed=7

In [19]:
from sklearn.model_selection import train_test_split # splitting the data for training and testing
train_data, test_data, train_label, test_label = train_test_split(message_data_TDM, Training_label, test_size=.1, random_state=seed)

In [20]:
from sklearn.tree import DecisionTreeClassifier # creating a decision classifier model
classifier=DecisionTreeClassifier() # model training
classifier = classifier.fit(train_data, train_label) # after being fitted, the model can then be used to predict the output.
message_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('Decision Tree Classifier : ', score)

Decision Tree Classifier :  0.9695340501792115


#### Stochastic Gradient Descent Classifier

- This is used for large scale learning.
- This classifier supports different loss functions & penalties for classification.

In [21]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed)
classifier = classifier.fit(train_data, train_label)
message_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('SGD classifier : ', score)

SGD classifier :  0.9695340501792115


#### Support Vector Machine

Support Vector Machine(SVM) is effective in high-dimensional spaces.

- This is effective in cases where the number of dimensions is greater than the number of samples.
- This works well with a clear margin of separation.

In [22]:
from sklearn.svm import SVC
classifier = SVC(kernel="linear", C=0.025, random_state=seed)
classifier = classifier.fit(train_data, train_label)
message_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('SVM Classifier : ',score)

SVM Classifier :  0.9767025089605734


#### Random Forest Classifier

- Controls over-fitting.
- Here, a random forest fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy.

In [23]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10, random_state=seed)
classifier = classifier.fit(train_data, train_label)
message_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('Random Forest Classifier : ', score)

Random Forest Classifier :  0.8566308243727598


In [24]:
# Model Tuning

classifier = RandomForestClassifier(max_depth=5, n_estimators=15, max_features=60, random_state=seed)
classifier = classifier.fit(train_data, train_label)
message_predicted_target = classifier.predict(test_data)
score=classifier.score(test_data, test_label)
print('Random Forest classification after model tuning', score)

Random Forest classification after model tuning 0.8566308243727598


In [25]:
# Stratified Shuffle Split to handle class imbalance
seed=7
from sklearn.model_selection import StratifiedShuffleSplit
# cross validation with 10% sample size
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=seed)
sss.get_n_splits(message_data_TDM, Training_label)
print(sss)

StratifiedShuffleSplit(n_splits=1, random_state=7, test_size=0.1,
            train_size=None)


In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
classifiers = [
    DecisionTreeClassifier(),
    SGDClassifier(loss='modified_huber', shuffle=True),
    SVC(kernel="linear", C=0.025),
    KNeighborsClassifier(),
    OneVsRestClassifier(svm.LinearSVC()),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10),
   ]
for clf in classifiers:
    score=0
    for train_index, test_index in sss.split(message_data_TDM,Training_label):
        X_train, X_test = message_data_TDM [train_index], message_data_TDM [test_index]
        y_train, y_test = Training_label[train_index], Training_label[test_index]
        clf.fit(X_train, y_train)
        score=score+clf.score(X_test, y_test)
    print(score)

0.9623655913978495
0.967741935483871
0.9695340501792115
0.9014336917562724
0.974910394265233
0.8655913978494624


In [27]:
from sklearn.metrics import accuracy_score
print('Accuracy Score', accuracy_score(test_label, message_predicted_target))  
classifier = classifier.fit(train_data, train_label)
score=classifier.score(test_data, test_label)
test_label.value_counts()

Accuracy Score 0.8566308243727598


ham     478
spam     80
dtype: int64

In [28]:
from sklearn.metrics import confusion_matrix
print('Confusion Matrix \n', confusion_matrix(test_label,message_predicted_target))

Confusion Matrix 
 [[478   0]
 [ 80   0]]


In [29]:
from sklearn.metrics import classification_report
target_names = ['spam', 'ham']
print(classification_report(test_label, message_predicted_target, target_names=target_names))

              precision    recall  f1-score   support

        spam       0.86      1.00      0.92       478
         ham       0.00      0.00      0.00        80

    accuracy                           0.86       558
   macro avg       0.43      0.50      0.46       558
weighted avg       0.73      0.86      0.79       558



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### END