In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Set the column width to view the text message data.
pd.set_option('max_colwidth', 200)

In [3]:
# Load the dataset into a DataFrame
sms_text_df = pd.read_csv('./Resources/SMSSpamCollection.csv')

sms_text_df.head()

Unnamed: 0,label,text_message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [4]:
# Check for missing values. 
sms_text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         5572 non-null   object
 1   text_message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
sms_text_df.isna().sum()

label           0
text_message    0
dtype: int64

In [6]:
#  Get the number of "ham" and "spam" from the "label" column:
sms_text_df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

## Split the data into train & test sets:

In [7]:
# Set the features variable to the text message. 

# Set the target variable to the "label" column.


# Split data into training and testing and set the test_size = 33%

X = sms_text_df['text_message']
y = sms_text_df['label']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
# Create an instance of the TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")

# Transform the data and use the original X_train set.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

(3733, 6823)

In [9]:
# What stopwords are in the scikit-learn's built-in list
from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)

frozenset({'cannot', 'sixty', 'fire', 'his', 'yourself', 'hence', 'are', 'everywhere', 'empty', 'latter', 'almost', 'himself', 'why', 'whereas', 'beyond', 'call', 'should', 'thru', 'these', 'still', 'former', 'ten', 'below', 'were', 'hasnt', 'somewhere', 'whereafter', 'under', 'off', 'much', 'was', 'hundred', 'whom', 'due', 'there', 'inc', 'own', 'nothing', 'yours', 'over', 'somehow', 'more', 'except', 'put', 'system', 'toward', 'made', 'a', 'cant', 'whither', 'nine', 'down', 'otherwise', 'two', 'by', 'bottom', 'via', 'with', 'nor', 'upon', 'may', 'not', 'in', 'herself', 'last', 'next', 'along', 'up', 'throughout', 'but', 'also', 'even', 'if', 'that', 'would', 'five', 'none', 'yourselves', 'afterwards', 'part', 'here', 'mill', 'has', 'few', 'out', 'thereafter', 'further', 'rather', 'the', 'per', 'meanwhile', 'your', 'about', 'latterly', 'thereby', 'fifty', 'herein', 'get', 'four', 'amount', 'my', 'they', 'do', 'least', 'whenever', 'must', 'nowhere', 'whereupon', 'so', 'until', 'please'

In [10]:
# Create a list to hold the words using the vectorizer.get_feature_names_out()
words = list(vectorizer.get_feature_names_out())

# Create a list to hold the frequency using np.ravel(X.sum(axis=0))
frequency = list(np.ravel(X_train_tfidf.sum(axis=0)))

In [11]:
# Create a DataFrame of the TF–IDF weights for each word in the working corpus.
messages_df = pd.DataFrame({
    "Word": words,
    "Frequency": frequency})


# Sort the DataFrame by word frequency in descending order and reset the index.
messages_df = messages_df.sort_values(by=["Frequency"], ascending=False).reset_index(drop=True)


# Display the first 10 rows of the DataFrame
messages_df.head(10)

Unnamed: 0,Word,Frequency
0,ok,69.321087
1,ll,54.283266
2,just,50.524804
3,ur,44.971124
4,come,44.65209
5,gt,44.16301
6,lt,44.05211
7,good,42.019014
8,time,39.63344
9,know,38.78232


In [12]:
# Display the last 10 rows of the DataFrame
messages_df.tail(10)

Unnamed: 0,Word,Frequency
6813,sympathetic,0.074647
6814,healer,0.074647
6815,dependable,0.074647
6816,determined,0.074647
6817,venaam,0.074647
6818,driver,0.074647
6819,aaniye,0.074647
6820,stylist,0.074647
6821,exterminator,0.074647
6822,pest,0.074647


In [13]:
X_train_tfidf

<3733x6823 sparse matrix of type '<class 'numpy.float64'>'
	with 29406 stored elements in Compressed Sparse Row format>

In [14]:
# Train the data on LinearSVC classifier.
linear_svc_model = LinearSVC()


# Fit the model to the transformed  data,
linear_svc_model.fit(X_train_tfidf, y_train)



In [15]:
X_test

3245    Squeeeeeze!! This is christmas hug.. If u lik my frndshp den hug me back.. If u get 3 u r cute:) 6 u r luvd:* 9 u r so lucky;) None? People hate u:
944                                    And also I've sorta blown him off a couple times recently so id rather not text him out of the blue looking for weed
1044                                                 Mmm thats better now i got a roast down me! id b better if i had a few drinks down me 2! Good indian?
2484                                                                                                          Mm have some kanji dont eat anything heavy ok
812                                        So there's a ring that comes with the guys costumes. It's there so they can gift their future yowifes. Hint hint
                                                                               ...                                                                         
4944                                                        Chec

In [16]:
# Determine predictions.  
predictions = linear_svc_model.predict(X_test)

ValueError: could not convert string to float: 'Squeeeeeze!! This is christmas hug.. If u lik my frndshp den hug me back.. If u get 3 u r cute:) 6 u r luvd:* 9 u r so lucky;) None? People hate u:'

In [18]:
# Transform the testing data like we did with the training data.
X_test_tfidf = vectorizer.transform(X_test)

# Make predictions 
predictions = linear_svc_model.predict(X_test_tfidf)
print(predictions[:50])

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam'
 'ham' 'ham']


In [19]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % linear_svc_model.score(X_train_tfidf, y_train))
print('Test Accuracy: %.3f' % linear_svc_model.score(X_test_tfidf, y_test))

Train Accuracy: 1.000
Test Accuracy: 0.989


## For efficiency,  build a Pipeline with the vectorizer and SVM model. 

In [21]:
# Build a pipeline to transform the test set to compare to the training set. 
text_clf = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('clf', LinearSVC()),
])

# Fit the model to the transformed data.
text_clf.fit(X_train, y_train)



In [22]:
# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % text_clf.score(X_train, y_train))
print('Test Accuracy: %.3f' % text_clf.score(X_test, y_test))

Train Accuracy: 1.000
Test Accuracy: 0.989


## Test the classifier and display results

In [23]:
# Form a prediction set
message_predictions = text_clf.predict(X_test)
print(message_predictions[0:30])

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'spam' 'ham' 'ham' 'ham' 'ham' 'ham']


In [24]:
# Create the confusion matrix on the test data and predictions
print(metrics.confusion_matrix(y_test,message_predictions))

# Print a classification report
print(metrics.classification_report(y_test,message_predictions))


# Print the overall accuracy
print(metrics.accuracy_score(y_test,message_predictions))

[[1587    6]
 [  15  231]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.94      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.97      1839
weighted avg       0.99      0.99      0.99      1839

0.9885807504078303


In [25]:
# Create some random text messages. 
text_1 = """You win $50!"""
text_2 = """Hi, my name is Brad!!"""
text_3 = """Talk to the single women in your area now!"""
text_4 = """How are u?"""

In [26]:
print(text_clf.predict([text_1]))
print(text_clf.predict([text_2]))
print(text_clf.predict([text_3]))
print(text_clf.predict([text_4]))

['spam']
['ham']
['ham']
['ham']


In [20]:
# Send the text messages to transform the data and predict the classification.


['ham']
['ham']
['spam']
['spam']
