<a href="https://colab.research.google.com/github/pyro1152/sms_spam_detector/blob/main/sms_text_classification_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Widen the column display so long text messages are not truncated in DataFrame output.
# The fully qualified option name is used; pandas warns that shorthand patterns such as
# 'max_colwidth' can break if another option with a similar name is added later.
pd.set_option('display.max_colwidth', 200)

In [26]:
# Read the SMS dataset from a CSV file given by a relative path.
# NOTE(review): the following cell repeats this exact load, so one of the two is redundant.
sms_text_df = pd.read_csv('SMSSpamCollection.csv')

In [27]:
# Read the SMS CSV into a DataFrame, then preview its first five rows.
sms_text_df = pd.read_csv('SMSSpamCollection.csv')
sms_text_df.head(n=5)

Unnamed: 0,label,text_message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [28]:
# Check for missing values: info() lists each column's non-null count and dtype,
# which can be compared against the total number of entries.
sms_text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         5572 non-null   object
 1   text_message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [29]:
# Count how many messages carry each value of the "label" column ("ham" vs. "spam").
sms_text_df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

## Split the data into train & test sets:

In [30]:
# Features are the raw message text; the target is the "label" column.
X, y = sms_text_df['text_message'], sms_text_df['label']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Build a Pipeline with the vectorizer and SVM model.

In [31]:
# Chain a TF-IDF vectorizer (configured with English stop words) and a LinearSVC
# classifier, so raw text can be passed straight to fit/predict/score.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LinearSVC()),
    ]
)

# Fit both pipeline steps on the training split.
text_clf.fit(X_train, y_train)

In [32]:
# Score the fitted pipeline on both splits, then report each accuracy to three decimals.
train_accuracy = text_clf.score(X_train, y_train)
test_accuracy = text_clf.score(X_test, y_test)
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 1.000
Test Accuracy: 0.989


## Test the classifier and display results

In [33]:
# Hand-written sample messages used to spot-check the classifier.
text_1 = "You are a lucky winner of $5000!!"
text_2 = "You won 2 free tickets to the Super Bowl."
text_3 = "You won 2 free tickets to the Super Bowl text us to claim your prize"
text_4 = "Thanks for registering. Text 4343 to receive free updates on medicare"

In [34]:
# Run each sample message through the pipeline (one at a time, wrapped in a list)
# and print the predicted label for each.
for message in (text_1, text_2, text_3, text_4):
    print(text_clf.predict([message]))

['ham']
['ham']
['spam']
['spam']


In [35]:
def sms_classification(X_train, y_train):
    """Build and fit a TF-IDF + LinearSVC text-classification pipeline.

    Parameters
    ----------
    X_train
        Training text messages (the notebook passes the 'text_message' column).
    y_train
        Labels for ``X_train`` (the notebook passes the 'label' column).

    Returns
    -------
    Pipeline
        The pipeline after it has been fitted on the training data.
    """
    steps = [
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LinearSVC()),
    ]
    # Pipeline.fit returns the pipeline itself, so the fitted model can be returned directly.
    return Pipeline(steps=steps).fit(X_train, y_train)

In [37]:
# Reload the dataset and pick the feature (message text) and target (label) columns.
# The stray mid-cell `sms_text_df.head()` was removed: only the last expression of a
# cell is displayed, so that call produced no output and had no effect.
sms_text_df = pd.read_csv('SMSSpamCollection.csv')
X = sms_text_df['text_message']
y = sms_text_df['label']

# Split the data into training and testing sets (33% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Build the pipeline and fit the model
text_clf = sms_classification(X_train, y_train)

# Check the model accuracy
print('Train Accuracy: %.3f' % text_clf.score(X_train, y_train))
print('Test Accuracy: %.3f' % text_clf.score(X_test, y_test))

Train Accuracy: 1.000
Test Accuracy: 0.989


In [38]:
# A new message to classify with the fitted pipeline.
new_text = "Congratulations! You've won a free vacation. Click here to claim your prize."

# Wrap the single message in a list before passing it to predict().
prediction = text_clf.predict([new_text])

# Print the prediction; it is shown as a one-element sequence of labels.
print("Prediction for the new text:", prediction)

Prediction for the new text: ['spam']


In [39]:
def sms_prediction(text):
    """Classify a text message with the notebook-level ``text_clf`` model.

    Parameters
    ----------
    text : str
        The message to classify.

    Returns
    -------
    str
        A sentence stating whether the message is spam or not spam, or
        "Unable to classify the text message." when the input is missing or
        blank, or when the model returns a label other than 'ham' or 'spam'.
    """
    # A missing or whitespace-only message has nothing to classify; report that
    # with the existing fallback message rather than handing it to the model.
    if text is None or not str(text).strip():
        return "Unable to classify the text message."

    # `text_clf` is read from the notebook's global namespace; the single message is
    # wrapped in a list for predict() and the first (only) label is taken.
    prediction = text_clf.predict([text])[0]

    if prediction == 'ham':
        return f'The text message: "{text}", is not spam.'
    if prediction == 'spam':
        return f'The text message: "{text}", is spam.'
    return "Unable to classify the text message."

# Example usage: classify one sample message and print the sentence that is returned.
new_text = "You have won a prize! Claim now."
prediction_result = sms_prediction(new_text)
print(prediction_result)

The text message: "You have won a prize! Claim now.", is spam.


In [41]:
!pip install gradio
import gradio as gr

# Define the function for text classification
# NOTE(review): this redefines the sms_prediction already defined in an earlier cell;
# keep the two definitions in sync or drop one of them.
def sms_prediction(text):
    """Classify a text message with the notebook-level ``text_clf`` model.

    Parameters
    ----------
    text : str
        The message to classify (here, the contents of the Gradio input textbox).

    Returns
    -------
    str
        A sentence stating whether the message is spam or not spam, or
        "Unable to classify the text message." when the input is missing or
        blank, or when the model returns a label other than 'ham' or 'spam'.
    """
    # A missing or whitespace-only message has nothing to classify; report that
    # with the existing fallback message rather than handing it to the model.
    if text is None or not str(text).strip():
        return "Unable to classify the text message."

    # `text_clf` is read from the notebook's global namespace; the single message is
    # wrapped in a list for predict() and the first (only) label is taken.
    prediction = text_clf.predict([text])[0]

    if prediction == 'ham':
        return f'The text message: "{text}", is not spam.'
    if prediction == 'spam':
        return f'The text message: "{text}", is spam.'
    return "Unable to classify the text message."

# Build the Gradio UI: a five-line textbox for the incoming message and a five-line
# textbox for the classification sentence returned by sms_prediction.
message_box = gr.Textbox(
    lines=5,
    label="Enter the text message to classify",
    placeholder="Type your message here...",
)
result_box = gr.Textbox(label="Classification Result", type="text", lines=5)
app = gr.Interface(fn=sms_prediction, inputs=message_box, outputs=result_box)

# Start the Gradio app.
app.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://bfc138ca06b20f9bf3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


