In [1]:
pip install numpy pandas scikit-learn nltk



In [2]:
import csv
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Download NLTK data
# word_tokenize needs the Punkt sentence-tokenizer models. Recent NLTK releases
# look them up under 'punkt_tab', older ones under 'punkt'; fetching both keeps
# the notebook working regardless of which NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists used by preprocess_text
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Load the dataset
# Location of the tab-separated data file (one "label<TAB>message" row per line,
# no header row). Relative paths are resolved against the kernel's working directory.
DATA_PATH = 'SMSSpamCollection'

# Quoting is disabled so that a '"' inside a message is kept as an ordinary
# character. With pandas' default quoting, an unbalanced quote is parsed as the
# start of a quoted field and can merge several lines into a single row.
df = pd.read_csv(
    DATA_PATH,
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

# Convert labels to binary values (spam = 1, ham = 0)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Series.map() turns every label that is not 'spam' or 'ham' into NaN. Stop here
# with a clear message instead of failing later inside the classifier.
if df['label'].isna().any():
    raise ValueError(
        f"{DATA_PATH!r} contains labels other than 'spam' and 'ham'"
    )

# Display the first few rows
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'SMSSpamCollection'

In [4]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily populated cache for the English stop-word set (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a message for vectorization.

    Steps: lowercase, delete ASCII punctuation, tokenize with
    ``word_tokenize``, drop English stop words, and re-join the remaining
    tokens with single spaces.

    Args:
        text: The raw message. ``None`` and float NaN (pandas' marker for a
            missing value) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned message as a single space-separated string ('' when
        nothing is left).
    """
    # Missing values have no content to clean (NaN is the only float that
    # is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove punctuation
    # NOTE(review): this runs before stop-word filtering, so contractions lose
    # their apostrophe ("you've" -> "youve") before being compared against the
    # stop-word list -- confirm this is intended.
    text = text.translate(_PUNCT_TABLE)

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords and join the remaining tokens back into a string
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

# Replace each raw message with its cleaned form produced by preprocess_text
df['message'] = df['message'].map(preprocess_text)

# Show a sample of the frame now that the messages have been cleaned
print(df.head())

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
messages, labels = df['message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorizer whose vocabulary is capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and weights from the training messages and vectorize them
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Vectorize the held-out messages with the already-fitted vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Multinomial Naive Bayes classifier with its default settings
model = MultinomialNB()

# Fit the classifier on the TF-IDF training matrix and the matching labels
model.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predicted labels for the held-out messages
y_pred = model.predict(X_test_tfidf)

# Compute the metrics from the true and predicted test labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the results
print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

In [None]:
def predict_spam(sms):
    """Classify a single SMS message as "Spam" or "Ham".

    The message is cleaned with ``preprocess_text``, vectorized with the
    already-fitted ``tfidf_vectorizer`` and passed to ``model.predict``.

    Args:
        sms: The raw message text.

    Returns:
        "Spam" when the predicted label equals 1, otherwise "Ham".
    """
    cleaned = preprocess_text(sms)

    # The vectorizer expects an iterable of documents, hence the one-item list
    features = tfidf_vectorizer.transform([cleaned])

    # predict() returns one label per input row; only one row was supplied
    label = model.predict(features)[0]

    if label == 1:
        return "Spam"
    return "Ham"

# Try the classifier on a hand-written example message
test_sms = "Congratulations! You've won a $1000 Walmart gift card. Click here to claim now."
verdict = predict_spam(test_sms)
print(verdict)