In [None]:
%pip install pandas scikit-learn nltk

In [8]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# ---- Step 1: fetch the SMS corpus and take a first look at it ----
print("Step 1: Loading and inspecting data...")
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
# header=None makes pandas read the first line as data, so the two
# column names are supplied explicitly.
column_names = ['label', 'message']
df = pd.read_csv(url, sep='\t', header=None, names=column_names)

print("Data loaded successfully. Here's the first 5 rows:")
preview = df.head()
print(preview)
print("\nData Info:")
df.info()

# Encode the target numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)
print("\nLabels mapped to 0 (ham) and 1 (spam).")


print("\nStep 2: Preprocessing text data...")

# Fetch the NLTK data this notebook depends on so it also runs on a fresh
# kernel/environment: 'stopwords' backs stopwords.words(), and
# 'punkt' / 'punkt_tab' back nltk.word_tokenize() (which one is needed depends
# on the installed NLTK version). Without these a LookupError is raised.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``stop_words``, are dropped;
    every remaining token is reduced with ``stemmer.stem``.

    Args:
        text: The raw message string.

    Returns:
        The surviving stems joined by single spaces.
    """
    kept_stems = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip anything containing non-letters as well as stop words.
        if not token.isalpha() or token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return " ".join(kept_stems)

# Clean every raw message and store the result alongside the original text.
processed_messages = df['message'].apply(preprocess_text)
df['processed_message'] = processed_messages
print("Text preprocessing complete. A new 'processed_message' column has been added.")


print("\nStep 3: Splitting data and creating features with TF-IDF...")
X = df['processed_message']
y = df['label']

# Split the text BEFORE vectorizing: fitting TF-IDF on the full corpus would let
# the held-out messages shape the vocabulary and IDF weights (data leakage),
# which biases the evaluation in Step 5.
# stratify=y keeps the ham/spam ratio the same in the train and test partitions.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training messages only ...
X_train = vectorizer.fit_transform(X_train_text)
# ... and reuse that fitted vocabulary for the held-out messages.
X_test = vectorizer.transform(X_test_text)
print(f"Training data has been converted into a TF-IDF matrix of shape: {X_train.shape}")
print(f"Test data has been converted into a TF-IDF matrix of shape: {X_test.shape}")


print("\nStep 4: Training the Naive Bayes model...")
model = MultinomialNB()
model.fit(X_train, y_train)
print("Model training complete.")


print("\nStep 5: Evaluating the model performance...")
# Score the held-out messages with the trained classifier.
y_pred = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report each scalar metric with four decimal places.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_name}: {metric_value:.4f}")

print("\nConfusion Matrix:", conf_matrix, sep="\n")


print("\nStep 6: Testing the model with new, unseen messages...")

def predict_message(message):
    """Classify a single raw message as spam or ham.

    The message is cleaned with ``preprocess_text``, vectorized with the
    already-fitted ``vectorizer`` (transform only, no refit) and passed to
    ``model.predict``.

    Args:
        message: The raw message text.

    Returns:
        "Spam" when the predicted label equals 1, otherwise "Ham (Not Spam)".
    """
    features = vectorizer.transform([preprocess_text(message)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Ham (Not Spam)"

# Two hand-written examples: one expected to look like spam, one like ham.
spam_message = "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/claim-your-prize to claim now."
ham_message = "Hey, are we still on for the meeting tomorrow at 2 PM?"

# Echo each message followed by the classifier's verdict.
for sample_message in (spam_message, ham_message):
    print(f"\nMessage: '{sample_message}'")
    print(f"Prediction: {predict_message(sample_message)}")

Step 1: Loading and inspecting data...
Data loaded successfully. Here's the first 5 rows:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB

Labels mapped to 0 (ham) and 1 (spam).

Step 2: Preprocessing text data...
Text preprocessing complete. A new 'processed_message' column has been added.

Step 3: Creating features with TF-IDF...
Data has been converted into a TF-IDF matrix of shape

In [2]:
# Preview the first rows of df (shows the processed_message column once the preprocessing cell has run).
df.head()

Unnamed: 0,label,message,processed_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt may ...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [3]:
# Summarise column dtypes and non-null counts of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   label              5572 non-null   int64 
 1   message            5572 non-null   object
 2   processed_message  5572 non-null   object
dtypes: int64(1), object(2)
memory usage: 130.7+ KB
