In [1]:
import pandas as pd

# The file is not valid UTF-8, so it is decoded as latin1
df = pd.read_csv('spam.csv', encoding='latin1')

# Quick overview: size, a few sample rows, label balance, and column names
print("Dataset Shape:", df.shape)
print("First few examples:", df.head(), sep="\n")
print("Class distribution:", df['v1'].value_counts(), sep="\n")
print("Columns in dataset:", df.columns.tolist(), sep="\n")

Dataset Shape: (5572, 5)
First few examples:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Class distribution:
v1
ham     4825
spam     747
Name: count, dtype: int64
Columns in dataset:
['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']


In [3]:
# Cleaning the DataFrame
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Retain just the label and text columns and give them descriptive names.
# rename() returns a new frame, so the raw `df` is left untouched.
spam_df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

print('After cleaning, dataset shape:', spam_df.shape)
print(spam_df.head())

# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# ham/spam proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    spam_df['message'],
    spam_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=spam_df['label'],
)

print('Train set size:', len(X_train))
print('Test set size:', len(X_test))

# Fit the TF-IDF vocabulary on the training messages only, then reuse the
# fitted vectorizer to transform the test messages.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print('TF-IDF vectorization complete.')

# Multinomial Naive Bayes on the TF-IDF features
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

print('Model training complete.')

# Score the held-out messages
y_pred = clf.predict(X_test_tfidf)
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

print('done')

After cleaning, dataset shape: (5572, 2)
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Train set size: 4457
Test set size: 1115
TF-IDF vectorization complete.
Model training complete.
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
[[966   0]
 [ 35 114]]
done


In [5]:
# Import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# We'll use NLTK for tokenizing and stemming
import nltk
# nltk.word_tokenize needs the Punkt sentence models. Older NLTK releases look
# them up as 'punkt'; newer releases (3.9+) look them up as 'punkt_tab' and
# raise LookupError if only 'punkt' is installed. Fetch both so the notebook
# runs on either. quiet=True keeps the download log (which contains the local
# nltk_data path) out of the saved cell output.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.stem import PorterStemmer

# One shared stemmer: clean_text is applied to every row, so building a new
# PorterStemmer per call would repeat the same setup work for each message.
_STEMMER = PorterStemmer()


def clean_text(text):
    """Normalize one message: lowercase, keep letters only, tokenize, stem.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example the NaN that
        pandas stores for an empty cell) is treated as an empty message.

    Returns
    -------
    str
        The Porter stems of the message's tokens, joined by single spaces.
        Empty string when there is nothing left after cleaning.
    """
    if not isinstance(text, str):
        # .lower() would raise on NaN/None; an empty document is the safe result
        return ''
    # Lowercase first so the a-z class below keeps letters that were uppercase;
    # everything that is not a letter or whitespace becomes a space.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    tokens = nltk.word_tokenize(letters_only)
    return ' '.join(_STEMMER.stem(token) for token in tokens)

# Add a normalized (lowercased, letters-only, stemmed) version of each message
spam_df['clean_message'] = spam_df['message'].apply(clean_text)

print('Cleaned sample messages:')
print(spam_df[['message', 'clean_message']].head())

# Same 80/20 stratified split and seed as the baseline, but on the cleaned text
X_train, X_test, y_train, y_test = train_test_split(
    spam_df['clean_message'],
    spam_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=spam_df['label'],
)

# TF-IDF features fitted on the cleaned training messages only.
# NOTE(review): the stop-word filter runs after stemming, so stemmed forms of
# stop words (e.g. 'onli' from 'only') are presumably not removed - confirm
# whether stop words should be dropped before stemming instead.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Refit Multinomial Naive Bayes on the new features
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Score the held-out messages
y_pred = clf.predict(X_test_tfidf)

print('Classification Report:', classification_report(y_test, y_pred), sep='\n')
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

print('done')

[nltk_data] Downloading package punkt to C:\Users\Rahul
[nltk_data]     Chauhan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Cleaned sample messages:
                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                       clean_message  
0  go until jurong point crazi avail onli in bugi...  
1                              ok lar joke wif u oni  
2  free entri in a wkli comp to win fa cup final ...  
3        u dun say so earli hor u c alreadi then say  
4  nah i don t think he goe to usf he live around...  
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       0.99      0.75      0.85       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg 