In [4]:
#Loading the Dataset

import pandas as pd
df = pd.read_csv("data/SMSSpamCollection", sep='\t', names=["label", "text"])
df.head()
print("Dataset shape", df.shape)
print(df['label'].value_counts())

Dataset shape (5572, 2)
label
ham     4825
spam     747
Name: count, dtype: int64


In [14]:
#Step 2: Basic Text Cleaning
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '',text)
    text = text.translate(str.maketrans('','',string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text']= df['text'].apply(clean_text)
df.head()


Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [16]:
#Step 3: Tokenization & Stopword Removal
import  nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def tok(text):
    tokens = word.tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Purnima\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Purnima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
#Step 4: Feature Extraction with BoW & TF-IDF
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['clean_text'])
print("Shape of Bow matrix:", X_bow.shape)
print("First 20 features:", bow_vectorizer.get_feature_names_out()[:20])
print( )
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])
print("Shape of tf-idf matrix:", X_tfidf.shape)
print("First 20 features:", tfidf_vectorizer.get_feature_names_out()[:20])


Shape of Bow matrix: (5572, 8608)
First 20 features: ['aa' 'aah' 'aaniye' 'aaooooright' 'aathilove' 'aathiwhere' 'ab' 'abbey'
 'abdomen' 'abeg' 'abelu' 'aberdeen' 'abi' 'ability' 'abiola' 'abj' 'able'
 'abnormally' 'about' 'aboutas']
Shape of tf-idf matrix: (5572, 8608)
First 20 features: ['aa' 'aah' 'aaniye' 'aaooooright' 'aathilove' 'aathiwhere' 'ab' 'abbey'
 'abdomen' 'abeg' 'abelu' 'aberdeen' 'abi' 'ability' 'abiola' 'abj' 'able'
 'abnormally' 'about' 'aboutas']
