### Bag of Words (BOW) implementation using NLTK and scikit-learn

In [1]:
import pandas as pd

# Load the spam/ham message dataset; the CSV is decoded as latin1.
df = pd.read_csv(
    "spam.csv",
    encoding="latin1",
)

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# Inspect the column labels of the raw frame before renaming/dropping.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [3]:
# Rename 'v1' to 'label' and 'v2' to 'Message', then drop the three
# 'Unnamed: *' columns (they appear empty in the preview above).
#
# The result is rebound to `df` instead of mutating in place, and
# errors='ignore' lets the drop skip columns that are already gone, so this
# cell can be re-run without raising KeyError.
df = (
    df.rename(columns={'v1': 'label', 'v2': 'Message'})
      .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
)

# Display the updated DataFrame
print(df.head())

  label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Preview the frame again to confirm only 'label' and 'Message' remain.
df.head()

Unnamed: 0,label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Summarize row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Data Cleaning and Preprocessing

In [6]:
# Standard library
import re

# NLTK: stop-word list plus the lemmatizer/stemmer classes.
# NOTE: the NLTK 'stopwords' and 'wordnet' corpora must be available locally
# (e.g. fetched once with nltk.download) for the later cells to run.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Word normalizers: a WordNet lemmatizer, an English Snowball stemmer and a
# Porter stemmer.
lz = WordNetLemmatizer()
sb = SnowballStemmer('english')
pt = PorterStemmer()

In [7]:
# Build the cleaned corpus: one normalized string per message.
#
# The stop-word set is built once, outside the loop, so each membership test
# is a set lookup instead of re-evaluating stopwords.words('english') for
# every token.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the messages directly rather than indexing by label
# (df['Message'][i]), so the loop does not depend on the index being 0..n-1.
for message in df['Message']:
    # Replace every non-letter character with a space, lowercase, tokenize.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words and lemmatize the remaining tokens.
    lemmas = [lz.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(lemmas))

In [8]:
# Display the cleaned corpus (one preprocessed string per message).
# NOTE(review): this shows the entire list; consider a slice such as
# corpus[:10] to keep the notebook output short.
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

### Create the Bag of Words (BOW) Model

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words vectorizer: binary=True clips every non-zero count to 1,
# and max_features=100 caps the vocabulary at 100 terms.
cv = CountVectorizer(
    binary=True,
    max_features=100,
)

In [10]:
# Fit the vectorizer on the cleaned corpus so it learns its vocabulary
# (capped by the max_features value passed when `cv` was created).
cv.fit(corpus)

In [11]:
# Inspect the learned vocabulary: a dict mapping each term to its column
# index in the bag-of-words matrix.
cv.vocabulary_  # Vocabulary as a dictionary

{'go': 23,
 'great': 27,
 'got': 26,
 'wat': 90,
 'ok': 59,
 'free': 19,
 'win': 94,
 'text': 79,
 'txt': 86,
 'say': 70,
 'already': 0,
 'think': 82,
 'life': 39,
 'hey': 30,
 'week': 92,
 'back': 5,
 'like': 40,
 'still': 75,
 'send': 72,
 'friend': 20,
 'prize': 65,
 'claim': 9,
 'call': 6,
 'mobile': 50,
 'co': 10,
 'home': 32,
 'want': 89,
 'today': 84,
 'cash': 8,
 'day': 15,
 'reply': 67,
 'www': 96,
 'right': 68,
 'take': 77,
 'time': 83,
 'message': 47,
 'com': 11,
 'oh': 58,
 'yes': 99,
 'make': 45,
 'way': 91,
 'dont': 17,
 'miss': 49,
 'ur': 88,
 'going': 24,
 'da': 14,
 'lor': 42,
 'meet': 46,
 'really': 66,
 'know': 35,
 'lol': 41,
 'love': 43,
 'let': 38,
 'work': 95,
 'yeah': 97,
 'tell': 78,
 'anything': 2,
 'thanks': 80,
 'uk': 87,
 'please': 63,
 'msg': 52,
 'see': 71,
 'pls': 64,
 'need': 54,
 'tomorrow': 85,
 'hope': 33,
 'well': 93,
 'lt': 44,
 'gt': 28,
 'get': 21,
 'ask': 3,
 'morning': 51,
 'happy': 29,
 'sorry': 74,
 'give': 22,
 'new': 55,
 'find': 18,
 'year

In [12]:
# Vectorize the corpus and materialize the result as a dense NumPy array.
# Note: fit_transform fits `cv` again even though it was already fitted by the
# earlier cv.fit(corpus) call.
X = cv.fit_transform(corpus).toarray()

In [13]:
# Check the dimensions of the bag-of-words array (rows, features).
X.shape

(5572, 100)

In [14]:
# Display the bag-of-words array (NumPy abbreviates large arrays in its repr).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### N-grams

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Trigram-only vectorizer: ngram_range=(3, 3) keeps only 3-word sequences,
# binary=True gives a binary BOW, and max_features=500 caps the vocabulary at
# 500 trigrams.
# Note: this rebinds `cv` and `X`, replacing the single-word vectorizer and
# matrix created earlier in the notebook.
cv = CountVectorizer(
    ngram_range=(3, 3),
    binary=True,
    max_features=500,
)
X = cv.fit_transform(corpus).toarray()

In [16]:
# Inspect the vocabulary learned by the current `cv`: a dict mapping each
# n-gram to its column index.
# NOTE(review): this prints the full dictionary; consider showing only a few
# entries to keep the output short.
cv.vocabulary_

{'free entry wkly': 142,
 'std txt rate': 391,
 'txt rate apply': 435,
 'set callertune caller': 367,
 'callertune caller press': 51,
 'caller press copy': 49,
 'press copy friend': 332,
 'copy friend callertune': 93,
 'call claim code': 34,
 'entitled update latest': 121,
 'update latest colour': 441,
 'free call mobile': 137,
 'call mobile update': 42,
 'mobile update co': 266,
 'update co free': 440,
 'chance win cash': 62,
 'reply hl info': 346,
 'lt gt inch': 228,
 'like lt gt': 214,
 'bx ip pm': 32,
 'sorry call later': 384,
 'call later meeting': 40,
 'please call customer': 317,
 'call customer service': 35,
 'customer service representative': 102,
 'service representative pm': 366,
 'pm guaranteed cash': 323,
 'guaranteed cash prize': 168,
 'trying contact last': 432,
 'contact last weekend': 86,
 'last weekend draw': 209,
 'weekend draw show': 470,
 'draw show prize': 114,
 'show prize guaranteed': 370,
 'prize guaranteed call': 337,
 'guaranteed call claim': 166,
 'claim cod