In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.1-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.1-cp310-cp310-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB 1.4 MB/s eta 0:00:09
   - -------------------------------------- 0.4/11.6 MB 6.9 MB/s eta 0:00:02
   --- ------------------------------------ 1.0/11.6 MB 8.9 MB/s eta 0:00:02
   ----- ---------------------------------- 1.7/11.6 MB 10.8 MB/s eta 0:00:01
   ------- -------------------------------- 2.2/11.6 MB 11.5 MB/s eta 0:00:01
   -------- ------------------------------- 2.5/11.6 MB 10.0 MB/s eta 0:00:01
   --------- ------------------------------ 2.8/11.6 MB 9.8 MB/s eta 0:00:01
   ---------- --------------

In [3]:
import pandas as pd

In [11]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names = ['label', 'message'])

In [12]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [13]:
messages.shape

(5572, 2)

In [31]:
messages['message'].loc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [21]:
#data clearning and preprocessing

In [22]:
import re
import nltk

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
from nltk.corpus import stopwords

In [24]:
from nltk.stem.porter import PorterStemmer

In [25]:
ps = PorterStemmer()

In [28]:
corpus=[]
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
 

In [29]:
corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [43]:
#creating the bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [44]:
X[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [38]:
y = pd.get_dummies(messages['label'], drop_first=True, dtype=int)

In [39]:
y

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0


In [45]:
from sklearn.model_selection import train_test_split

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [47]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [48]:
#prediction
y_pred = spam_detect_model.predict(X_test)

In [49]:
from sklearn.metrics import accuracy_score, classification_report,

In [50]:
score = accuracy_score(y_test, y_pred)
print(score)

0.9748878923766816


In [51]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       0.99      0.82      0.90       149

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [52]:
#creating TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [55]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [56]:
y_pred = spam_detect_model.predict(X_test)

In [57]:
score = accuracy_score(y_test, y_pred)
print(score)

0.9811659192825112


In [58]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [59]:
# word to vec implementation

In [60]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [61]:
corpus=[]
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
 

In [62]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [63]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [64]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [65]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [66]:
import gensim

In [67]:
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [68]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [69]:
model.corpus_count

5564

In [70]:
model.epochs

5

In [75]:
model.wv.similar_by_word('happy')

[('year', 0.999488353729248),
 ('day', 0.9994272589683533),
 ('love', 0.9993733167648315),
 ('make', 0.9993678331375122),
 ('amp', 0.9993649125099182),
 ('hello', 0.9993640780448914),
 ('find', 0.9993415474891663),
 ('money', 0.9993410706520081),
 ('dont', 0.9993320107460022),
 ('much', 0.9993245601654053)]

In [76]:
import numpy as np

In [77]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [78]:
from tqdm import tqdm

In [80]:
words[73]

['performed']

In [81]:
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|████████████████████████████████████████████████████████████████████████████| 5564/5564 [00:00<00:00, 6265.59it/s]


In [82]:
type(X)

list

In [85]:
X[0].shape

(100,)

In [86]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [96]:
X[1]

array([-6.07691817e-02,  1.87430039e-01,  9.37701464e-02,  1.69532672e-02,
        3.90927009e-02, -2.14942902e-01,  5.28601333e-02,  3.52325410e-01,
       -1.11518934e-01, -9.25379023e-02, -1.03571773e-01, -2.41017371e-01,
       -2.66625639e-02,  8.82903188e-02,  5.30855656e-02, -1.81096107e-01,
        1.58903003e-02, -2.30178684e-01, -5.65655250e-03, -2.90191650e-01,
        5.99237867e-02,  1.02802709e-01,  5.47424853e-02, -7.77570382e-02,
       -6.42583072e-02,  8.01113807e-03, -1.35543913e-01, -1.01066984e-01,
       -1.64461091e-01,  2.34118756e-02,  2.04893306e-01,  2.68696044e-02,
        8.96503031e-02, -1.77539200e-01, -8.38989168e-02,  1.91607609e-01,
        1.10766171e-02, -1.54776365e-01, -1.06064439e-01, -2.85162657e-01,
        3.03644389e-02, -1.54783756e-01, -4.48190942e-02,  4.57230210e-02,
        1.64559439e-01, -9.11879763e-02, -1.31567150e-01,  7.87652098e-03,
        8.35409388e-02,  1.59232736e-01,  1.04237281e-01, -1.88382894e-01,
       -4.12345603e-02, -