## Spam-Ham Classifier using BOW, TF-IDF, Word2Vec and Average Word2Vec

### Importing the Dataset

In [1]:
import pandas as pd
import numpy as np

messages = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\Machine Learning\NLP\SMSSpamCollection.txt",
                       sep='\t',names=["label", "message"])

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
messages.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
messages.shape

(5572, 2)

#### Displaying a record out of the dataset

In [5]:
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [6]:
messages['message'].loc[5122]

'NOT ENUFCREDEIT TOCALL.SHALL ILEAVE UNI AT 6 +GET A BUS TO YOR HOUSE?'

### Data Cleaning and Pre-processing
This step includes applying tokenization (already done in this data as all records are already in the form of sentences) , removing stopwords, stemming or lemmatization or both

#### Importing libraries for cleaning

In [7]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [9]:
corpus = []
for i in range(0,len(messages)):
    
    review = re.sub('[^a-zA-Z0-9]',' ', messages['message'][i]) # replaces all other characters except a-z,A-Z and 0-9 with ' '   
    review = review.lower()                                    
    review = review.split()
                    
    review = [stemmer.stem(word) for word in review             # does stemming on the words which are not stopwords
              if not word in stopwords.words('english')]      
    review = ' '.join(review)                                      
    corpus.append(review)

In [10]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

### Bag of Words
We import and use Count Vectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2500, binary = True, ngram_range = (2, 2))   # max_features is a hyperparameter which tells 
                                                                           # the model to take the top 'n' maximum features

In [12]:
X = cv.fit_transform(corpus).toarray()

In [13]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
X[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
X.shape

(5572, 2500)

In [16]:
y = pd.get_dummies(messages['label'])
y = y.iloc[:,1].values

In [17]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

#### Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

Selecting a model (Focus here is to learn about NLP and not about algorithms)

In [19]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

#### Prediction

In [20]:
y_pred = spam_detect_model.predict(X_test)

#### Finding accuracy report

In [21]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))

0.9721973094170404


In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       955
           1       1.00      0.81      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



### TF-IDF
We import and use TfidfVectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features = 2500, ngram_range = (1,2))

In [24]:
X = tf.fit_transform(corpus).toarray()

In [25]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Train-Test Split

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [27]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

#### Prediction and Accuracy Report

In [28]:
y_pred = spam_detect_model.predict(X_test)

In [29]:
print(accuracy_score(y_test, y_pred))

0.9811659192825112


In [30]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.87      0.93       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Word2Vec
We prefer lemmatization while working with Word2Vec as it gives us meaningful words which further helps generate useful vectors

In [31]:
!pip install gensim



In [32]:
from gensim.models import Word2Vec, KeyedVectors

In [33]:
import gensim.downloader as api

#### In order to use Google's Pre-Trained Model

In [34]:
wv = api.load('word2vec-google-news-300')

##### We obtain a '300' dimension vector for whatever word we want

In [35]:
wv['king']

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

#### Lemmatization

In [36]:
from nltk.stem import WordNetLemmatizer

In [37]:
lemma = WordNetLemmatizer()

In [38]:
corpus = []
for i in range(0,len(messages)):
    
    review = re.sub('[^a-zA-Z0-9]',' ', messages['message'][i]) # replaces all other characters except a-z,A-Z and 0-9 with ' '   
    review = review.lower()                                    
    review = review.split()
                    
    review = [lemma.lemmatize(word) for word in review          # does lemmatization on the words which are not stopwords
              if not word in stopwords.words('english')]      
    review = ' '.join(review)                                      
    corpus.append(review)

In [39]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply reply

In [40]:
len(corpus)

5572

#### To create our model from scratch

In [41]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess      # Converts a document into a list of lowercase tokens.
                                                # One can define min_len and max_len to ignore too short and too long tokens

In [42]:
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [43]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  're

In [44]:
len(words)

5565

In [45]:
unique = []
for i in words:
    if i not in unique:
        unique.append(i)

In [46]:
unique

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  're

In [47]:
len(unique)

5044

We have achieved the tokenized and lemmatized words, now we import gensim and start working on our model

In [48]:
import gensim

#### Training the model on 'words' with window size = '5' and minimum count = 2

In [49]:
model = Word2Vec(words, window = 5)

In [50]:
wv

<gensim.models.keyedvectors.KeyedVectors at 0x2273642b880>

In [51]:
model.wv.index_to_key           # the vocabulary

['call',
 'get',
 'ur',
 'gt',
 'go',
 'lt',
 'ok',
 'free',
 'day',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'love',
 'text',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'take',
 'mobile',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'dear',
 'make',
 'night',
 'message',
 'say',
 'well',
 'min',
 'thing',
 'much',
 'oh',
 'great',
 'hope',
 'claim',
 'hey',
 'number',
 'give',
 'happy',
 'wat',
 'work',
 'friend',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'tone',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'service',
 'uk',
 'thanks',
 'last',
 'care',
 'anything',
 'com',
 'would',
 'year',
 'also',
 'nokia',
 'lol',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'contact',
 'urgent',
 'sent',


In [52]:
model.corpus_count

5565

In [53]:
model.epochs

5

In [54]:
model.wv.similar_by_word('love')

[('keep', 0.9996082782745361),
 ('need', 0.999579906463623),
 ('day', 0.9995640516281128),
 ('much', 0.9995595812797546),
 ('amp', 0.9995548129081726),
 ('smile', 0.9995536208152771),
 ('life', 0.9995400309562683),
 ('make', 0.9995380640029907),
 ('always', 0.9995229244232178),
 ('thing', 0.9995189905166626)]

In [55]:
model.wv.similar_by_word('prize')

[('claim', 0.9991765022277832),
 ('call', 0.9990848898887634),
 ('guaranteed', 0.9990234971046448),
 ('cash', 0.9988906979560852),
 ('line', 0.9987654685974121),
 ('awarded', 0.9987616539001465),
 ('draw', 0.9987483620643616),
 ('service', 0.9986354112625122),
 ('mobile', 0.9985932111740112),
 ('contact', 0.9985644221305847)]

In [56]:
model.wv.similar_by_word('hope')

[('day', 0.9996525645256042),
 ('amp', 0.9996298551559448),
 ('like', 0.9996296167373657),
 ('way', 0.9996170401573181),
 ('give', 0.999613344669342),
 ('much', 0.9996010065078735),
 ('need', 0.9995979070663452),
 ('keep', 0.9995937943458557),
 ('make', 0.9995915293693542),
 ('thing', 0.9995819330215454)]

In [57]:
model.wv['kid'].shape               # shows that each word has 100 dimensions

(100,)

We define a function for Average Word2Vec. We find mean of all those words which are in a sentence - 'doc' but NOT IN the vocabulary

In [58]:
def Average_Word2Vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis = 0)

In [59]:
!pip install tqdm
from tqdm import tqdm



In [60]:
words[69]

['plane', 'give', 'month', 'end']

In [61]:
awv = []
for i in tqdm(range(len(words))):
    awv.append(Average_Word2Vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|███████████████████████████████████████████████████████████████████████████| 5565/5565 [00:00<00:00, 10008.00it/s]


In [62]:
awv

[array([-0.17805149,  0.2474637 ,  0.06979417,  0.03219411,  0.10955417,
        -0.3173299 , -0.00434382,  0.49624947, -0.21122207, -0.13916217,
        -0.15651804, -0.32356003,  0.06025432,  0.05587566,  0.05055177,
        -0.23589455,  0.00921403, -0.33378696, -0.00372371, -0.4756087 ,
         0.11205225,  0.15337282,  0.05904363, -0.15142266, -0.17196721,
        -0.02864236, -0.2194919 , -0.18742563, -0.25063476,  0.06379043,
         0.2204125 ,  0.01170792,  0.03599829, -0.06131874, -0.08938952,
         0.2246539 , -0.08168623, -0.20927149, -0.26404464, -0.45131397,
        -0.00292932, -0.23730698, -0.00116666,  0.09394991,  0.2030373 ,
        -0.12691796, -0.1893354 , -0.03271213,  0.12605077,  0.12215824,
         0.12705101, -0.16013843, -0.03117585, -0.01649953, -0.12142933,
         0.13778573,  0.06229898, -0.00543405, -0.33767435,  0.01990523,
         0.0627129 ,  0.07850332, -0.06250737, -0.05583971, -0.39186162,
         0.19167718,  0.11947274,  0.24083742, -0.3

In [63]:
len(awv)

5565

In [64]:
len(words)

5565

In [65]:
awv_new = np.array(awv)

  awv_new = np.array(awv)


In [66]:
awv_new[0]

array([-0.17805149,  0.2474637 ,  0.06979417,  0.03219411,  0.10955417,
       -0.3173299 , -0.00434382,  0.49624947, -0.21122207, -0.13916217,
       -0.15651804, -0.32356003,  0.06025432,  0.05587566,  0.05055177,
       -0.23589455,  0.00921403, -0.33378696, -0.00372371, -0.4756087 ,
        0.11205225,  0.15337282,  0.05904363, -0.15142266, -0.17196721,
       -0.02864236, -0.2194919 , -0.18742563, -0.25063476,  0.06379043,
        0.2204125 ,  0.01170792,  0.03599829, -0.06131874, -0.08938952,
        0.2246539 , -0.08168623, -0.20927149, -0.26404464, -0.45131397,
       -0.00292932, -0.23730698, -0.00116666,  0.09394991,  0.2030373 ,
       -0.12691796, -0.1893354 , -0.03271213,  0.12605077,  0.12215824,
        0.12705101, -0.16013843, -0.03117585, -0.01649953, -0.12142933,
        0.13778573,  0.06229898, -0.00543405, -0.33767435,  0.01990523,
        0.0627129 ,  0.07850332, -0.06250737, -0.05583971, -0.39186162,
        0.19167718,  0.11947274,  0.24083742, -0.32273534,  0.27

#### Retracing the blank sentences ignored while lemmatization as they included ALL stopwords

In [67]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', 'What you doing?how are you?'],
 [0, '', 'Where @'],
 [0, '', 'Can a not?'],
 [0, '', ':) '],
 [0, '', 'What you doing?how are you?'],
 [0, '', ':( but your not here....'],
 [0, '', ':-) :-)']]

#### Removing these empty rows from the 'y' variable intially

In [68]:
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y = pd.get_dummies(y['label'])
y = y.iloc[:,1].values

y.shape

(5565,)

#### Train - Test Split

In [69]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(awv_new,y,test_size=0.2,random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
accuracy_score(y_test,y_pred)