In [2]:
import pandas as pd
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])

In [3]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
messages.shape

(5572, 2)

Text Preprocessing --> Tokenization, Stopwords, Stemming, Lemmatization, NLTR


Text->vectors --> BOW, TF-IDF, Word2Vec, AvgWord2Vec

In [11]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kuki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #used for stemming purpose
ps = PorterStemmer()

In [21]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [53]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply reply

In [24]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary= True, ngram_range = (2,2))
X = cv.fit_transform(corpus).toarray()

In [25]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [29]:
#converting into T/F
y=pd.get_dummies(messages['label'])
y = y.iloc[:,1].values

In [30]:
y

array([False, False,  True, ..., False, False, False])

In [31]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [32]:
from sklearn.naive_bayes import MultinomialNB 
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [33]:

#prediction
y_pred=spam_detect_model.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)

0.9721973094170404


In [35]:

from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       986
        True       0.81      1.00      0.89       129

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [37]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
X = tv.fit_transform(corpus).toarray()

In [38]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [39]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [40]:
#prediction
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)


0.9811659192825112


In [41]:

from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       976
        True       0.87      1.00      0.93       139

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [42]:
from sklearn.ensemble import RandomForestClassifier
classifier= RandomForestClassifier()
classifier.fit(X_train, y_train)

In [44]:
#prediction
y_pred=classifier.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)

0.9820627802690582


In [45]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       973
        True       0.88      0.99      0.93       142

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115




Word2vec Implementation 

In [48]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [50]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [54]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [56]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess #convert document into list of lowercase tokens

In [99]:
words=[]
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [100]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  're

In [101]:
import gensim

In [102]:
model=gensim.models.Word2Vec(words, window=5, min_count=2)

In [103]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'go',
 'lt',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'make',
 'dear',
 'night',
 'message',
 'well',
 'say',
 'min',
 'thing',
 'much',
 'great',
 'claim',
 'hope',
 'oh',
 'hey',
 'give',
 'number',
 'happy',
 'friend',
 'work',
 'wat',
 'way',
 'yes',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'said',
 'win',
 'amp',
 'life',
 'cash',
 'yeah',
 'im',
 'tone',
 'really',
 'babe',
 'meet',
 'find',
 'miss',
 'morning',
 'uk',
 'last',
 'service',
 'thanks',
 'care',
 'com',
 'would',
 'anything',
 'year',
 'also',
 'lol',
 'nokia',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'urgent',
 'contact',
 'sent',


In [104]:
model.corpus_count

5565

In [105]:
model.epochs

5

In [106]:
model.wv.similar_by_word('prize')

[('claim', 0.9994116425514221),
 ('call', 0.9993017911911011),
 ('cash', 0.9991666078567505),
 ('line', 0.9991281628608704),
 ('show', 0.999057412147522),
 ('contact', 0.9990460872650146),
 ('number', 0.9990404844284058),
 ('guaranteed', 0.9990313649177551),
 ('service', 0.9990031123161316),
 ('urgent', 0.9989963173866272)]

Avg_word2vec

In [107]:
import numpy as np

In [108]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)
        

In [109]:
!pip install tqdm



In [110]:
from tqdm import tqdm
words[100]

['please', 'text', 'anymore', 'nothing', 'else', 'say']

In [111]:
type(model.wv.index_to_key)
list

list

In [112]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|█████████████████████████████████████| 5565/5565 [00:01<00:00, 3471.53it/s]


In [113]:
type(X)

list

In [123]:
words[1]

['ok', 'lar', 'joking', 'wif', 'oni']

In [125]:
X #input feature
y #output features

array([False, False,  True, ..., False, False, False])