# Project

In [2]:
# importing the Dataset

import pandas as pd

messages = pd.read_csv('smsspamcollection/SMSSpamCollection' , sep = '\t', names = ["labels","messages"])

In [3]:
messages

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Data Cleaning and Preprocessing

import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [6]:
corpus = []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ',messages['messages'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [7]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [8]:
# creating bag of words model

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,binary=True,ngram_range=(2,3)) # here max features is used to take top 2500 features
X = cv.fit_transform(corpus).toarray()

In [9]:
y = pd.get_dummies(messages['labels'])
y = y.iloc[:,1].values

In [10]:
# Train Test splitting data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30,random_state=0)

In [11]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [12]:
#prediction
y_pred = spam_detect_model.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score, classification_report

In [14]:
score = accuracy_score(y_test,y_pred)
print(score)

0.9647129186602871


In [15]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1510
           1       0.73      1.00      0.85       162

    accuracy                           0.96      1672
   macro avg       0.87      0.98      0.91      1672
weighted avg       0.97      0.96      0.97      1672



In [16]:
# creating the tfidf model

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X = tv.fit_transform(corpus).toarray()

In [17]:
# Train Test splitting data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.20,random_state=0)

In [18]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [19]:
#prediction
y_pred = spam_detect_model.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, classification_report
score = accuracy_score(y_test,y_pred)
print(score)

0.9811659192825112


In [21]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       976
           1       0.87      1.00      0.93       139

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Word2Vec Implementation

In [23]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [24]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [26]:
corpus = []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]', ' ',messages['messages'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [27]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [29]:
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent)) # simple preprocess is used to clean tokenize data

In [30]:
import gensim

# lets train word2vec from scratch
model = gensim.models.Word2Vec(words,window=5,min_count=2)

In [31]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [32]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    # sent = [word for word in doc if word in model.wv.index_to_key]
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [35]:
from tqdm import tqdm
import numpy as np

In [36]:
# apply for the entire sentences

X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|████████████████████████████████████████████████████████████████████████████| 5564/5564 [00:01<00:00, 3436.38it/s]


In [40]:
X_new = X

In [42]:
X_new[0].shape

(100,)

In [43]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [44]:
X_new

[array([-0.11388449,  0.27379933,  0.10330117, -0.0584742 ,  0.05991709,
        -0.34673193,  0.07382303,  0.51433057, -0.17720318, -0.14508142,
        -0.14448054, -0.33579156, -0.02763964,  0.08804157,  0.07766879,
        -0.21878928,  0.00804675, -0.3207868 ,  0.03803048, -0.45294663,
         0.09575511,  0.0993787 ,  0.1537159 , -0.09826527, -0.0797899 ,
         0.02150709, -0.17432469, -0.1741262 , -0.20361662,  0.06289508,
         0.2955226 ,  0.06303602,  0.1014559 , -0.23637645, -0.12467758,
         0.30159083, -0.01497607, -0.2188887 , -0.16772075, -0.4542265 ,
         0.06622564, -0.24955867, -0.06051652,  0.05943057,  0.26214615,
        -0.11801516, -0.21071072,  0.02650736,  0.12394618,  0.19678684,
         0.18592761, -0.25342894, -0.07007357,  0.006114  , -0.15621702,
         0.19706623,  0.17151348, -0.0130863 , -0.2801714 ,  0.05451986,
         0.09925985,  0.10495058, -0.10881805,  0.0254051 , -0.30219012,
         0.20164944,  0.08558065,  0.16720398, -0.2