In [8]:
# Importing essential libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Loading the dataset
df = pd.read_csv('/content/drive/MyDrive/Spam SMS Classification /spam.csv',encoding = 'latin-1') 

In [10]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [11]:
#Dropping the unnecessary columns

df = df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [12]:
df.rename(columns = {'v1' : 'label', 'v2':'message'}, inplace = True)

In [13]:
# Mapping values for label
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [14]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


**Word2vec Implementation**

In [15]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [16]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [17]:
corpus = []

for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [18]:
corpus[0:3]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply']

In [19]:
len(corpus)

5572

In [20]:
!pip install gensim==4.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.2.0
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 2.1 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [21]:
import gensim

In [22]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [23]:
gensim.__version__

'4.2.0'

In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [25]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [26]:
words[:5]

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though']]

In [27]:
len(words)

5564

In [28]:
### Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words,window=5,min_count=2)

In [29]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'home',
 'stop',
 'lor',
 'sorry',
 'see',
 'mobile',
 'still',
 'take',
 'back',
 'da',
 'reply',
 'think',
 'dont',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'pls',
 'please',
 'co',
 'msg',
 'min',
 'make',
 'dear',
 'night',
 'message',
 'say',
 'well',
 'thing',
 'much',
 'claim',
 'great',
 'oh',
 'hope',
 'hey',
 'number',
 'wat',
 'friend',
 'happy',
 'work',
 'give',
 'way',
 'yes',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'said',
 'cash',
 'win',
 'yeah',
 'really',
 'amp',
 'life',
 'babe',
 'im',
 'meet',
 'find',
 'morning',
 'uk',
 'last',
 'miss',
 'thanks',
 'year',
 'service',
 'would',
 'com',
 'also',
 'anything',
 'nokia',
 'lol',
 'care',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'contact',
 'urgent',
 'sent',


In [30]:
model.corpus_count

5564

In [31]:
model.epochs

5

In [32]:
model.wv['happy']

array([-0.2713956 ,  0.37075973,  0.0777479 , -0.04176151,  0.06369221,
       -0.65240175,  0.17004082,  0.851472  , -0.33863044, -0.20195194,
       -0.08830331, -0.7275222 ,  0.02598781,  0.13217232,  0.01989976,
       -0.3589851 ,  0.08357856, -0.5146917 , -0.07616074, -0.77935994,
        0.23344304,  0.17939019,  0.18740985, -0.19669293, -0.09294482,
        0.07704623, -0.26087353, -0.34317788, -0.3959698 ,  0.10268014,
        0.43425098,  0.1809799 ,  0.06065208, -0.32193708, -0.21377519,
        0.46145853, -0.07536908, -0.40328816, -0.19523354, -0.6660124 ,
       -0.00635151, -0.4423179 , -0.03175237, -0.00220347,  0.36297065,
       -0.08566655, -0.21293646,  0.01934019,  0.21758614,  0.27971673,
        0.28838566, -0.37464115, -0.14489618, -0.05122625, -0.29655004,
        0.26109013,  0.257742  , -0.04463259, -0.47127527,  0.17396301,
        0.14960594,  0.07193516, -0.15318349, -0.03609819, -0.5039696 ,
        0.31984326,  0.28289917,  0.28357828, -0.52504206,  0.54

In [33]:
model.wv.similar_by_word('happy')

[('day', 0.9995060563087463),
 ('dear', 0.9994257092475891),
 ('hope', 0.9993958473205566),
 ('year', 0.999374270439148),
 ('new', 0.9993714094161987),
 ('life', 0.9993177652359009),
 ('well', 0.9993089437484741),
 ('love', 0.99930739402771),
 ('get', 0.9993012547492981),
 ('great', 0.9992979764938354)]

In [34]:
model.wv['happy'].shape

(100,)

In [35]:
res = np.mean([model.wv[word] for word in words[3] if word in model.wv.index_to_key],axis=0)

In [36]:
res

array([-0.20295642,  0.29536685,  0.06252173, -0.03695216,  0.03485707,
       -0.49734306,  0.13774483,  0.6576825 , -0.25406206, -0.13621952,
       -0.07359572, -0.54961044,  0.02115541,  0.0983358 , -0.00873029,
       -0.27251667,  0.06753393, -0.40919042, -0.05192734, -0.61787343,
        0.1873209 ,  0.14304511,  0.16152559, -0.14778782, -0.08479015,
        0.06905388, -0.19894056, -0.25778115, -0.2895775 ,  0.07685545,
        0.3516921 ,  0.12751076,  0.05990427, -0.24140133, -0.15476094,
        0.3603778 , -0.04192321, -0.30475807, -0.13931711, -0.51496655,
       -0.0083744 , -0.3434653 , -0.0342137 ,  0.00739776,  0.29083696,
       -0.06926309, -0.17147155,  0.01208111,  0.17118402,  0.20491101,
        0.21721633, -0.27785662, -0.12071434, -0.03545009, -0.20526344,
        0.21085973,  0.20780756, -0.03845368, -0.36420253,  0.11369876,
        0.10937617,  0.05864052, -0.11132834, -0.01580675, -0.40601218,
        0.23407525,  0.23132034,  0.21471743, -0.41889572,  0.42

In [37]:
res.dtype

dtype('float32')

In [38]:
type(res)

numpy.ndarray

In [39]:
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [40]:
type(model.wv.index_to_key)

list

In [41]:
words[3]

['dun', 'say', 'early', 'hor', 'already', 'say']

In [42]:
#apply for the entire sentences
X=[]
for i in range(len(words)):
    X.append(avg_word2vec(words[i]))


In [43]:
X[3]

array([-0.20295642,  0.29536685,  0.06252173, -0.03695216,  0.03485707,
       -0.49734306,  0.13774483,  0.6576825 , -0.25406206, -0.13621952,
       -0.07359572, -0.54961044,  0.02115541,  0.0983358 , -0.00873029,
       -0.27251667,  0.06753393, -0.40919042, -0.05192734, -0.61787343,
        0.1873209 ,  0.14304511,  0.16152559, -0.14778782, -0.08479015,
        0.06905388, -0.19894056, -0.25778115, -0.2895775 ,  0.07685545,
        0.3516921 ,  0.12751076,  0.05990427, -0.24140133, -0.15476094,
        0.3603778 , -0.04192321, -0.30475807, -0.13931711, -0.51496655,
       -0.0083744 , -0.3434653 , -0.0342137 ,  0.00739776,  0.29083696,
       -0.06926309, -0.17147155,  0.01208111,  0.17118402,  0.20491101,
        0.21721633, -0.27785662, -0.12071434, -0.03545009, -0.20526344,
        0.21085973,  0.20780756, -0.03845368, -0.36420253,  0.11369876,
        0.10937617,  0.05864052, -0.11132834, -0.01580675, -0.40601218,
        0.23407525,  0.23132034,  0.21471743, -0.41889572,  0.42

In [44]:
type(X)

list

In [45]:
len(X)

5564

In [46]:
X_new = np.array(X)

In [48]:
words[3]

['dun', 'say', 'early', 'hor', 'already', 'say']

In [49]:
X_new[3]

array([-0.20295642,  0.29536685,  0.06252173, -0.03695216,  0.03485707,
       -0.49734306,  0.13774483,  0.6576825 , -0.25406206, -0.13621952,
       -0.07359572, -0.54961044,  0.02115541,  0.0983358 , -0.00873029,
       -0.27251667,  0.06753393, -0.40919042, -0.05192734, -0.61787343,
        0.1873209 ,  0.14304511,  0.16152559, -0.14778782, -0.08479015,
        0.06905388, -0.19894056, -0.25778115, -0.2895775 ,  0.07685545,
        0.3516921 ,  0.12751076,  0.05990427, -0.24140133, -0.15476094,
        0.3603778 , -0.04192321, -0.30475807, -0.13931711, -0.51496655,
       -0.0083744 , -0.3434653 , -0.0342137 ,  0.00739776,  0.29083696,
       -0.06926309, -0.17147155,  0.01208111,  0.17118402,  0.20491101,
        0.21721633, -0.27785662, -0.12071434, -0.03545009, -0.20526344,
        0.21085973,  0.20780756, -0.03845368, -0.36420253,  0.11369876,
        0.10937617,  0.05864052, -0.11132834, -0.01580675, -0.40601218,
        0.23407525,  0.23132034,  0.21471743, -0.41889572,  0.42

In [50]:
corpus[:3]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply']

In [51]:
y = df[list(map(lambda x: len(x)>0, corpus))]
y.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [52]:
y = y['label']
y.head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64

In [53]:
y.shape

(5564,)

In [54]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.20, random_state = 0)