In [None]:
# Importing data
import pandas as pd

data = pd.read_csv('SMSSpamCollection',sep='\t', names=["label","message"])

In [None]:
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
#Data Cleaning and Preprocessing

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ps = PorterStemmer()

In [None]:
#regular expression

import re

corpus =[]

for i in range(0,len(data)):
  rev = re.sub('[^a-zA-Z0-9]',' ',data['message'][i])
  rev = rev.lower()
  rev = rev.split()
  rev = [ps.stem(word) for word in rev if word not in set(stopwords.words('english')) ]
  rev = " ".join(rev)
  corpus.append(rev)

In [None]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

# Create Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000, binary=True, ngram_range=(1,2))

In [None]:
X = cv.fit_transform(corpus).toarray()

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y = pd.get_dummies(data['label'], dtype=int)
y = y.iloc[:,1].values

In [None]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
X_train,y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([0, 0, 0, ..., 1, 0, 0]))

In [None]:
from sklearn.naive_bayes import MultinomialNB
spam_class_model = MultinomialNB()

In [None]:
spam_class_model = spam_class_model.fit(X_train, y_train)

In [None]:
y_pred = spam_class_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [None]:
print(score), print(report)

0.9865470852017937
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       955
           1       0.97      0.93      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



(None, None)

In [None]:
#TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features=3000, binary=True, ngram_range=(1,2))

In [None]:
X = tf.fit_transform(corpus).toarray()

In [None]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
from sklearn.naive_bayes import MultinomialNB
spam_class_model = MultinomialNB()

In [None]:
spam_class_model = spam_class_model.fit(X_train, y_train)

In [None]:
y_pred = spam_class_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [None]:
print(score), print(report)

0.9838565022421525
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.98      0.98      0.98      1115



(None, None)

### Word2Vec

In [None]:
!pip install gensim



In [None]:
#simply doing with pretrained model

import gensim.downloader as api

mod = api.load('word2vec-google-news-300')

In [None]:
vec_jump = mod['jump']
vec_jump

array([ 0.05200195,  0.03613281, -0.10351562,  0.12890625, -0.00424194,
       -0.37304688,  0.03759766,  0.16894531,  0.09472656,  0.19335938,
        0.15039062, -0.08203125,  0.20214844, -0.20214844, -0.21289062,
        0.01708984,  0.27929688,  0.03295898,  0.01409912, -0.20800781,
       -0.05541992,  0.31054688, -0.03344727,  0.03613281, -0.13574219,
        0.05249023, -0.00231934,  0.06494141, -0.08935547, -0.06542969,
       -0.29296875,  0.09814453, -0.03979492, -0.33007812, -0.02685547,
       -0.09277344, -0.05517578,  0.06445312,  0.12695312,  0.17285156,
        0.25      , -0.07861328,  0.23925781,  0.05712891, -0.10449219,
       -0.24804688, -0.00196838, -0.17773438,  0.2421875 ,  0.09375   ,
        0.22265625, -0.03735352,  0.11816406,  0.09667969,  0.1953125 ,
       -0.02758789,  0.24804688, -0.00218201, -0.09570312, -0.23339844,
       -0.01696777, -0.17285156, -0.15332031,  0.01239014,  0.08984375,
        0.05053711, -0.11230469, -0.19238281, -0.18945312,  0.11

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import re
corpus =[]

for i in range(0,len(data)):
    review = re.sub('[^a-zA-Z]', ' ', data['message'][i])
    review = review.lower()
    review = review.split()

    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
words =[]

for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [None]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [None]:
# traing word2vec
import gensim
from gensim.models import Word2Vec


In [None]:
model = Word2Vec(words,window=5,min_count=2)


In [None]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [None]:
#count of words
model.corpus_count

5564

In [None]:
model.epochs

5

In [None]:
model.wv.most_similar('slept')

[('second', 0.9921261668205261),
 ('haha', 0.9920939803123474),
 ('drive', 0.9919605851173401),
 ('charge', 0.9919096231460571),
 ('dear', 0.9919094443321228),
 ('soon', 0.9918732047080994),
 ('use', 0.9918333888053894),
 ('give', 0.9918211102485657),
 ('dude', 0.9918036460876465),
 ('get', 0.9917847514152527)]

In [None]:
model.wv.similar_by_word('pc')

[('enough', 0.9937564134597778),
 ('shit', 0.9936032295227051),
 ('best', 0.9935293197631836),
 ('stuff', 0.9935193657875061),
 ('even', 0.9935067296028137),
 ('card', 0.9934753179550171),
 ('selected', 0.9934659600257874),
 ('ill', 0.9934616684913635),
 ('po', 0.9934383034706116),
 ('cost', 0.9934328198432922)]

## Average Word2Vec

In [None]:
import numpy as np

In [None]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)

    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [None]:
# to ckeck progress via bar we can install library
!pip install tqdm



In [None]:
from tqdm import tqdm

In [None]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5564/5564 [00:00<00:00, 5717.10it/s]


In [None]:
type(X), len(X)

(list, 5564)

In [None]:
len(y)

5572

In [None]:
X_new = np.array(X, dtype='object')

In [None]:
X_new.shape

(5564,)

In [None]:
X_new[0].shape

(100,)

In [None]:
## Dependent Features
## Output Features
y = data[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'],dtype=int)
y=y.iloc[:,0].values

In [None]:
y.shape

(5564,)

In [None]:
X_new[0]

array([-0.10272423,  0.25687385,  0.13761549, -0.00583476,  0.06115429,
       -0.3271942 ,  0.09019694,  0.5213966 , -0.17370987, -0.12733702,
       -0.15690443, -0.36710048, -0.05342412,  0.13630362,  0.09386166,
       -0.24832557,  0.02756587, -0.31488025,  0.00823823, -0.45214912,
        0.11164748,  0.13481511,  0.1180327 , -0.11448053, -0.086352  ,
        0.00972461, -0.1886176 , -0.15325084, -0.25113767,  0.03403864,
        0.30170512,  0.06459535,  0.13693959, -0.26416835, -0.13447626,
        0.2746174 ,  0.00635588, -0.20264994, -0.1443543 , -0.4272451 ,
        0.02544168, -0.25002912, -0.05985095,  0.07817313,  0.2733653 ,
       -0.11278042, -0.19234908, -0.00407857,  0.13771869,  0.20366023,
        0.1495673 , -0.26649755, -0.05134052, -0.03736661, -0.14332482,
        0.1913739 ,  0.14360555, -0.05600342, -0.299822  ,  0.02195933,
        0.06911779,  0.12225544, -0.10688493,  0.01893052, -0.30078876,
        0.19031842,  0.0951691 ,  0.21606351, -0.26193726,  0.30

In [None]:
df=pd.DataFrame()
for i in range(0,len(X)):
    df = pd.concat([df, pd.DataFrame(X[i].reshape(1,-1))], ignore_index=True)


In [None]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.102724,0.256874,0.137615,-0.005835,0.061154,-0.327194,0.090197,0.521397,-0.17371,-0.127337,...,0.352413,0.138103,0.059981,0.111194,0.336272,0.293311,0.116427,-0.320032,0.108041,-0.066225
1,-0.065428,0.174977,0.089003,-0.005032,0.040083,-0.217223,0.061891,0.34684,-0.118062,-0.085481,...,0.233385,0.095816,0.042112,0.071919,0.225111,0.196263,0.080121,-0.213083,0.071538,-0.03834
2,-0.098742,0.239278,0.129831,0.002715,0.063244,-0.298578,0.082607,0.483225,-0.160259,-0.112259,...,0.330645,0.130246,0.061726,0.098876,0.316878,0.275533,0.103235,-0.296683,0.100097,-0.061132
3,-0.125143,0.312369,0.169746,-0.001896,0.078502,-0.400362,0.10831,0.6403,-0.21137,-0.156514,...,0.428589,0.164005,0.07442,0.134028,0.407521,0.36117,0.137963,-0.386223,0.130406,-0.080637
4,-0.125158,0.307653,0.16696,-0.010693,0.071055,-0.386704,0.108964,0.621008,-0.207932,-0.149258,...,0.415567,0.164009,0.076075,0.12619,0.402696,0.348396,0.141063,-0.378537,0.128631,-0.078115


In [None]:
df['Output']=y

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.102724,0.256874,0.137615,-0.005835,0.061154,-0.327194,0.090197,0.521397,-0.17371,-0.127337,...,0.138103,0.059981,0.111194,0.336272,0.293311,0.116427,-0.320032,0.108041,-0.066225,1
1,-0.065428,0.174977,0.089003,-0.005032,0.040083,-0.217223,0.061891,0.34684,-0.118062,-0.085481,...,0.095816,0.042112,0.071919,0.225111,0.196263,0.080121,-0.213083,0.071538,-0.03834,1
2,-0.098742,0.239278,0.129831,0.002715,0.063244,-0.298578,0.082607,0.483225,-0.160259,-0.112259,...,0.130246,0.061726,0.098876,0.316878,0.275533,0.103235,-0.296683,0.100097,-0.061132,0
3,-0.125143,0.312369,0.169746,-0.001896,0.078502,-0.400362,0.10831,0.6403,-0.21137,-0.156514,...,0.164005,0.07442,0.134028,0.407521,0.36117,0.137963,-0.386223,0.130406,-0.080637,1
4,-0.125158,0.307653,0.16696,-0.010693,0.071055,-0.386704,0.108964,0.621008,-0.207932,-0.149258,...,0.164009,0.076075,0.12619,0.402696,0.348396,0.141063,-0.378537,0.128631,-0.078115,1


In [None]:
#check null values
df.isnull().sum()

0         23
1         23
2         23
3         23
4         23
          ..
96        23
97        23
98        23
99        23
Output     0
Length: 101, dtype: int64

In [None]:
#remove null
df.dropna(inplace=True)

In [None]:
## Independent Feature
X=df.drop(['Output'], axis=1)

In [None]:
X.isnull().sum()

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64

In [None]:
y = df['Output']

In [None]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)

In [None]:
#ensemble model
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [None]:
classifier.fit(X_train,y_train)

In [None]:
y_pred=classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9648331830477908


In [None]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.79      0.86       145
           1       0.97      0.99      0.98       964

    accuracy                           0.96      1109
   macro avg       0.95      0.89      0.92      1109
weighted avg       0.96      0.96      0.96      1109

