In [1]:
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

import gensim
import gensim.downloader as api
from gensim.utils import simple_preprocess

import tqdm
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prajwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
messages = pd.read_csv('/Users/prajwal/Developer/Prep/NLP/sms_spam_collection/SMSSpamCollection', sep = '\t', names=["lable", "message"], encoding='latin-1')

In [3]:
messages

Unnamed: 0,lable,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
messages['message'].loc[40]

'Pls go ahead with watts. I just wanted to be sure. Do have a great weekend. Abiola'

In [5]:
ps = PorterStemmer()

In [6]:
corpus = []
for i in range (0, len(messages)):
    review =re.sub('[^a-zA-z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review =' '.join(review)
    corpus.append(review)

In [7]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [8]:
# Creating the Bag of Words model
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()

In [9]:
X[1010]

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
y = pd.get_dummies(messages['lable'])
y = y.iloc[:,1].values

In [11]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0) #x is independent variable and y is dependent variable

In [13]:
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [14]:
y_pred = spam_detect_model.predict(X_test)

In [15]:
score = accuracy_score(y_test, y_pred)
print(score, ":accuracy_score")
print(classification_report(y_test, y_pred), ":classification_report")

0.9865470852017937 :accuracy_score
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       955
           1       0.98      0.93      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115
 :classification_report


In [16]:
# Creating TF-IDF model
tv = TfidfVectorizer(max_features=2500, ngram_range=(1,3))
X = tv.fit_transform(corpus).toarray()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0) #x is independent variable and y is dependent variable

In [18]:
Spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [19]:
y_pred = Spam_detect_model.predict(X_test)

In [20]:
score = accuracy_score(y_test, y_pred)
print(score, ":accuracy_score")
print(classification_report(y_test, y_pred), ":classification_report")

0.9802690582959641 :accuracy_score
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.86      0.93       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115
 :classification_report


In [21]:
# Creating Random Forest model
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

In [22]:
y_pred = classifier.predict(X_test)

In [23]:
score = accuracy_score(y_test, y_pred)
print(score, ":accuracy_score")
print(classification_report(y_test, y_pred), ":classification_report")

0.9829596412556054 :accuracy_score
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.88      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115
 :classification_report


## AvgWord2Vec

In [24]:
wv = api.load('word2vec-google-news-300')

In [25]:
lemmatizer = WordNetLemmatizer()

In [26]:
corpus = []
for i in range (0, len(messages)):
    review =re.sub('[^a-zA-z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review =' '.join(review)
    corpus.append(review)

In [27]:
words = []
for sent in corpus:
    sent_tokenize= nltk.sent_tokenize(sent)
    for word in sent_tokenize:
        words.append(simple_preprocess(sent))


In [28]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  're

In [29]:
# Creating w2v model from scratch
model = gensim.models.Word2Vec(sentences=words, window=5, min_count=1, workers=4)

In [30]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'go',
 'lt',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'take',
 'mobile',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'dear',
 'make',
 'night',
 'message',
 'say',
 'well',
 'min',
 'thing',
 'much',
 'great',
 'claim',
 'oh',
 'hope',
 'hey',
 'give',
 'number',
 'happy',
 'work',
 'friend',
 'wat',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'said',
 'ask',
 'win',
 'amp',
 'life',
 'cash',
 'im',
 'yeah',
 'really',
 'tone',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'last',
 'service',
 'uk',
 'thanks',
 'anything',
 'care',
 'com',
 'would',
 'year',
 'lol',
 'nokia',
 'also',
 'every',
 'feel',
 'keep',
 'pick',
 'sure',
 'contact',
 'sent',
 'urgent',


In [31]:
model.wv.similar_by_word('india')

[('place', 0.9947283864021301),
 ('com', 0.994641125202179),
 ('know', 0.9946401715278625),
 ('thk', 0.9946032166481018),
 ('went', 0.994601309299469),
 ('wont', 0.9945880770683289),
 ('say', 0.9944930672645569),
 ('bit', 0.9944672584533691),
 ('going', 0.9944663643836975),
 ('ur', 0.9944631457328796)]

In [32]:
def avg_word2vec(doc):
    return np.mean([model.wv[i] for i in doc if i in model.wv.index_to_key], axis=0) 

In [33]:
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  0%|          | 0/5565 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5565/5565 [00:00<00:00, 5912.70it/s]


In [34]:
X_new = np.array(X)

  X_new = np.array(X)


In [35]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [36]:
X_new[0]

array([-9.13633108e-02,  2.08559170e-01,  1.29898954e-02,  1.56323090e-02,
        6.46349266e-02, -2.92421132e-01,  6.16961643e-02,  3.42438251e-01,
       -1.81231380e-01, -4.54233587e-02, -1.53014198e-01, -2.56315619e-01,
       -2.54810750e-02,  6.40414283e-02,  5.09473458e-02, -1.50288463e-01,
        8.29897169e-03, -2.75674105e-01,  3.90919764e-03, -3.63362223e-01,
        1.13327667e-01,  7.46337697e-02,  5.97644746e-02, -1.09560646e-01,
       -5.15656993e-02, -1.10445637e-02, -1.38035938e-01, -1.16332702e-01,
       -1.65853530e-01,  3.89854237e-02,  2.56858885e-01,  4.14279588e-02,
        5.00513837e-02, -1.29665866e-01, -2.98753045e-02,  1.72416210e-01,
       -5.31925894e-02, -1.77010760e-01, -1.41099304e-01, -3.06949556e-01,
        3.63985561e-02, -1.34094223e-01, -4.07074355e-02,  6.06991686e-02,
        1.50739059e-01, -6.43986687e-02, -1.22584000e-01, -5.67905493e-02,
        1.02522247e-01,  1.42979264e-01,  1.15710512e-01, -1.52492598e-01,
       -6.67702183e-02, -

In [37]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.20, random_state=0) #x is independent variable and y is dependent variable

ValueError: Found input variables with inconsistent numbers of samples: [5565, 5572]

In [38]:
print(X_new.shape, y.shape)

(5565,) (5572,)


In [39]:
X_new = X_new[:5565]
y = y[:5565]

In [40]:
print(X_new.shape, y.shape)

(5565,) (5565,)


In [41]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.20, random_state=0) #x is independent variable and y is dependent variable

In [47]:
y_pred = y_pred[:1113]

In [48]:
y_pred = classifier.predict(X_test)

ValueError: setting an array element with a sequence.

In [46]:
y_pred.shape

(1115,)

In [44]:
X_test.shape

(1113,)

In [45]:
for sample in X_test:
    print(sample)

[-0.14455748  0.33716804  0.01706965  0.02693067  0.10492227 -0.4699035
  0.09563861  0.5520378  -0.29428536 -0.07486261 -0.24650031 -0.4085195
 -0.03965331  0.10609695  0.08399216 -0.24881734  0.01151614 -0.4419774
  0.00473037 -0.58298606  0.18348852  0.11649144  0.10084401 -0.17520615
 -0.07865658 -0.02549406 -0.22140056 -0.18407698 -0.2674589   0.06748173
  0.41069835  0.06809659  0.08131567 -0.20728478 -0.04681919  0.28116375
 -0.08488428 -0.2858197  -0.22817759 -0.4973159   0.05806012 -0.21984977
 -0.06629738  0.0960969   0.24238455 -0.10487314 -0.1970944  -0.09078426
  0.16280302  0.23532754  0.1852584  -0.2513376  -0.10685319 -0.03160797
 -0.19330916  0.16808821  0.10428467  0.01854704 -0.3796362   0.05850598
  0.05794918  0.12850581 -0.04661539  0.04610004 -0.32427186  0.27380896
  0.14497228  0.18102372 -0.35291836  0.41210464 -0.23169246  0.14822656
  0.34796962 -0.1284121   0.37185213  0.186871    0.00260733 -0.04758305
 -0.31555057  0.19369657 -0.14625344 -0.14143519 -0.33