In [None]:
import nltk
import sklearn
import pandas as pd
import numpy as np

In [None]:
import csv

# Load the tab-separated (label, message) file; the file has no header row, so
# column names are supplied explicitly.
# quoting=csv.QUOTE_NONE makes pandas treat double quotes as ordinary characters.
# With the default quoting, an unbalanced '"' inside a message opens a quoted
# field that can swallow the following lines, silently merging/dropping rows.
# NOTE(review): assumes the file is raw TSV without CSV-style quoting (the UCI
# SMS Spam Collection layout) — confirm against the data file.
messages = pd.read_csv(
    "SMSSpamCollection.txt",
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Display the loaded DataFrame (columns: label, message).
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Label-based lookup: row with index label 100, column 'message'.
messages.loc[100, 'message']

"Please don't text me anymore. I have nothing else to say."

In [None]:
# Positional lookup: element at integer position 100 of the 'message' column.
messages['message'].iloc[100]

"Please don't text me anymore. I have nothing else to say."

In [None]:
# Label-based lookup on the 'message' Series (index label 100).
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [None]:
# Number of rows loaded.
len(messages)

5572

In [None]:
# Inspect the message stored under index label 1.
messages.loc[1,'message']

'Ok lar... Joking wif u oni...'

In [None]:
# Fetch the 'wordnet' resource through nltk's downloader, then import the
# lemmatizer class used by the text-cleaning cell.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Single lemmatizer instance, reused for every token in the cleaning step.
lemmatizer = WordNetLemmatizer()


In [None]:
# Fetch the 'stopwords' resource through nltk's downloader, then import the
# corpus reader that exposes the English stop-word list.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re

# Built once up front: the original rebuilt set(stopwords.words('english'))
# for every token of every message even though the set never changes.
stop_words = frozenset(stopwords.words('english'))
# Pre-compiled pattern: anything that is not an ASCII letter or digit.
non_alnum = re.compile('[^A-Za-z0-9]')


def clean_message(text):
  """Normalise one raw SMS into a space-joined string of lemmatized tokens.

  Steps: replace every non-alphanumeric character with a space, lower-case,
  split on whitespace, drop English stop words, lemmatize what is left.

  Args:
    text: raw message string.

  Returns:
    The cleaned tokens joined by single spaces ('' when nothing survives).
  """
  tokens = non_alnum.sub(" ", text).lower().split()
  # PorterStemmer().stem(word) could be used here instead of lemmatizer.lemmatize(word).
  return ' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)


# Iterating the column directly keeps row order without depending on the
# DataFrame index being exactly 0..n-1 (which messages.loc[i, ...] required).
corpus = [clean_message(text) for text in messages['message']]


In [None]:
# Display the cleaned corpus (one space-joined token string per message).
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply reply

In [None]:
# Number of cleaned documents.
len(corpus)

5572

In [None]:
# Confirm the container type of corpus.
type(corpus)

list

In [None]:
# Binary bag-of-n-grams features.
# ngram_range=(2, 2) restricts the vocabulary to bigrams only; binary=True
# records presence/absence rather than counts; at most 2000 features are kept.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2, 2), binary=True, max_features=2000)
cv.fit(corpus)
# Densify the sparse document-term matrix into a regular NumPy array.
X = cv.transform(corpus).toarray()

In [None]:
# Display the dense document-term matrix.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# (n_documents, n_features) of the bigram matrix.
X.shape

(5572, 2000)

In [None]:
# Confirm X is a dense array after .toarray().
type(X)

numpy.ndarray

In [None]:
# One-hot encode the label column: one indicator column per distinct label.
y = pd.get_dummies(messages['label'])


In [None]:
# Display the indicator columns.
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [None]:
# Keep only the indicator column at position 1 as a NumPy array
# (the displayed frame shows that column is 'spam', so 1 = spam, 0 = ham).
# NOTE(review): this rebinds y from a DataFrame to an array, so re-running
# this cell without first re-running the get_dummies cell will fail.
y = y.iloc[:, 1].values


In [None]:
# The target is now one-dimensional.
y.shape

(5572,)

In [None]:
# Display the target array.
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the reported metrics can be reproduced on
# re-run; stratify=y keeps the spam/ham ratio the same in both partitions,
# which matters because spam is the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes on the bigram features; fit() trains the estimator
# in place and returns it, so constructing and fitting separately is equivalent.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features.
y_pred = spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Fraction of test messages whose predicted label equals the true label.
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy of the bigram Naive Bayes model on the test split.
print(score)

0.9766816143497757


In [None]:
# Per-class precision / recall / F1 for the bigram Naive Bayes model.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       982
           1       1.00      0.74      0.85       133

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
# IPython magic: show the docstring of TfidfVectorizer (interactive help only;
# it does not change any notebook variable).
%pdoc sklearn.feature_extraction.text.TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams, capped at 2000 features.
# binary=True is passed through unchanged from the original configuration.
tv = TfidfVectorizer(ngram_range=(1, 2), max_features=2000, binary=True)
tv.fit(corpus)
# Left as the sparse matrix returned by the vectorizer (no .toarray() here).
X1 = tv.transform(corpus)

In [None]:
# (n_documents, n_features) of the TF-IDF matrix.
X1.shape

(5572, 2000)

In [None]:
# Confirm the container type returned by fit_transform.
type(X1)

scipy.sparse._csr.csr_matrix

In [None]:
# Display the summary repr of the TF-IDF matrix.
X1

<5572x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 41394 stored elements in Compressed Sparse Row format>

In [None]:
# Hold out 20% of the TF-IDF rows for evaluation.
# random_state makes the split (and therefore the metrics) reproducible;
# stratify=y preserves the spam/ham ratio in both partitions.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Multinomial Naive Bayes on the TF-IDF features, then predictions for the
# held-out rows. fit() returns the estimator itself, so this is equivalent to
# chaining the call onto the constructor.
tf_model = MultinomialNB()
tf_model.fit(X1_train, y1_train)
y1_pred = tf_model.predict(X1_test)

In [None]:
# Accuracy of the TF-IDF Naive Bayes model on its test split.
print(accuracy_score(y1_test, y1_pred))


0.9829596412556054


In [None]:
# Per-class precision / recall / F1 for the TF-IDF Naive Bayes model.
print(classification_report(y1_test, y1_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       967
           1       0.98      0.89      0.93       148

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
!pip install gensim



In [None]:
#import gensim.downloader as api
#w_v = api.load('word2vec-google-news-300')

In [None]:
# Import the tokenizing helpers used by the next cell and fetch the 'punkt'
# resource through nltk's downloader.
from gensim.utils import simple_preprocess
nltk.download('punkt')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Tokenize every cleaned document into exactly one token list, so words[k]
# always corresponds to corpus[k] (and therefore to label y[k]).
#
# The previous loop had two problems:
#   * empty documents were appended as the bare string '' — a str mixed into a
#     list of token lists;
#   * non-empty documents contributed one entry per sentence found by
#     sent_tokenize, so a document split into several sentences would have
#     shifted every later entry out of alignment with the labels.
# simple_preprocess returns an empty list for an empty string, so no special
# case is needed. It also discards some tokens (compare 'ok lar joking wif u
# oni' in corpus with the tokens shown for words[1]).
words = [simple_preprocess(document) for document in corpus]

In [None]:
# Display the tokenized documents.
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  're

In [None]:
# Confirm the container type of words.
type(words)

list

In [None]:
# Token counts of two sample documents (positions 1 and 99).
print(len(words[1]))
print(len(words[99]))

5
4


In [None]:
# Number of tokenized documents; should equal len(corpus).
len(words)

5572

In [None]:
import gensim
# Train Word2Vec embeddings on the tokenized messages (context window of 5).
# workers=1 together with a fixed seed removes the run-to-run drift caused by
# multi-threaded training (the two vector dumps further down differ slightly
# between executions). gensim's documentation notes that full determinism
# additionally requires a fixed PYTHONHASHSEED.
model = gensim.models.Word2Vec(words, window=5, seed=42, workers=1)

In [None]:
# corpus_count attribute of the trained model; the value shown equals len(words).
model.corpus_count

5572

In [None]:
# Words most similar to 'prize' in the learned embedding space, with scores.
model.wv.similar_by_word('prize')

[('claim', 0.9991861581802368),
 ('call', 0.9990310668945312),
 ('cash', 0.9990110397338867),
 ('guaranteed', 0.9989576935768127),
 ('draw', 0.998761773109436),
 ('awarded', 0.9987437129020691),
 ('line', 0.9987378716468811),
 ('service', 0.9986788630485535),
 ('contact', 0.9986294507980347),
 ('landline', 0.9985613226890564)]

In [None]:
# List the vocabulary keys known to the trained model.
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'go',
 'lt',
 'ok',
 'free',
 'day',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'love',
 'text',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'take',
 'mobile',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'dear',
 'make',
 'night',
 'message',
 'say',
 'well',
 'min',
 'thing',
 'much',
 'oh',
 'great',
 'hope',
 'claim',
 'hey',
 'number',
 'give',
 'happy',
 'wat',
 'work',
 'friend',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'tone',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'service',
 'uk',
 'thanks',
 'last',
 'care',
 'anything',
 'com',
 'would',
 'year',
 'also',
 'nokia',
 'lol',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'contact',
 'urgent',
 'sent',


In [None]:
def avg_word2vec(doc):
  """Return the mean Word2Vec embedding of the in-vocabulary tokens of `doc`.

  Args:
    doc: iterable of token strings (one tokenized message).

  Returns:
    A 1-D array of length model.wv.vector_size. When no token of `doc` is in
    the model vocabulary (including an empty document) the array is filled
    with NaN, so such rows can still be detected and dropped downstream while
    every result keeps the same shape.
  """
  # key_to_index is a dict, so membership is O(1); the original scanned the
  # index_to_key list for every token.
  vocab = model.wv.key_to_index
  vectors = [model.wv[word] for word in doc if word in vocab]
  if not vectors:
    # np.mean([]) would emit a RuntimeWarning and return a scalar NaN, which
    # makes the collected results ragged (mixed scalars and vectors).
    return np.full(model.wv.vector_size, np.nan, dtype=np.float32)
  return np.mean(vectors, axis=0)

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm

In [None]:
# One averaged embedding per tokenized document, in document order;
# tqdm wraps the iteration to show a progress bar.
X2 = [avg_word2vec(tokens) for tokens in tqdm(words)]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5572/5572 [00:01<00:00, 3219.69it/s]


In [None]:
# Confirm the container type of X2.
type(X2)

list

In [None]:
# Embedding length of two sample documents (positions 2 and 200).
print(len(X2[2]))
print(len(X2[200]))

100
100


In [None]:
# Display the list of per-document averaged embeddings.
X2

[array([-1.48409709e-01,  2.45086238e-01,  5.08038141e-02,  7.26901665e-02,
         1.17966048e-01, -3.34744722e-01,  3.11578680e-02,  5.21151781e-01,
        -1.93538964e-01, -1.69732943e-01, -1.53990403e-01, -3.05912793e-01,
        -5.32964356e-02,  1.05549209e-01,  6.28418252e-02, -2.00080246e-01,
         4.40572985e-02, -3.61875743e-01, -3.02231181e-02, -4.92810637e-01,
         7.08955675e-02,  1.92057624e-01,  1.03398085e-01, -1.48735955e-01,
        -1.50015652e-01,  2.08630487e-02, -2.58579344e-01, -2.04284295e-01,
        -2.50700742e-01,  3.25785726e-02,  2.12168351e-01, -6.39009550e-02,
         4.84433062e-02, -1.27839506e-01, -1.27974495e-01,  1.99263707e-01,
        -6.49805740e-02, -1.84512898e-01, -2.15403810e-01, -4.42298770e-01,
        -1.66719630e-02, -3.04562777e-01,  3.74349882e-04,  9.75079536e-02,
         2.18692794e-01, -1.15423985e-01, -2.13463768e-01, -1.39626255e-02,
         1.27880186e-01,  1.77263334e-01,  1.12008117e-01, -1.51438981e-01,
        -5.5

In [None]:
# Stack the per-document vectors into one (n_documents, vector_size) array.
# X2 may contain a bare scalar NaN for documents with no in-vocabulary token;
# np.array() on such a mixed list builds a ragged object array, which NumPy
# deprecated and rejects with a ValueError from version 1.24 on.
# Broadcasting each entry to the embedding width turns those documents into
# all-NaN rows, and leaves full-length vectors unchanged.
X_new = np.vstack([
    np.broadcast_to(np.asarray(vec, dtype=np.float32), (model.wv.vector_size,))
    for vec in X2
])
X_new

  X_new = np.array(X2)


array([array([-1.42047361e-01,  2.45436281e-01,  4.80380617e-02,  6.93600774e-02,
               1.20822385e-01, -3.30542833e-01,  3.50610726e-02,  5.17580748e-01,
              -1.90799996e-01, -1.68090716e-01, -1.51000559e-01, -3.05687457e-01,
              -4.78408486e-02,  1.02401495e-01,  5.19394018e-02, -2.00489700e-01,
               4.33718041e-02, -3.53216916e-01, -3.35830823e-02, -4.87333298e-01,
               7.24054649e-02,  1.87418982e-01,  9.13433954e-02, -1.52600750e-01,
              -1.50766656e-01,  1.87843405e-02, -2.52071530e-01, -2.03824997e-01,
              -2.46800333e-01,  2.96371859e-02,  2.10610613e-01, -5.78323491e-02,
               5.18317707e-02, -1.27726838e-01, -1.25170067e-01,  2.01033533e-01,
              -6.84668198e-02, -1.77475497e-01, -2.11025134e-01, -4.34139758e-01,
              -4.59310459e-03, -3.04101229e-01, -3.37128178e-04,  9.45407376e-02,
               2.15185210e-01, -1.11761235e-01, -2.15668455e-01, -1.24733122e-02,
               1

In [None]:
# Shape and container type of the stacked embeddings.
print(X_new.shape)
type(X_new)

(5572,)


numpy.ndarray

In [None]:
#ls = []
#for arr in X2:
  #ls1 = np.hstack(arr)
  #ls1 = ls1.tolist()
  #ls.append(ls1)
#X_n = np.array(ls)
#X_n = np.hstack(X_n)
#X_n.shape
#X_n = np.array(ls)
#X_n = X_n.reshape((5572,100))
#X_n.shape

In [None]:
#X_n = np.array(X2)
#X_n = X_n.reshape(-1,1)
#X_n.shape

  X_n = np.array(X2)


(5572, 1)

In [None]:

# #X_new = np.array(X2)
# #ls = []
# #for i in X2:
#   i = i.reshape(1,-1)
#   i = i.tolist()
#   ls1 = []
#   for j in range(len(i[0])):
#     ls1.append(i[0][j])
#   ls.append(ls1)
# #ls = np.array(ls)
# len(ls[0])


100

In [None]:
# Shape of the first document's embedding when viewed as a single row.
X_new[0].reshape(1,-1).shape

(1, 100)

In [None]:
# Build the feature frame (one row per document, one column per embedding
# dimension) in a single step.
# The previous version grew the frame with DataFrame.append inside a loop,
# which is quadratic and uses an API that pandas deprecated and removed in 2.0.
rows = [np.atleast_1d(np.asarray(vec, dtype=float)) for vec in X_new]
n_features = max(row.size for row in rows)
# A length-1 entry (a document whose average is a lone NaN) is broadcast to a
# full all-NaN row; full-length vectors pass through unchanged.
df = pd.DataFrame(np.vstack([np.broadcast_to(row, (n_features,)) for row in rows]))


  df = df.append(pd.DataFrame(X_new[i].reshape(1,-1)), ignore_index = True)


In [None]:
# Display the embedding feature frame.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.142047,0.245436,0.048038,0.069360,0.120822,-0.330543,0.035061,0.517581,-0.190800,-0.168091,...,0.277528,0.138602,0.035043,0.164329,0.484314,0.268387,0.075935,-0.302603,0.110399,-0.058179
1,-0.120857,0.203337,0.042200,0.065478,0.097289,-0.273656,0.028279,0.432547,-0.160816,-0.135756,...,0.229267,0.116309,0.025781,0.132945,0.401446,0.225530,0.067511,-0.249697,0.090396,-0.042921
2,-0.151632,0.254962,0.060472,0.080388,0.125005,-0.348282,0.036268,0.528530,-0.202404,-0.178419,...,0.289785,0.143061,0.038899,0.167374,0.497096,0.287424,0.068182,-0.320715,0.122902,-0.055611
3,-0.200930,0.347136,0.074768,0.095474,0.172008,-0.472766,0.042190,0.735066,-0.269848,-0.238494,...,0.381169,0.194457,0.050295,0.228925,0.690512,0.379260,0.105109,-0.426013,0.144814,-0.086993
4,-0.160724,0.279152,0.054797,0.077434,0.135396,-0.373843,0.042473,0.579947,-0.215593,-0.186758,...,0.307176,0.164151,0.040580,0.188659,0.549250,0.301084,0.090084,-0.336962,0.120789,-0.066494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,-0.175222,0.290674,0.061987,0.087535,0.141091,-0.396262,0.050403,0.609430,-0.223359,-0.209772,...,0.329719,0.164676,0.047095,0.193946,0.574984,0.334386,0.073177,-0.378104,0.138305,-0.066697
5568,-0.157643,0.280272,0.062557,0.078388,0.134327,-0.378342,0.032522,0.590827,-0.217829,-0.186731,...,0.308602,0.156931,0.040923,0.185815,0.545352,0.305223,0.084755,-0.339304,0.120934,-0.062223
5569,-0.076096,0.108797,0.027239,0.030655,0.051376,-0.156835,0.010676,0.241828,-0.083298,-0.081803,...,0.121382,0.063175,0.010420,0.081546,0.230288,0.132472,0.023604,-0.132952,0.056226,-0.027583
5570,-0.157970,0.267925,0.058191,0.083206,0.128734,-0.364259,0.039028,0.560382,-0.212040,-0.188877,...,0.302097,0.150255,0.034285,0.179597,0.524953,0.294256,0.074031,-0.335626,0.121907,-0.059469


In [None]:
# (n_documents, n_embedding_dimensions).
df.shape

(5572, 100)

In [None]:
# Count missing values per column.
df.isnull().sum()

0     77
1     77
2     77
3     77
4     77
      ..
95    77
96    77
97    77
98    77
99    77
Length: 100, dtype: int64

In [None]:
# Attach the labels as an 'output' column so that features and labels are
# filtered together by the row-dropping step that follows.
df['output'] = y

In [None]:
# Drop every row that contains a missing value, modifying df in place.
df.dropna(axis = 0, inplace = True)

In [None]:
# Display the frame after attaching labels and dropping incomplete rows.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,output
0,-0.142047,0.245436,0.048038,0.069360,0.120822,-0.330543,0.035061,0.517581,-0.190800,-0.168091,...,0.138602,0.035043,0.164329,0.484314,0.268387,0.075935,-0.302603,0.110399,-0.058179,0
1,-0.120857,0.203337,0.042200,0.065478,0.097289,-0.273656,0.028279,0.432547,-0.160816,-0.135756,...,0.116309,0.025781,0.132945,0.401446,0.225530,0.067511,-0.249697,0.090396,-0.042921,0
2,-0.151632,0.254962,0.060472,0.080388,0.125005,-0.348282,0.036268,0.528530,-0.202404,-0.178419,...,0.143061,0.038899,0.167374,0.497096,0.287424,0.068182,-0.320715,0.122902,-0.055611,1
3,-0.200930,0.347136,0.074768,0.095474,0.172008,-0.472766,0.042190,0.735066,-0.269848,-0.238494,...,0.194457,0.050295,0.228925,0.690512,0.379260,0.105109,-0.426013,0.144814,-0.086993,0
4,-0.160724,0.279152,0.054797,0.077434,0.135396,-0.373843,0.042473,0.579947,-0.215593,-0.186758,...,0.164151,0.040580,0.188659,0.549250,0.301084,0.090084,-0.336962,0.120789,-0.066494,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,-0.175222,0.290674,0.061987,0.087535,0.141091,-0.396262,0.050403,0.609430,-0.223359,-0.209772,...,0.164676,0.047095,0.193946,0.574984,0.334386,0.073177,-0.378104,0.138305,-0.066697,1
5568,-0.157643,0.280272,0.062557,0.078388,0.134327,-0.378342,0.032522,0.590827,-0.217829,-0.186731,...,0.156931,0.040923,0.185815,0.545352,0.305223,0.084755,-0.339304,0.120934,-0.062223,0
5569,-0.076096,0.108797,0.027239,0.030655,0.051376,-0.156835,0.010676,0.241828,-0.083298,-0.081803,...,0.063175,0.010420,0.081546,0.230288,0.132472,0.023604,-0.132952,0.056226,-0.027583,0
5570,-0.157970,0.267925,0.058191,0.083206,0.128734,-0.364259,0.039028,0.560382,-0.212040,-0.188877,...,0.150255,0.034285,0.179597,0.524953,0.294256,0.074031,-0.335626,0.121907,-0.059469,0


In [None]:
# Re-check missing values per column after dropna.
df.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
output    0
Length: 101, dtype: int64

In [None]:
# Feature matrix: every column except the label column.
X_n = df.drop('output', axis = 1)

In [None]:
# Target vector aligned with the rows kept in X_n.
y_n = df['output']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the embedding rows for evaluation.
# random_state makes the split reproducible; stratify=y_n preserves the
# spam/ham ratio in both partitions.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X_n, y_n, test_size=0.2, random_state=42, stratify=y_n
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the averaged Word2Vec features.
# random_state pins the randomness used while building the forest so the
# fitted model and its metrics are reproducible across runs.
rfc = RandomForestClassifier(random_state=42).fit(X2_train, y2_train)

In [None]:
# Predict labels for the held-out embedding rows.
y2_pred = rfc.predict(X2_test)

In [None]:
# Accuracy of the random forest on its test split.
print(accuracy_score(y2_test, y2_pred))

0.9681528662420382


In [None]:
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y2_test, y2_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       952
           1       0.95      0.80      0.87       147

    accuracy                           0.97      1099
   macro avg       0.96      0.90      0.93      1099
weighted avg       0.97      0.97      0.97      1099

