In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from nltk.probability import FreqDist
import collections

In [2]:
import gensim
from gensim.models.word2vec import Word2Vec

In [3]:
 # Read In Data and Format It Better
df = pd.read_csv('spam.csv', sep=',', encoding='ISO-8859-1')
df = df.drop(columns=[ "Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
  # Add 2 New Columns: For binary spam/ham indicator and length of message
df['Result']= df['v1'].map( {'spam' : int(1), 'ham' : int(0)})
df['Message_Size'] = df['v2'].apply(len)
df.head(5)

Unnamed: 0,v1,v2,Result,Message_Size
0,ham,"Go until jurong point, crazy.. Available only ...",0,111
1,ham,Ok lar... Joking wif u oni...,0,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155
3,ham,U dun say so early hor... U c already then say...,0,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61


In [5]:
 # Calculate Statistics
totalMessages = df['Result'].count()
numSpams = df[df['Result']==1]['Result'].count()
numValid = df[df['Result'] == 0]['Result'].count()
# Print Distribution
print(f'{numSpams} of {totalMessages} messages are spam: {((numSpams/totalMessages)*100)}%')

747 of 5572 messages are spam: 13.406317300789663%


In [6]:
nltk.download('punkt')
# Tokenize Each Message and add the list of tokens to the DataFrame
df['Tokens'] = df['v2'].apply(word_tokenize)
df.head(5)

[nltk_data] Downloading package punkt to /Users/yingqiu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,v1,v2,Result,Message_Size,Tokens
0,ham,"Go until jurong point, crazy.. Available only ...",0,111,"[Go, until, jurong, point, ,, crazy, .., Avail..."
1,ham,Ok lar... Joking wif u oni...,0,29,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,0,49,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61,"[Nah, I, do, n't, think, he, goes, to, usf, ,,..."


In [26]:
special_words = ['...']
nltk.download('stopwords')
# Add Tokenize With Removed Stop Words
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yingqiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
def eliminateStop(words):
# Remove stopwords
    words = [w for w in words if not w in stop_words]
# Remove special words
    words = [w for w in words if not w in special_words]
# Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]
# Lowercase all words (default_stopwords are lowercase too)
    returnList = [word.lower() for word in words]
    return returnList

In [28]:
df['Filtered_Tokens'] = df['Tokens'].apply(eliminateStop)
df.head(5)

Unnamed: 0,v1,v2,Result,Message_Size,Tokens,Filtered_Tokens
0,ham,"Go until jurong point, crazy.. Available only ...",0,111,"[Go, until, jurong, point, ,, crazy, .., Avail...","[go, jurong, point, crazy, .., available, bugi..."
1,ham,Ok lar... Joking wif u oni...,0,29,"[Ok, lar, ..., Joking, wif, u, oni, ...]","[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,0,49,"[U, dun, say, so, early, hor, ..., U, c, alrea...","[dun, say, early, hor, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61,"[Nah, I, do, n't, think, he, goes, to, usf, ,,...","[nah, n't, think, goes, usf, lives, around, th..."


In [10]:
#find the 20 most common words used in spam SMS
df_spam = df.loc[df['v1'] == 'spam']
list_all_spam = df_spam['Filtered_Tokens'].tolist()
list_all_spam = [words for sublist in list_all_spam for words in sublist]

fdist = nltk.FreqDist(list_all_spam)
for word, frequency in fdist.most_common(20):
    print(u'{};{}'.format(word, frequency))

call;346
free;219
txt;156
ur;144
mobile;123
text;121
stop;114
claim;113
you;107
reply;104
prize;92
get;84
to;79
your;76
's;72
new;69
send;68
nokia;65
cash;62
urgent;62


In [16]:
#find the 20 most common words used in ham SMS
df_ham = df.loc[df['v1'] == 'ham']
list_all_ham = df_ham['Filtered_Tokens'].tolist()
list_all_ham = [words for sublist in list_all_ham for words in sublist]

fdist = nltk.FreqDist(list_all_ham)
for word, frequency in fdist.most_common(20):
    print(u'{};{}'.format(word, frequency))

's;420
'm;387
n't;345
gt;318
lt;316
get;301
'';264
go;246
ok;246
got;242
ur;237
know;234
you;233
like;231
call;230
'll;228
good;227
come;225
time;195
love;180


In [11]:
word_token = []
for i in df['Filtered_Tokens']:
    word_token.append(i)
print(word_token[: 2])

[['go', 'jurong', 'point', 'crazy', '..', 'available', 'bugis', 'great', 'world', 'la', 'buffet', 'cine', 'got', 'amore', 'wat'], ['ok', 'lar', 'joking', 'wif', 'oni']]


In [12]:
word2vec_model = Word2Vec(word_token, size=500, window=3, min_count=1, workers=16)
print(word2vec_model)

Word2Vec(vocab=9362, size=500, alpha=0.025)


In [13]:
token = Tokenizer(9362)
token.fit_on_texts(df['Filtered_Tokens'])
text = token.texts_to_sequences(df['Filtered_Tokens'])
text = pad_sequences(text, 75)
print(text[:2])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0   12 4370  738  694    1  589 1201   69  252 1202
  2866 1203   16 4371   76]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
    14  253 1547  376 1801]]


In [17]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['Result'])
y = to_categorical(y)
y[:2]

array([[1., 0.],
       [1., 0.]], dtype=float32)

In [24]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(np.array(text), y, test_size=0.2, stratify=y)

In [25]:
keras_model = Sequential()
keras_model.add(word2vec_model.wv.get_keras_embedding(True))
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(GlobalMaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Dense(200))
keras_model.add(Activation('relu'))
keras_model.add(Dropout(0.2))
keras_model.add(Dense(2))
keras_model.add(Activation('softmax'))
keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer='adam')
keras_model.fit(x_train, y_train, batch_size=16, epochs=3, validation_data=(x_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f8f175281d0>

### reference: https://www.kaggle.com/jagannathrk/word2vec-cnn-text-classification