## Train word2vec model


In [9]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [11]:
# Clean data using the built in cleaner in gensim
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [12]:
messages['label']=messages['label'].map({'ham':1,'spam':0})

In [7]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

In [13]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [14]:
#  it represents all of the words that our Word2Vec model learned a vector for. 
# Or put another way, it's all of the words that appeared in the training data at least twice. So you can exp
w2v_model.wv.index_to_key 

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'call',
 'have',
 'that',
 'on',
 'now',
 'are',
 'can',
 'so',
 'not',
 'or',
 'do',
 'but',
 'get',
 'we',
 'at',
 'will',
 'no',
 'ur',
 'be',
 'just',
 'if',
 'with',
 'this',
 'gt',
 'lt',
 'how',
 'up',
 'what',
 'ok',
 'when',
 'from',
 'go',
 'll',
 'all',
 'free',
 'out',
 'good',
 'got',
 'am',
 'like',
 'know',
 'day',
 'time',
 'then',
 'come',
 'there',
 'its',
 'was',
 'only',
 'he',
 'love',
 'send',
 'want',
 'txt',
 'she',
 'one',
 'as',
 'need',
 'text',
 'by',
 'going',
 'sorry',
 'lor',
 'home',
 'about',
 'see',
 'stop',
 'today',
 'still',
 'hi',
 'da',
 'reply',
 'don',
 'our',
 'back',
 'dont',
 'later',
 'think',
 'new',
 'take',
 'her',
 'mobile',
 'please',
 'pls',
 'tell',
 'has',
 'been',
 'where',
 'any',
 'did',
 'phone',
 'who',
 'ì_',
 'they',
 'dear',
 'much',
 'some',
 're',
 'msg',
 'great',
 'week',
 'wat',
 'night',
 'here',
 'give',
 'more',
 'claim',
 'hey'

In [15]:
# Find the most similar words to "king" based on word vectors from our trained model
w2v_model.wv.most_similar('king')

[('first', 0.9949063658714294),
 ('few', 0.9948491454124451),
 ('dnt', 0.9948464632034302),
 ('give', 0.994740903377533),
 ('there', 0.9947336912155151),
 ('buy', 0.9946560859680176),
 ('bt', 0.9946480989456177),
 ('without', 0.9946187734603882),
 ('before', 0.9945974349975586),
 ('its', 0.994570255279541)]

In [16]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  """
  import sys


In [17]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

8 8
7 7
6 6
11 10
13 13
5 5
5 5
8 8
13 12
28 28
16 16
9 9
27 22
27 24
6 6
10 10
7 7
7 6
12 12
4 4
4 4
15 15
20 20
24 23
6 6
16 14
17 15
6 6
25 25
24 18
11 11
22 22
17 15
6 6
10 8
5 5
9 9
13 12
24 22
23 22
18 18
22 21
13 12
10 7
23 23
8 8
14 13
16 16
9 8
18 16
3 3
8 7
7 7
17 17
11 11
17 17
8 8
21 21
13 13
8 6
21 21
23 23
29 27
23 21
9 9
4 4
16 15
4 4
46 44
8 8
9 9
22 22
7 7
6 6
8 7
7 6
62 55
27 26
17 13
6 6
5 5
10 10
7 7
5 5
23 22
26 24
5 5
43 41
10 9
24 24
4 4
4 4
17 17
24 24
21 21
26 24
6 6
9 9
12 9
18 18
24 22
6 5
7 7
25 24
11 11
20 19
4 4
7 7
19 18
4 4
5 5
10 9
10 9
6 3
23 21
60 55
23 21
4 4
26 21
22 20
4 4
18 17
10 10
10 9
18 17
4 4
6 6
1 1
22 16
14 14
19 14
30 29
27 27
24 22
19 19
8 7
11 8
6 6
6 6
6 6
4 4
14 11
11 11
4 4
9 9
24 23
23 23
9 9
7 7
16 16
9 9
5 5
15 14
5 5
2 2
20 16
7 7
11 11
8 8
12 11
24 24
11 6
23 20
10 10
8 8
6 6
21 19
14 14
24 23
18 17
15 15
8 8
31 31
16 15
25 24
10 10
6 6
21 21
12 12
12 12
8 8
22 20
5 5
8 8
9 9
24 24
22 21
6 6
12 11
9 7
10 8
13 13
16 16
25 25
19 1

20 19
30 27
23 19
5 5
4 4
21 21
8 8
18 18
26 26
3 3
26 26
9 9
5 5
7 6
13 13
22 22
11 9
22 22
13 12
5 5
6 4
11 11
9 9
30 28
9 8
15 15
46 44
9 7
10 8
30 29
16 16
35 31
5 5
24 24
7 6
10 10
8 8
6 5
26 24
21 20
7 7
14 13
6 6
6 6
10 10
5 5
9 9
20 20
10 10
5 5
27 27
21 21
21 20
6 6
6 5
29 26
4 4
16 15
10 9
5 5
10 10
4 4
5 5
5 5
8 8
3 3
4 4
16 16
22 22
22 22
9 9
30 29
30 30
4 4
24 23
21 21
23 21
23 23
24 24
16 16
15 15
6 6
5 4
22 22
8 7
2 2
6 6
22 22
10 10
21 21
10 9
18 16
12 10
25 22
30 30
9 6
8 8
27 27
4 4
17 17
7 7
39 31
20 19
6 6
5 3
5 5
6 6
6 6
9 9
23 21
31 31
23 20
17 12
6 6
9 9
22 20
6 6
5 5
7 7
8 8
6 6
15 14
21 21
12 12
6 6
8 7
4 4
11 10
8 7
7 7
4 4
6 6
7 7
22 18
5 5
27 27
25 25
8 7
4 4
7 6
4 2
9 9
21 14
15 14
8 8
23 19
11 11
8 7
17 16
9 9
11 10
20 19
6 6
32 32
15 15
21 21
8 8
4 4
7 6
11 11
5 4
4 4
8 8
10 5
27 26
18 18
30 30
8 8
9 9
22 21
5 5
5 5
5 4
34 30
9 9
18 17
10 9
7 5
12 10
68 54
5 5
13 13
15 15
8 8
42 38
6 6
8 8
5 3
20 17
21 18
24 22
12 11
8 7
29 29
8 2
19 16
21 19
22 22
8 8
11

31 31
21 20
25 22
9 8
25 25
4 4
8 7
8 8
29 29
13 13
3 3
7 6
6 6
27 25
2 2
25 24
28 17
21 19
30 28
9 8
13 11
7 7
11 10
9 9
6 6
4 4
20 16
5 5
16 16
1 1
7 7
16 16
7 7
9 9
5 5
101 101
12 12
8 8
33 32
23 22
9 9
12 12
14 13
9 8
5 5
10 10
12 12
26 26
17 16
4 4
13 13
14 14
5 5
9 8
7 6
21 15
21 21
10 10
8 8
7 7
14 14
1 1
6 5
22 20
6 6
30 29
8 8
7 7
7 6
8 7
23 21
11 10
6 6
15 15
6 4
12 12
18 18
7 7
19 19
8 8
23 23
12 12
7 7
24 24
6 6
26 25
14 12
17 17
18 18
6 6
16 16
7 7
27 27
8 7
21 20
6 6
28 28
7 7
15 14
23 22
22 22
6 5
29 26
21 20
3 3
12 12
3 3
5 5
33 33
25 25
14 11
22 22
10 10
8 7
8 8
24 24
10 10
20 19
25 24
6 6
17 17
15 13
12 10
50 44
18 17
8 8
23 23
6 6
5 5
15 15
4 3
26 26
17 15
11 11
6 6
5 5
7 5
6 5
5 5
5 5
6 6
7 7
14 13
36 34
18 15
8 8
4 4
8 8
5 5
4 4
6 6
11 8
5 5
20 20
5 5
6 4
6 6
20 18
21 21
13 13
14 13
23 16
33 33
5 4
25 20
22 18
5 5
22 20
13 10
30 27
23 23
17 17
9 9
6 6
3 3
29 29
15 13
24 23
17 17
22 21
22 20
10 8
6 6
21 18
13 13
4 4
6 6
8 8
24 23
13 12
6 6
5 5
6 6
3 3
6 5
12 12
5 5


In [18]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [19]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

8 100
7 100
6 100
11 100
13 100
5 100
5 100
8 100
13 100
28 100
16 100
9 100
27 100
27 100
6 100
10 100
7 100
7 100
12 100
4 100
4 100
15 100
20 100
24 100
6 100
16 100
17 100
6 100
25 100
24 100
11 100
22 100
17 100
6 100
10 100
5 100
9 100
13 100
24 100
23 100
18 100
22 100
13 100
10 100
23 100
8 100
14 100
16 100
9 100
18 100
3 100
8 100
7 100
17 100
11 100
17 100
8 100
21 100
13 100
8 100
21 100
23 100
29 100
23 100
9 100
4 100
16 100
4 100
46 100
8 100
9 100
22 100
7 100
6 100
8 100
7 100
62 100
27 100
17 100
6 100
5 100
10 100
7 100
5 100
23 100
26 100
5 100
43 100
10 100
24 100
4 100
4 100
17 100
24 100
21 100
26 100
6 100
9 100
12 100
18 100
24 100
6 100
7 100
25 100
11 100
20 100
4 100
7 100
19 100
4 100
5 100
10 100
10 100
6 100
23 100
60 100
23 100
4 100
26 100
22 100
4 100
18 100
10 100
10 100
18 100
4 100
6 100
1 100
22 100
14 100
19 100
30 100
27 100
24 100
19 100
8 100
11 100
6 100
6 100
6 100
4 100
14 100
11 100
4 100
9 100
24 100
23 100
9 100
7 100
16 100
9 100
5 100
1

4 100
10 100
55 100
6 100
16 100
7 100
9 100
9 100
11 100
10 100
26 100
30 100
11 100
8 100
3 100
5 100
8 100
57 100
6 100
9 100
15 100
56 100
4 100
10 100
5 100
8 100
9 100
12 100
3 100
20 100
10 100
27 100
16 100
12 100
27 100
15 100
4 100
7 100
4 100
13 100
25 100
7 100
8 100
26 100
25 100
6 100
7 100
7 100
5 100
30 100
16 100
31 100
29 100
8 100
20 100
47 100
7 100
5 100
17 100
15 100
18 100
23 100
4 100
30 100
15 100
5 100
15 100
29 100
21 100
40 100
4 100
16 100
20 100
9 100
17 100
23 100
5 100
23 100
6 100
20 100
5 100
10 100
5 100
5 100
8 100
5 100
11 100
22 100
0 100
5 100
5 100
13 100
8 100
21 100
29 100
5 100
20 100
30 100
23 100
5 100
4 100
21 100
8 100
18 100
26 100
3 100
26 100
9 100
5 100
7 100
13 100
22 100
11 100
22 100
13 100
5 100
6 100
11 100
9 100
30 100
9 100
15 100
46 100
9 100
10 100
30 100
16 100
35 100
5 100
24 100
7 100
10 100
8 100
6 100
26 100
21 100
7 100
14 100
6 100
6 100
10 100
5 100
9 100
20 100
10 100
5 100
27 100
21 100
21 100
6 100
6 100
29 100
4 10

2 100
34 100
28 100
6 100
6 100
23 100
12 100
5 100
18 100
4 100
7 100
23 100
13 100
21 100
23 100
13 100
24 100
19 100
21 100
25 100
6 100
8 100
8 100
20 100
8 100
21 100
1 100
5 100
23 100
16 100
27 100
50 100
6 100
5 100
16 100
7 100
19 100
8 100
19 100
9 100
29 100
3 100
5 100
9 100
9 100
23 100
26 100
4 100
5 100
10 100
9 100
5 100
7 100
1 100
26 100
16 100
2 100
6 100
0 100
4 100
8 100
4 100
14 100
24 100
8 100
30 100
25 100
13 100
8 100
12 100
12 100
10 100
13 100
17 100
7 100
16 100
13 100
6 100
47 100
10 100
5 100
5 100
20 100
7 100
26 100
13 100
25 100
10 100
6 100
7 100
21 100
5 100
31 100
1 100
5 100
24 100
11 100
8 100
44 100
0 100
5 100
25 100
8 100
6 100
9 100
9 100
7 100
15 100
7 100
4 100
26 100
16 100
18 100
22 100
7 100
3 100
12 100
9 100
8 100
9 100
8 100
10 100
23 100
7 100
6 100
11 100
5 100
5 100
4 100
8 100
11 100
13 100
15 100
1 100
10 100
9 100
5 100
10 100
10 100
20 100
13 100
11 100
6 100
12 100
8 100
10 100
31 100
14 100
30 100
6 100
13 100
23 100
8 100
5 1

## Fit RandomForestClassifier On Top Of Word Vectors

In [20]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [21]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [22]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.963 / Recall: 0.991 / Accuracy: 0.96
