# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Seed for the train/test split. Without it every kernel restart draws a different
# X_test, so outputs printed from X_test in later cells stop agreeing with each other.
RANDOM_STATE = 42

# Load the raw SMS spam data, drop the three unnamed columns, and give the
# remaining two columns readable names.
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]

# Tokenize each message into a list of tokens with gensim's simple_preprocess.
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

# Hold out 20% of the messages as the test set.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2,
                                                    random_state=RANDOM_STATE)

# Train word vectors on the training messages only.
# NOTE(review): `size` is the gensim 3.x keyword (gensim >= 4 calls it `vector_size`);
# confirm the installed version before upgrading. Training itself may still vary
# slightly between runs when gensim uses several worker threads.
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [2]:
# Generate a list of words the word2vec model learned word vectors for.
# The bare expression below is the cell's displayed value.
# NOTE(review): `index2word` is the gensim 3.x attribute name; gensim >= 4 exposes
# this list as `index_to_key` -- confirm the installed version before upgrading.
w2v_model.wv.index2word

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'it',
 'your',
 'call',
 'of',
 'have',
 'that',
 'on',
 'now',
 'can',
 'are',
 'so',
 'not',
 'but',
 'or',
 'we',
 'at',
 'do',
 'get',
 'with',
 'if',
 'just',
 'will',
 'ur',
 'be',
 'no',
 'this',
 'gt',
 'lt',
 'up',
 'how',
 'from',
 'when',
 'ok',
 'go',
 'free',
 'what',
 'out',
 'all',
 'll',
 'got',
 'like',
 'am',
 'day',
 'then',
 'come',
 'good',
 'know',
 'he',
 'time',
 'its',
 'there',
 'was',
 'only',
 'love',
 'send',
 'text',
 'want',
 'as',
 'by',
 'txt',
 'one',
 'going',
 'need',
 'see',
 'home',
 'still',
 'stop',
 'reply',
 'she',
 'sorry',
 'lor',
 'today',
 'mobile',
 'our',
 'back',
 'don',
 'about',
 'da',
 'hi',
 'dont',
 'take',
 'tell',
 'later',
 'any',
 'new',
 'please',
 'did',
 'been',
 'pls',
 'think',
 'her',
 'here',
 'week',
 'they',
 'phone',
 'claim',
 'oh',
 'ì_',
 'has',
 'some',
 'much',
 'hey',
 'hope',
 'where',
 'happy',
 'dear',
 'an',
 'well',
 're',
 'msg',
 'give',
 

In [3]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
# Build the vocabulary lookup once as a set: testing membership against the
# `index2word` list is a linear scan, and it was being repeated for every token.
w2v_vocab = set(w2v_model.wv.index2word)

# Each message keeps only the tokens the model has a vector for, so the per-message
# arrays have different numbers of rows. Fill a 1-D object array element by element
# so NumPy never tries to stack the ragged pieces into one rectangular array
# (implicit ragged construction is an error on recent NumPy releases).
w2v_vect = np.empty(len(X_test), dtype=object)
for idx, ls in enumerate(X_test):
    w2v_vect[idx] = np.array([w2v_model.wv[i] for i in ls if i in w2v_vocab])

In [4]:
# Why is the length of the sentence different than the length of the sentence vector?
# Tokens the model has no vector for were filtered out when w2v_vect was built, so a
# message can end up with fewer word vectors than it has tokens.
for tokens, word_vectors in zip(X_test, w2v_vect):
    print(len(tokens), len(word_vectors))

22 19
8 7
6 6
6 5
8 7
6 6
6 6
25 12
24 23
23 22
22 19
7 7
11 11
6 5
7 6
5 5
24 17
13 13
6 5
8 8
4 3
7 7
12 12
14 10
9 9
26 20
35 28
22 22
12 11
5 1
22 17
7 6
6 4
18 18
6 6
30 25
11 10
26 23
9 9
11 11
5 5
7 7
25 22
8 5
6 6
10 10
8 8
12 12
11 11
6 5
22 16
23 23
10 9
12 9
6 5
10 9
4 4
31 30
8 2
13 11
6 6
13 12
42 39
23 22
24 22
21 21
14 14
9 9
2 2
8 7
14 14
18 18
4 4
10 9
28 26
15 15
10 10
14 14
21 17
15 15
24 24
1 1
6 6
6 6
28 25
12 12
28 25
17 12
9 7
11 9
6 6
8 7
13 13
30 29
20 19
6 6
12 10
7 5
13 13
5 5
13 12
9 9
4 3
14 13
7 6
8 6
31 30
7 7
20 18
13 12
9 9
5 5
16 16
8 8
11 9
8 3
7 7
26 24
22 22
26 25
27 22
1 0
19 19
8 8
26 26
25 23
25 23
29 29
23 21
8 8
9 9
31 31
17 16
1 1
12 11
6 6
7 7
4 4
16 15
8 7
14 14
6 5
12 12
21 21
2 2
4 4
6 6
17 16
6 6
27 26
22 22
9 9
18 14
18 17
5 3
23 18
5 5
27 25
13 13
9 8
20 20
6 6
13 13
12 10
25 17
19 19
25 22
5 3
20 17
6 6
29 28
4 4
6 6
23 22
30 29
13 9
22 20
7 6
16 12
21 19
2 2
6 6
15 15
24 21
23 23
13 11
24 22
11 8
16 12
4 4
17 17
28 26
30 28
5 4
30 28


In [6]:
# Inspect the type of the first test message's entry in w2v_vect
type(w2v_vect[0])

numpy.ndarray

In [5]:
# Average the first test message's word vectors along axis 0 (across its words),
# collapsing them into a single vector.
np.mean(w2v_vect[0], axis=0)

array([ 0.04926166,  0.21867795,  0.3428932 ,  0.33140454,  0.09872754,
       -0.4353554 ,  0.3243809 , -0.20690425,  0.31750086, -0.11791521,
        0.06970374, -0.10528845,  0.27195713,  0.00221676,  0.05412457,
       -0.05497198, -0.1184602 , -0.11580746,  0.05639719,  0.43767986,
       -0.01353129,  0.20087206, -0.09427097, -0.07042402, -0.24732323,
        0.01863688, -0.32070485,  0.0056856 , -0.09420473, -0.03496959,
        0.35425454, -0.5445007 ,  0.05326956, -0.19236243, -0.02431195,
        0.02794109,  0.3244538 , -0.36424056, -0.55527186,  0.7253942 ,
        0.21793163,  0.01648162,  0.32204798,  0.16142635, -0.03493207,
        0.02341656,  0.24573329,  0.0441433 ,  0.51901203,  0.21757884,
        0.47430494, -0.1609342 ,  0.0247377 , -0.30021408,  0.3053873 ,
        0.09433365, -0.2118545 , -0.2909658 ,  0.24171257, -0.10755377,
       -0.17006397, -0.13193403,  0.1709929 , -0.07988475,  0.55515474,
        0.1202231 , -0.0583941 ,  0.09652471, -0.20236622, -0.22

In [6]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
# Take the vector width from the trained model instead of repeating the literal 100,
# so the zero-vector fallback stays in step if the training size is ever changed.
w2v_dim = w2v_model.wv.vector_size

# A message with no in-vocabulary tokens has nothing to average; represent it with
# an all-zero vector of the same width so every sentence vector has equal length.
w2v_vect_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(w2v_dim)
    for vect in w2v_vect
]

In [7]:
# Are our sentence vector lengths consistent?
# Print each test message's token count next to the length of its averaged vector.
for tokens, sentence_vector in zip(X_test, w2v_vect_avg):
    print(len(tokens), len(sentence_vector))

4 100
27 100
5 100
11 100
12 100
68 100
25 100
8 100
23 100
7 100
26 100
5 100
13 100
13 100
14 100
22 100
18 100
31 100
9 100
26 100
22 100
15 100
17 100
26 100
5 100
16 100
9 100
7 100
7 100
4 100
7 100
5 100
22 100
33 100
4 100
22 100
7 100
9 100
7 100
8 100
16 100
4 100
23 100
8 100
12 100
9 100
7 100
8 100
24 100
6 100
23 100
9 100
17 100
23 100
31 100
15 100
14 100
21 100
9 100
23 100
9 100
13 100
8 100
18 100
19 100
6 100
5 100
29 100
6 100
28 100
5 100
7 100
7 100
10 100
9 100
4 100
5 100
6 100
9 100
17 100
10 100
24 100
8 100
8 100
5 100
17 100
26 100
15 100
8 100
5 100
13 100
6 100
5 100
5 100
23 100
5 100
101 100
14 100
6 100
10 100
28 100
17 100
6 100
5 100
8 100
24 100
4 100
22 100
44 100
10 100
20 100
14 100
11 100
6 100
17 100
10 100
13 100
13 100
25 100
9 100
6 100
54 100
5 100
9 100
18 100
9 100
14 100
23 100
22 100
4 100
8 100
21 100
17 100
13 100
11 100
7 100
20 100
11 100
23 100
25 100
15 100
6 100
8 100
17 100
5 100
6 100
12 100
9 100
11 100
5 100
27 100
6 100
8 10