# word2vec test


Todo:

- Remove /n

## Step 1: Load Tweets

In [1]:
import os 
import pandas as pd
import numpy as np
import gensim
import Cython

In [2]:
DATA_PATH = './twitter-datasets/'
TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg.txt')
TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos.txt')
TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

In [3]:
def read_tweets(path):
    with open(path) as f:
        return f.readlines()

In [30]:
raw_neg = pd.DataFrame(read_tweets(TRAIN_NEG_PATH), columns=['raw_tweet'])
raw_neg['score'] = -1
raw_pos = pd.DataFrame(read_tweets(TRAIN_POS_PATH), columns=['raw_tweet'])
raw_pos['score'] = 1
raw_tweets = pd.concat([raw_neg, raw_pos], axis=0)

In [29]:
raw_tweets.head()

Unnamed: 0,raw_tweet,score
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,-1
1,glad i dot have taks tomorrow ! ! #thankful #s...,-1
2,1-3 vs celtics in the regular season = were fu...,-1
3,<user> i could actually kill that girl i'm so ...,-1
4,<user> <user> <user> i find that very hard to ...,-1


In [28]:
raw_tweets = raw_tweets.reset_index(drop=True)

In [31]:
raw_tweets['split_tweets'] = raw_tweets['raw_tweet'].str.split(' ')

In [32]:
raw_tweets.head()

Unnamed: 0,raw_tweet,score,split_tweets
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,-1,"[vinco, tresorpack, 6, (, difficulty, 10, of, ..."
1,glad i dot have taks tomorrow ! ! #thankful #s...,-1,"[glad, i, dot, have, taks, tomorrow, !, !, #th..."
2,1-3 vs celtics in the regular season = were fu...,-1,"[1-3, vs, celtics, in, the, regular, season, =..."
3,<user> i could actually kill that girl i'm so ...,-1,"[<user>, i, could, actually, kill, that, girl,..."
4,<user> <user> <user> i find that very hard to ...,-1,"[<user>, <user>, <user>, i, find, that, very, ..."


## Step 2: Convert Tweets to vectors

In [None]:
# Load Google's pre-trained Word2Vec model.
#gensim.models.KeyedVectors.load_word2vec_format
#model = gensim.models.Word2Vec.load('/tmp/mymodel')
#model = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin',binary=True) 

In [8]:
model = gensim.models.Word2Vec(raw_tweets['split_tweets'], min_count=1, workers=4)

In [None]:
model.save(os.path.join("models", "own_model.w2v"))

In [9]:
print(model)

Word2Vec(vocab=125642, size=100, alpha=0.025)


In [10]:
model.wv['walk'][:10]

array([-1.27893901,  0.63710999,  1.22609246, -0.96996796, -0.10930096,
        0.91625458,  0.21599129,  0.13468453, -0.60631526,  0.7564019 ], dtype=float32)

In [11]:
# When training finished:
word_vectors = model.wv
del model

In [12]:
word_vectors.wv['computer'].shape

(100,)

## Step 3: Build sentences of vectors

In [None]:
word_vectors.vocab

In [13]:
word_vectors.most_similar('car')

[('room', 0.7656147480010986),
 ('car\n', 0.7620103359222412),
 ('reset', 0.742739737033844),
 ('head', 0.722123920917511),
 ('bed\n', 0.7090008854866028),
 ('hand\n', 0.7073951363563538),
 ('door', 0.7024655938148499),
 ('shirt', 0.7021811008453369),
 ('ride', 0.7010906934738159),
 ('tattoo\n', 0.6975417733192444)]

In [14]:
word_vectors.most_similar(positive=['king', 'man'], negative=['woman'], topn=1)

[('tru', 0.5975828170776367)]

In [15]:
raw_tweets['split_tweets'] = raw_tweets['split_tweets'].apply(lambda x: np.array(x))

In [16]:
tweets = np.array(raw_tweets['split_tweets'])

In [17]:
tweets[10]

array(['introduction', 'to', 'programming', 'with', 'c', '+', '+', '(',
       '2nd', 'edition', 'this', 'solid', 'foundation', 'in', 'the',
       'basics', 'of', 'c', '+', '+', 'programming', 'will', '...',
       '<url>\n'],
      dtype='<U12')

In [None]:
tweets_vec = [None]*200000

for idx_t, tweet in enumerate(tweets):
    sentence = np.empty((tweet.shape[0], 100))
    for idx_w, word in enumerate(tweet):
        sentence[idx_w] = word_vectors.wv[word]
    tweets_vec[idx_t] = sentence

In [None]:
max_i= 0
min_i = 10000
max_j = 0
min_j = 10000
for t in tweets_vec:
    i = t.shape[0]
    j = t.shape[1]
    if i > max_i:
        max_i = i
    if i < min_i:
        min_i = i
    if j > max_j:
        max_j = j
    if j < min_j:
        min_j = j
        
print(max_i, max_j, min_i, min_j)

In [None]:
tweets_vec[0].shape

In [None]:
x_train = np.empty((200000, 100))

for idx_t, tweet in enumerate(tweets):
    sentence = np.empty((tweet.shape[0], 100))
    for idx_w, word in enumerate(tweet):
        sentence[idx_w] = word_vectors.wv[word]
    x_train[idx_t] = sentence.mean(axis=0)

x_train

In [None]:
x_train.shape

In [None]:
y_train = np.array(raw_tweets['score'])
# convert class vectors to binary class matrices
#y_train = keras.utils.to_categorical(y_train, 2)
y_train[1]