# Load pretrained Google word2vec model

In [64]:
# Load the pretrained vectors stored in binary word2vec format.
from gensim.models import Word2Vec

model_name = 'GoogleNews-vectors-negative300.bin'
embedding_model = Word2Vec.load_word2vec_format(
    model_name,
    binary=True,
)

# Examples:
- word2vec representation of the word 'king'

In [4]:
# Display the embedding vector the loaded model stores for the word 'king'.
embedding_model.wv['king']

array([  1.25976562e-01,   2.97851562e-02,   8.60595703e-03,
         1.39648438e-01,  -2.56347656e-02,  -3.61328125e-02,
         1.11816406e-01,  -1.98242188e-01,   5.12695312e-02,
         3.63281250e-01,  -2.42187500e-01,  -3.02734375e-01,
        -1.77734375e-01,  -2.49023438e-02,  -1.67968750e-01,
        -1.69921875e-01,   3.46679688e-02,   5.21850586e-03,
         4.63867188e-02,   1.28906250e-01,   1.36718750e-01,
         1.12792969e-01,   5.95703125e-02,   1.36718750e-01,
         1.01074219e-01,  -1.76757812e-01,  -2.51953125e-01,
         5.98144531e-02,   3.41796875e-01,  -3.11279297e-02,
         1.04492188e-01,   6.17675781e-02,   1.24511719e-01,
         4.00390625e-01,  -3.22265625e-01,   8.39843750e-02,
         3.90625000e-02,   5.85937500e-03,   7.03125000e-02,
         1.72851562e-01,   1.38671875e-01,  -2.31445312e-01,
         2.83203125e-01,   1.42578125e-01,   3.41796875e-01,
        -2.39257812e-02,  -1.09863281e-01,   3.32031250e-02,
        -5.46875000e-02,

- The 10 words most similar to the word 'king'

In [6]:
# Ten highest-scoring neighbours of 'king' according to most_similar_cosmul.
embedding_model.most_similar_cosmul(positive=['king'], topn=10)

[('kings', 0.8569014072418213),
 ('queen', 0.8255470395088196),
 ('monarch', 0.8206589818000793),
 ('crown_prince', 0.8102101683616638),
 ('prince', 0.8079988956451416),
 ('sultan', 0.7932403683662415),
 ('ruler', 0.7898775935173035),
 ('princes', 0.782326877117157),
 ('Prince_Paras', 0.7716464996337891),
 ('throne', 0.7711045742034912)]

- 'king' - 'man' + 'woman' = 'queen'

In [8]:
# Analogy query: 'woman' and 'king' contribute positively, 'man' negatively;
# only the single best-scoring word is returned.
embedding_model.most_similar_cosmul(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)

[('queen', 0.9314123392105103)]

- The 10 words most similar to the word 'book'

In [65]:
# Ten highest-scoring neighbours of 'book' in the model currently bound to
# `embedding_model`.
embedding_model.most_similar_cosmul(positive=['book'], topn=10)

[('tome', 0.8742907047271729),
 ('books', 0.868958055973053),
 ('memoir', 0.8651455640792847),
 ('paperback_edition', 0.8434174656867981),
 ('autobiography', 0.8370755910873413),
 ('memoirs', 0.825256884098053),
 ('Book', 0.8239633440971375),
 ('paperback', 0.8235605359077454),
 ('novels', 0.8170721530914307),
 ('hardback', 0.8141531944274902)]

# Train embeddings on Christian news articles 

In [55]:
import os
from gensim.models import word2vec
import data_helpers

# Word2Vec hyper-parameters (passed to the constructor below)
num_workers = 8       # Number of threads to run in parallel
downsampling = 1e-3   # Downsample setting for frequent words
num_features = 300    # Passed as `size`
min_word_count = 5    # Passed as `min_count`
context = 4           # Passed as `window`

# Build the path with os.path.join instead of a hard-coded Windows separator
# so the cell also runs on other platforms.
TEXT_DATA_DIR = os.path.join('20_newsgroup', 'talk.religion.misc')

# Read every article file (names consisting only of digits) as one string.
raw_articles = []
for fname in sorted(os.listdir(TEXT_DATA_DIR)):
    if fname.isdigit():
        fpath = os.path.join(TEXT_DATA_DIR, fname)
        # An explicit latin-1 decode never fails on arbitrary bytes and does
        # not depend on the locale's default encoding; `with` guarantees the
        # handle is closed even if the read raises.
        with open(fpath, encoding='latin-1') as f:
            raw_articles.append(f.read().strip())

# Clean each article with the project helper, then split it on single spaces.
x_text = [data_helpers.clean_str(article).split(" ") for article in raw_articles]

print('Found %s texts.' % len(x_text))

# Initialize and train the model
print("Training Word2Vec model...")
x, vocabulary, vocabulary_inv = data_helpers.load_data(x_text)
# Map every entry of x back through vocabulary_inv to rebuild token lists.
# NOTE(review): presumably x holds vocabulary indices produced by load_data
# -- confirm against data_helpers.
sentences = [[vocabulary_inv[w] for w in s] for s in x]
embedding_model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If we don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
embedding_model.init_sims(replace=True)

model_name = 'religion.misc'
# Saving the model for later use. You can load it later using Word2Vec.load()
print('Saving Word2Vec model \'%s\'' % model_name)
embedding_model.save(model_name)

Found 1000 texts.
Training Word2Vec model...
Saving Word2Vec model 'religion.misc'


In [49]:
# Display the trained model's vocabulary mapping (word -> gensim Vocab entry).
embedding_model.vocab

{'nagasiva': <gensim.models.word2vec.Vocab at 0xe591ec0780>,
 'woe': <gensim.models.word2vec.Vocab at 0xe5917c2c88>,
 'long': <gensim.models.word2vec.Vocab at 0xe593fc2cf8>,
 'whose': <gensim.models.word2vec.Vocab at 0xe591f180b8>,
 'v2110a': <gensim.models.word2vec.Vocab at 0xe591f18fd0>,
 'jefferson': <gensim.models.word2vec.Vocab at 0xe591d14240>,
 'ridiculous': <gensim.models.word2vec.Vocab at 0xe591ec0668>,
 'restriction': <gensim.models.word2vec.Vocab at 0xe5929cf4a8>,
 'shotgun': <gensim.models.word2vec.Vocab at 0xe591fa6ba8>,
 'trodwell': <gensim.models.word2vec.Vocab at 0xe591fc7b70>,
 'paradox': <gensim.models.word2vec.Vocab at 0xe591d14278>,
 'press': <gensim.models.word2vec.Vocab at 0xe5919a4dd8>,
 'power': <gensim.models.word2vec.Vocab at 0xe5929cfa90>,
 'feet': <gensim.models.word2vec.Vocab at 0xe592acb198>,
 'insist': <gensim.models.word2vec.Vocab at 0xe591bd0550>,
 'jxw': <gensim.models.word2vec.Vocab at 0xe591cd9f28>,
 'faced': <gensim.models.word2vec.Vocab at 0xe5919a

- The 10 words most similar to the word 'book'

In [58]:
# Ten highest-scoring neighbours of 'book' in the model currently bound to
# `embedding_model`.
embedding_model.most_similar_cosmul(positive=['book'], topn=10)

[('element', 0.9719886183738708),
 ('powers', 0.9699809551239014),
 ('church', 0.9677160382270813),
 ('son', 0.967503011226654),
 ('attack', 0.9674603939056396),
 ('parts', 0.9664692878723145),
 ('self', 0.9659683704376221),
 ('tradition', 0.9646417498588562),
 ('current', 0.9640681743621826),
 ('priesthood', 0.9605560302734375)]

# Train embeddings on Computer-related news articles 

In [59]:
import os
from gensim.models import word2vec
import data_helpers

# Word2Vec hyper-parameters (passed to the constructor below)
num_workers = 8       # Number of threads to run in parallel
downsampling = 1e-3   # Downsample setting for frequent words
num_features = 300    # Passed as `size`
min_word_count = 5    # Passed as `min_count`
context = 4           # Passed as `window`

# Build the path with os.path.join instead of a hard-coded Windows separator
# so the cell also runs on other platforms.
TEXT_DATA_DIR = os.path.join('20_newsgroup', 'comp.os.ms-windows.misc')

# Read every article file (names consisting only of digits) as one string.
raw_articles = []
for fname in sorted(os.listdir(TEXT_DATA_DIR)):
    if fname.isdigit():
        fpath = os.path.join(TEXT_DATA_DIR, fname)
        # An explicit latin-1 decode never fails on arbitrary bytes and does
        # not depend on the locale's default encoding; `with` guarantees the
        # handle is closed even if the read raises.
        with open(fpath, encoding='latin-1') as f:
            raw_articles.append(f.read().strip())

# Clean each article with the project helper, then split it on single spaces.
x_text = [data_helpers.clean_str(article).split(" ") for article in raw_articles]

print('Found %s texts.' % len(x_text))

# Initialize and train the model
print("Training Word2Vec model...")
x, vocabulary, vocabulary_inv = data_helpers.load_data(x_text)
# Map every entry of x back through vocabulary_inv to rebuild token lists.
# NOTE(review): presumably x holds vocabulary indices produced by load_data
# -- confirm against data_helpers.
sentences = [[vocabulary_inv[w] for w in s] for s in x]
embedding_model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If we don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
embedding_model.init_sims(replace=True)

model_name = 'windows.misc'
# Saving the model for later use. You can load it later using Word2Vec.load()
print('Saving Word2Vec model \'%s\'' % model_name)
embedding_model.save(model_name)

Found 1000 texts.
Training Word2Vec model...
Saving Word2Vec model 'windows.misc'


- The 10 words most similar to the word 'book'

In [62]:
# Ten highest-scoring neighbours of 'book' in the model currently bound to
# `embedding_model`.
embedding_model.most_similar_cosmul(positive=['book'], topn=10)

[('adjusted', 0.9946157932281494),
 ('4mb', 0.9943787455558777),
 ('harddisk', 0.9934042096138),
 ('cpu', 0.9921078681945801),
 ('specifically', 0.9917465448379517),
 ('reports', 0.9915753602981567),
 ('necessary', 0.990689218044281),
 ('cheaper', 0.9904521703720093),
 ('including', 0.9904208779335022),
 ('limited', 0.9892256855964661)]

- Google: 
tome,books,memoir,paperback_edition,autobiography,memoirs,Book,paperback,novels,hardback			
- Religion:
element,powers,church,son,attack,parts,self,tradition,current,priesthood
- Computer:
adjusted,4mb,harddisk,cpu,specifically,reports,necessary,cheaper,including,limited		


# News Classification using CNN with word embeddings

Loading and preprocessing data

In [3]:
import os
import numpy as np
np.random.seed(1337)  # fixed seed so the shuffle below is repeatable

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import sys


TEXT_DATA_DIR = '20_newsgroup.reg/'
MAX_SEQUENCE_LENGTH = 1000   # every sequence is padded/truncated to this length
MAX_NB_WORDS = 20000         # passed to the Tokenizer as nb_words
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2       # fraction of the samples held out for validation


# Step 1: read the text samples and their labels. Each sub-directory of
# TEXT_DATA_DIR is one class; each all-digit file name is one sample.
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Python 2's built-in open() has no `encoding` argument, so it is only
# supplied on Python 3.
open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                # `with` closes the handle even if the read raises.
                with open(fpath, **open_kwargs) as f:
                    texts.append(f.read())
                labels.append(label_id)
print(labels_index)
print('Found %s texts.' % len(texts))

# Step 2: vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Step 3: shuffle, then split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit index: `data[:-0]` would be empty, so the negative-index
# form silently drops every training sample when the validation count is 0.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

Processing text dataset
{'soc.religion.christian': 0, 'talk.religion.misc': 1}
Found 1997 texts.
Found 33651 unique tokens.
[66, 39, 31, 23, 16, 4, 194, 31, 23, 16, 4, 247, 235, 16, 4, 212, 161, 211, 21, 262, 14, 125, 14, 4, 72, 14, 4, 24, 22, 5736, 1188, 21, 5619, 8408, 67, 88, 28, 24, 57, 77, 509, 1750, 55, 69, 51, 296, 219, 467, 630, 58, 14905, 72, 14, 4, 64, 308, 51, 91, 278, 467, 779, 76, 86, 123, 72, 14, 4, 74, 1188, 1590, 65, 748, 120, 24, 126, 14, 4, 9, 85, 5183, 310, 197, 723, 310, 58, 72, 14, 4, 17788, 19679, 3266, 4, 1652, 15132, 75, 9, 85, 5183, 242, 197, 938, 1041, 58, 9662, 72, 14, 4, 3444, 869, 23, 805, 4, 5687, 3444, 75, 89, 10, 44, 1131, 10, 1932, 60, 1454, 1, 135, 678, 60, 10, 230, 6, 167, 71, 45, 1129, 3, 509, 1750, 376, 1129, 3, 1, 915, 300, 98, 116, 5, 6, 4179, 7, 101, 20, 6676, 20, 104, 678, 60, 252, 163, 468, 1, 2531, 6, 167, 509, 1753, 19876, 5, 36, 3364, 8096, 376, 1129, 3, 618, 92, 103, 116, 8, 2860, 641, 3, 3261, 10776, 3, 1932, 135, 104, 30, 1, 8923, 2278, 9

## 1- Random Initialization

In [4]:

# Embedding layer with no pretrained weights supplied (weights=None).
vocab_size = len(word_index) + 1
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=None,
    input_length=MAX_SEQUENCE_LENGTH,
)
print('Training model.')

# 1D convnet: three Conv1D(128, 5) + MaxPooling1D stages with pool sizes
# 5, 5 and 35, followed by a dense layer and a softmax over the labels.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = embedded_sequences
for pool_size in (5, 5, 35):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    nb_epoch=5,
    batch_size=128,
)

# Export the embedding layer's weights as a text file: a "<count> <dim>"
# header line, then one "<word> <v0> <v1> ..." line per word_index entry.
embedding_matrix = embedding_layer.get_weights()
trained_vectors = embedding_matrix[0]  # first array of the layer's weight list
gensim_first_line = "{} {}".format(len(word_index), EMBEDDING_DIM)
outfile = 'rand.word2vec'
with open(outfile, 'w', encoding="utf-8") as fout:
    fout.write(gensim_first_line + "\n")
    for word, i in word_index.items():
        row = trained_vectors[i]
        values = ''.join(' %s' % row[j] for j in range(EMBEDDING_DIM))
        fout.write(word + values + "\n")

# Reload the exported vectors with gensim; this rebinds `embedding_model`.
from gensim.models import Word2Vec
embedding_model = Word2Vec.load_word2vec_format(outfile, binary=False)

# The 10 words most similar to 'book'
embedding_model.most_similar_cosmul(positive=['book'], topn=10)

Training model.
Train on 1598 samples, validate on 399 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[('movement', 0.6429674625396729),
 ('clh', 0.6362773776054382),
 ('scribes', 0.6289868354797363),
 ('sinned', 0.6255764365196228),
 ('congregation', 0.6216408014297485),
 ('rutgers', 0.6205594539642334),
 ('performance', 0.6179541945457458),
 ('services', 0.6179179549217224),
 ('zoerasterism', 0.6166352033615112),
 ('listens', 0.6152451634407043)]

## 2- Static Embeddings

Load Pre-trained Embeddings

In [5]:
# Load the pretrained vectors stored in binary word2vec format.
from gensim.models import Word2Vec

model_name = 'GoogleNews-vectors-negative300.bin'
embedding_model = Word2Vec.load_word2vec_format(
    model_name,
    binary=True,
)

In [6]:
# Build the initial embedding matrix: row i holds the vector `embedding_model`
# stores for the word with index i; rows for unknown words stay all-zero.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if word not in embedding_model.vocab:
        continue
    embedding_matrix[i] = embedding_model.wv[word]

# trainable=False: the layer is created as non-trainable.
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
print('Training model.')

# 1D convnet: three Conv1D(128, 5) + MaxPooling1D stages with pool sizes
# 5, 5 and 35, followed by a dense layer and a softmax over the labels.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = embedded_sequences
for pool_size in (5, 5, 35):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    nb_epoch=5,
    batch_size=128,
)

# Export the embedding layer's weights as a text file: a "<count> <dim>"
# header line, then one "<word> <v0> <v1> ..." line per word_index entry.
# Note that `embedding_matrix` is rebound here from the array built above to
# the list returned by get_weights().
embedding_matrix = embedding_layer.get_weights()
layer_vectors = embedding_matrix[0]  # first array of the layer's weight list
gensim_first_line = "{} {}".format(len(word_index), EMBEDDING_DIM)
outfile = 'static.word2vec'
with open(outfile, 'w', encoding="utf-8") as fout:
    fout.write(gensim_first_line + "\n")
    for word, i in word_index.items():
        row = layer_vectors[i]
        values = ''.join(' %s' % row[j] for j in range(EMBEDDING_DIM))
        fout.write(word + values + "\n")

# Reload the exported vectors with gensim; this rebinds `embedding_model`,
# replacing the model that was used to fill the matrix above.
from gensim.models import Word2Vec
embedding_model = Word2Vec.load_word2vec_format(outfile, binary=False)

# The 10 words most similar to 'book'
embedding_model.most_similar_cosmul(positive=['book'], topn=10)

Training model.
Train on 1598 samples, validate on 399 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)


[('books', 0.868958055973053),
 ('autobiography', 0.8370756506919861),
 ('paperback', 0.8235605359077454),
 ('novels', 0.8170720934867859),
 ('biography', 0.8077912926673889),
 ('novel', 0.8060959577560425),
 ('nonfiction', 0.7967409491539001),
 ('hardcover', 0.7966055870056152),
 ('novella', 0.7910082936286926),
 ('manuscript', 0.7879250645637512)]

## 3- Dynamic Embeddings

In [7]:
# Build the initial embedding matrix: row i holds the vector `embedding_model`
# stores for the word with index i; rows for unknown words stay all-zero.
# NOTE(review): `embedding_model` is whatever the kernel currently holds. If
# the static-embeddings cell ran just before this one, that is the model
# reloaded from 'static.word2vec', not the GoogleNews file -- confirm this is
# the intended starting point.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if word not in embedding_model.vocab:
        continue
    embedding_matrix[i] = embedding_model.wv[word]

# trainable=True: the layer is created as trainable.
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)
print('Training model.')

# 1D convnet: three Conv1D(128, 5) + MaxPooling1D stages with pool sizes
# 5, 5 and 35, followed by a dense layer and a softmax over the labels.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = embedded_sequences
for pool_size in (5, 5, 35):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    nb_epoch=5,
    batch_size=128,
)

# Export the embedding layer's weights as a text file: a "<count> <dim>"
# header line, then one "<word> <v0> <v1> ..." line per word_index entry.
# Note that `embedding_matrix` is rebound here from the array built above to
# the list returned by get_weights().
embedding_matrix = embedding_layer.get_weights()
layer_vectors = embedding_matrix[0]  # first array of the layer's weight list
gensim_first_line = "{} {}".format(len(word_index), EMBEDDING_DIM)
outfile = 'daynamic.word2vec'
with open(outfile, 'w', encoding="utf-8") as fout:
    fout.write(gensim_first_line + "\n")
    for word, i in word_index.items():
        row = layer_vectors[i]
        values = ''.join(' %s' % row[j] for j in range(EMBEDDING_DIM))
        fout.write(word + values + "\n")

# Reload the exported vectors with gensim; this rebinds `embedding_model`.
from gensim.models import Word2Vec
embedding_model = Word2Vec.load_word2vec_format(outfile, binary=False)

# The 10 words most similar to 'book'
embedding_model.most_similar_cosmul(positive=['book'], topn=10)

Training model.
Train on 1598 samples, validate on 399 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)


[('books', 0.868929386138916),
 ('autobiography', 0.8340421319007874),
 ('paperback', 0.82172030210495),
 ('novels', 0.8105382323265076),
 ('novel', 0.806250274181366),
 ('biography', 0.804276168346405),
 ('hardcover', 0.7959304451942444),
 ('nonfiction', 0.7913392186164856),
 ('novella', 0.7887855768203735),
 ('manuscript', 0.7881438136100769)]