In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount= True)

Mounted at /content/drive/


In [0]:
import numpy as np
import re
import itertools
from collections import Counter

def load_data_and_labels():
  positive_examples = list(open("/content/drive/My Drive/Colab Notebooks/positive.csv").readlines())
  positive_examples = [s.strip() for s in positive_examples]
  negative_examples = list(open("/content/drive/My Drive/Colab Notebooks/negative.csv").readlines())
  negative_examples = [s.strip() for s in negative_examples]
  x_text = positive_examples + negative_examples
  x_text = [s.split(" ") for s in x_text]
  positive_labels = [[0, 1] for _ in positive_examples]
  negative_labels = [[1, 0] for _ in negative_examples]
  y = np.concatenate([positive_labels, negative_labels], 0)
  return [x_text, y]

def pad_sentences(sentences, padding_word="<PAD/>"):
  sequence_length = max(len(x) for x in sentences)
  padded_sentences = []
  for i in range(len(sentences)):
      sentence = sentences[i]
      num_padding = sequence_length - len(sentence)
      new_sentence = sentence + [padding_word] * num_padding
      padded_sentences.append(new_sentence)
  return padded_sentences

def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

def build_input_data(sentences, labels, vocabulary):
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]
  
  
def load_data():
    """
    Loads and preprocessed data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
  
x, y, vocabulary, vocabulary_inv_list = load_data()  

In [17]:
print(y)
print(x)
print(vocabulary["a"])
print(vocabulary_inv_list[0])
# print(y.argmax(axis=1))
# print(y.argmax(axis=0))
# print(len(y.argmax(axis=1)))
# print(np.random.permutation(np.arange(len(y.argmax(axis=1)))))
# np.random.permutation(np.arange(len(y)))

[[0 1]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [1 0]]
[[    21  14544      2 ...      0      0      0]
 [   755     11    691 ...      0      0      0]
 [   134    100    118 ...      0      0      0]
 ...
 [113186      9      2 ...      0      0      0]
 [ 53419      9    133 ...      0      0      0]
 [     1   1078   1347 ...      0      0      0]]
2
<PAD/>


In [18]:
!pip install gensim
from gensim.models import word2vec
from os.path import join, exists, split
import os
import numpy as np



In [19]:
def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=300, min_word_count=1, context=5):
    
  # Set values for various parameters
  num_workers = 2  # Number of threads to run in parallel
  downsampling = 1e-3  # Downsample setting for frequent words

  # Initialize and train the model
  print('Training Word2Vec model...')
  sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
  embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                      size=num_features, min_count=min_word_count,
                                      window=context, sample=downsampling)

  # Saving the model for later use. You can load it later using Word2Vec.load()
  print('Saving Word2Vec model')
  embedding_model.save('/content/drive/My Drive/Colab Notebooks/word2vec_model')

  # add unknown words
  embedding_weights = {key: embedding_model[word] if word in embedding_model else
                            np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                       for key, word in vocabulary_inv.items()}
  return embedding_weights
vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
w = train_word2vec(x, vocabulary_inv)

Training Word2Vec model...
Saving Word2Vec model




In [28]:
print(w[0])
import gensim
model12 = gensim.models.word2vec.load('word2vec_model.model')

[-2.89678723e-01 -1.14690459e+00 -6.47706985e-02 -3.41780841e-01
  7.63881300e-03  9.39129770e-01 -3.87457579e-01  4.69469041e-01
 -2.20917165e-01 -5.58248758e-01 -6.97229922e-01  2.35928938e-01
  8.92669320e-01 -5.03520109e-02 -3.15722883e-01 -1.38160899e-01
  3.95923972e-01  3.29567194e-02  3.28095779e-02 -3.95151004e-02
 -8.18917930e-01 -1.66836947e-01 -2.46940389e-01  9.48666558e-02
 -3.78007721e-03  4.12195057e-01  2.16569766e-01 -3.55429977e-01
  4.31115180e-01  2.51411617e-01 -1.10207212e+00  8.13627660e-01
  1.45208165e-01  5.16079962e-01 -2.14808613e-01 -5.40146530e-01
 -6.13349140e-01  8.42555761e-02 -6.44604536e-03 -2.51312166e-01
  5.60393512e-01  2.18838364e-01  2.83518404e-01  4.24545497e-01
 -9.42764640e-01 -1.32081628e-01  9.12622213e-01 -8.90382081e-02
 -1.12681501e-01  1.88775450e-01  2.10719034e-02 -2.94975013e-01
  2.06038684e-01  1.09159485e-01  3.43065709e-01 -6.68582082e-01
  1.45096183e-01 -7.31210530e-01  1.41639216e-02 -1.47933528e-01
 -3.68248224e-01 -9.16432

AttributeError: ignored

In [0]:
import numpy as np
# import data_helpers
# from w2v import train_word2vec

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
# from keras.datasets import imdb
from keras.preprocessing import sequence
np.random.seed(0)

model_type = "CNN-non-static"

# Model Hyperparameters
embedding_dim = 300
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 5


y = y.argmax(axis=1)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x = x[shuffle_indices]
y = y[shuffle_indices]
train_len = int(len(x) * 0.9)
x_train = x[:train_len]
y_train = y[:train_len]
x_test = x[train_len:]
y_test = y[train_len:]

if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

input_shape = (sequence_length,)
model_input = Input(shape=input_shape)
z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length, name="embedding")(model_input)
z = Dropout(dropout_prob[0])(z)

conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,kernel_size=sz,padding="valid",activation="relu",strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
    
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

weights = np.array([v for v in w.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights([weights])
    
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Adjusting sequence length for actual size
x_train shape: (346980, 93)
x_test shape: (38554, 93)
Vocabulary Size: 113188
Initializing embedding layer with word2vec weights, shape (113188, 300)
Train on 346980 samples, validate on 38554 samples
Epoch 1/10
