In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive/; the dataset and saved-model paths
# used later in this notebook all live beneath this mount point.
# force_remount=True asks Colab to redo the mount even if one already exists.
drive.mount('/content/drive/', force_remount= True)

Mounted at /content/drive/


In [11]:
import numpy as np
import re
import itertools
from collections import Counter

def load_data_and_labels(
    positive_path="/content/drive/My Drive/Colab Notebooks/Datasets/positive.csv",
    negative_path="/content/drive/My Drive/Colab Notebooks/Datasets/negative.csv"):
  """Read the positive and negative example files and build one-hot labels.

  Each line of a file is one example. Lines are stripped of surrounding
  whitespace and split on single spaces into tokens.

  Args:
    positive_path: path to the file of positive examples, one per line.
    negative_path: path to the file of negative examples, one per line.

  Returns:
    A two-element list ``[x_text, y]`` where ``x_text`` is a list of token
    lists (positives first, then negatives) and ``y`` is an integer array of
    shape ``(n_examples, 2)``: ``[0, 1]`` for positive rows and ``[1, 0]``
    for negative rows.
  """
  def _read_examples(path):
    # The context manager closes the handle even if reading fails.
    with open(path) as handle:
      return [line.strip() for line in handle]

  positive_examples = _read_examples(positive_path)
  negative_examples = _read_examples(negative_path)
  x_text = [s.split(" ") for s in positive_examples + negative_examples]
  labels = [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples)
  # reshape(-1, 2) keeps the result two-dimensional even when one (or both)
  # of the classes has no examples.
  y = np.array(labels, dtype=int).reshape(-1, 2)
  print(y.shape, "load data")
  return [x_text, y]

def pad_sentences(sentences, padding_word="<PAD/>"):
  """Right-pad every sentence to the length of the longest one.

  Args:
    sentences: sequence of token lists.
    padding_word: token appended to sentences shorter than the longest one.

  Returns:
    A new list of new token lists, all of equal length. The input lists are
    not modified. An empty ``sentences`` yields an empty list.
  """
  # default=0 keeps an empty corpus from raising ValueError inside max().
  sequence_length = max((len(sentence) for sentence in sentences), default=0)
  return [sentence + [padding_word] * (sequence_length - len(sentence))
          for sentence in sentences]

def build_vocab(sentences):
    """Build word<->index lookups ordered by descending token frequency.

    Returns a two-element list ``[vocabulary, vocabulary_inv]``:
    ``vocabulary_inv`` is the list of distinct tokens, most frequent first,
    and ``vocabulary`` maps each token to its position in that list.
    """
    token_counts = Counter(itertools.chain.from_iterable(sentences))
    # Index -> word: position in the frequency ranking.
    vocabulary_inv = [token for token, _ in token_counts.most_common()]
    # Word -> index: the inverse of the list above.
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]

def build_input_data(sentences, labels, vocabulary):
    """Convert token lists and labels into NumPy arrays.

    Every token is replaced by ``vocabulary[token]``; a token missing from
    ``vocabulary`` raises ``KeyError``. Returns the two-element list
    ``[x, y]`` and prints the shape of ``y``.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[token] for token in sentence])
    x = np.array(index_rows)
    y = np.array(labels)
    print(y.shape, "build data")
    return [x, y]
  
  
def load_data():
    """
    Run the whole preprocessing pipeline: read the examples, pad them to a
    common length, build the vocabulary and encode the tokens as indices.
    Returns [x, y, vocabulary, vocabulary_inv].
    """
    raw_sentences, raw_labels = load_data_and_labels()
    padded = pad_sentences(raw_sentences)
    # The vocabulary is built from the padded text, so the padding token
    # receives an index of its own.
    word_to_index, index_to_word = build_vocab(padded)
    features, targets = build_input_data(padded, raw_labels, word_to_index)
    return [features, targets, word_to_index, index_to_word]
  
# Execute the preprocessing pipeline once; the two shape lines printed below
# come from load_data_and_labels() and build_input_data().
x, y, vocabulary, vocabulary_inv_list = load_data()

(283967, 2) load data
(283967, 2) build data


In [0]:
!pip install gensim
from gensim.models import word2vec
from os.path import join, exists, split
import os
import numpy as np

In [0]:
def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=100, min_word_count=30, context=10,
                   model_path='/content/drive/My Drive/Colab Notebooks/Trained models/word2vec_model_count30',
                   num_workers=2, downsampling=1e-3):
  """Train a Word2Vec model on the index-encoded corpus and save it to disk.

  Args:
    sentence_matrix: iterable of sentences, each an iterable of word indices.
    vocabulary_inv: index -> word lookup; only ``vocabulary_inv[index]`` is used.
    num_features: passed to Word2Vec as ``size`` (word-vector dimensionality).
    min_word_count: passed to Word2Vec as ``min_count``.
    context: passed to Word2Vec as ``window``.
    model_path: destination handed to ``embedding_model.save``.
    num_workers: passed to Word2Vec as ``workers`` (training threads).
    downsampling: passed to Word2Vec as ``sample``.

  Returns:
    None. The trained model is only persisted; load it back with
    ``Word2Vec.load(model_path)``.
  """
  print('Training Word2Vec model...')
  # Word2Vec trains on tokens, so map every index back to its word first.
  sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
  # NOTE: `size=` is the gensim 3.x spelling; gensim 4 calls it `vector_size=`.
  embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                      size=num_features, min_count=min_word_count,
                                      window=context, sample=downsampling)

  print('Saving Word2Vec model')
  embedding_model.save(model_path)
  # Fallback vectors for words missing from the model are not built here;
  # a later cell derives the embedding matrix from the reloaded model.
# Index -> word lookup as a dict, then train and save the Word2Vec model.
vocabulary_inv = dict(enumerate(vocabulary_inv_list))
train_word2vec(x, vocabulary_inv)

Training Word2Vec model...
Saving Word2Vec model


In [0]:
# import gensim
# Reload the Word2Vec model from the same path train_word2vec() saved it to.
# NOTE: the name `model` is reassigned to the Keras network in the CNN cell
# further down, so cells that need the Word2Vec model must run before that one.
model = word2vec.Word2Vec.load('/content/drive/My Drive/Colab Notebooks/Trained models/word2vec_model_count30')

In [0]:
# Spot-check of the word -> index mapping for a single token.
# len(model.wv.vocab)
# vocabulary_inv_list[1691]
vocabulary['asshole']

1691

In [12]:
# Labels are still one-hot at this point: one two-column row per example.
y.shape

(283967, 2)

In [0]:
# Collapse one-hot rows to class indices ([0, 1] -> 1, [1, 0] -> 0).
# Guarded so that re-running this cell, when y is already 1-D, does not raise.
if y.ndim > 1:
    y = y.argmax(axis=1)
# Fixed seed so the shuffle, and therefore the train/test split, is reproducible.
shuffle_indices = np.random.RandomState(42).permutation(len(y))
x = x[shuffle_indices]
y = y[shuffle_indices]


In [14]:
# After the argmax step the labels are a 1-D vector of class indices.
y.shape

(283967,)

In [7]:
# First 80% of the rows become the training split, the remainder the test split.
train_len = int(len(x) * 0.8)
x_train, x_test = x[:train_len], x[train_len:]
y_train, y_test = y[:train_len], y[train_len:]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

x_train shape: (227173, 76)
x_test shape: (56794, 76)
y_train shape: (227173,)
y_test shape: (56794,)


In [0]:
# NOTE: `model` must still be the Word2Vec model loaded from disk. The CNN cell
# further down rebinds `model` to a Keras network, after which this cell cannot
# be re-run without reloading the Word2Vec model first.
vocabulary_inv = dict(enumerate(vocabulary_inv_list))
# Query through `.wv`: indexing and membership tests on the Word2Vec object
# itself are deprecated in gensim 3.x and removed in gensim 4.
word_vectors = model.wv
# Words the Word2Vec model does not contain get a small random vector instead.
embedding_weights = {
    key: word_vectors[word] if word in word_vectors
    else np.random.uniform(-0.25, 0.25, model.vector_size)
    for key, word in vocabulary_inv.items()
}
# Keys were inserted in index order, so row i holds the vector for word index i.
weights = np.array(list(embedding_weights.values()))

In [0]:
# Row count of the embedding matrix: one row per entry of vocabulary_inv_list.
len(weights)

373311

In [0]:
import numpy as np
import pandas as pd
import os
from keras.preprocessing import text, sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

Using TensorFlow backend.


In [0]:
from keras.layers.merge import Concatenate

In [0]:
# Hyper-parameters of the single-kernel text CNN.
max_features = len(vocabulary_inv_list)
embedding_dims = 100
max_text_length = x_test.shape[1]
filters = 10
kernel_size = 3
hidden_dims = 50

print('Build model...')

# NOTE: this rebinds `model`, which until now referred to the loaded Word2Vec model.
model = Sequential([
    # Map vocabulary indices to embedding_dims-dimensional vectors, starting
    # from the precomputed `weights` matrix.
    Embedding(max_features,
              embedding_dims,
              input_length=max_text_length,
              weights=[weights]),
    Dropout(0.2),
    # One 1-D convolution over windows of kernel_size consecutive words,
    # followed by max pooling over the whole sequence.
    Conv1D(filters,
           kernel_size,
           padding='valid',
           activation='relu',
           strides=1),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    # Plain fully connected hidden layer.
    Dense(hidden_dims),
    Activation('relu'),
    # Single output unit squashed by a sigmoid for the binary label.
    Dense(1),
    Activation('sigmoid'),
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 76, 100)           37331100  
_________________________________________________________________
dropout_1 (Dropout)          (None, 76, 100)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 74, 10)            3010      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 10)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                550       
_________________________________________________________________
activation_1 (Activation)    (None, 50)                0     

In [0]:
# Inspect one encoded test row: the element type and the full index sequence.
# The trailing zeros in the output are presumably the <PAD/> index; confirm
# with vocabulary['<PAD/>'].
print(type(x_test[0][0]))
print(x_test[0])

<class 'numpy.int64'>
[243704      7     35      4      1    530      6      1    310      4
     55    263      3    341     11    311    646    563     47      4
    218   2697      4     38    737     25      1    240     71      5
   1822     47      4    218   2697      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0]


In [0]:
# Train the CNN, then persist it to Drive. The held-out test split doubles as
# the validation data here, so the reported validation metrics are not an
# independent test estimate.
model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=10,
    validation_data=(x_test, y_test),
    shuffle=True,
    verbose=1,
)
model.save('/content/drive/My Drive/Colab Notebooks/Trained models/CNN_onekernel.h5')

Train on 227173 samples, validate on 56794 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
