In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
import yaml
import math
from sklearn.model_selection import train_test_split
import warnings
from statistics import mean

In [2]:
# Load the experiment configuration. yaml.safe_load is used instead of yaml.load:
# yaml.load without an explicit Loader can construct arbitrary Python objects and
# is rejected by recent PyYAML releases.
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Resolve the positive/negative data file paths of the selected dataset from the config.
dataset_name = "mrpolarity"
dataset_cfg = cfg["datasets"][dataset_name]
datasets = data_helpers.get_datasets_mrpolarity(dataset_cfg["positive_data_file"]["path"],
                                                dataset_cfg["negative_data_file"]["path"])
# presumably x_text holds the raw documents and y their labels -- TODO confirm in data_helpers
x_text, y = data_helpers.load_data_labels(datasets)
y_target = datasets['target']

  


In [3]:
# The longest document, counted in single-space-separated tokens, fixes the sequence length.
max_document_length = max(len(document.split(" ")) for document in x_text)

# Fit the vocabulary on the corpus and collect the transformed documents into one array.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array([row for row in vocab_processor.fit_transform(x_text)])
print(x)

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
[[   1    2    3 ...    0    0    0]
 [   8  173   19 ...    0    0    0]
 [ 267   26  268 ...    0    0    0]
 ...
 [  10  956   12 ...    0    0    0]
 [   1   87  162 ...    0    0    0]
 [   8  929 2110 ...    0    0    0]]


In [4]:
# Split train/validation/test sets: 20% of the data is held out for testing, then 20%
# of the remainder becomes the validation (dev) set. Both splits are stratified on y.
# TODO: This is very crude, should use cross-validation
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, stratify = y_train)
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
# Report all three partitions; the old "Train/Dev" line actually printed the test-set size.
print("Train/Dev/Test split: {:d}/{:d}/{:d}".format(len(y_train), len(y_val), len(y_test)))
print('input shape : ', x_train.shape)

Vocabulary Size: 9683
Train/Dev split: 1024/320
input shape :  (1024, 834)


In [5]:
def load_embedding_vectors_word2vec(vocabulary, filename, binary):
    """Build an embedding matrix for ``vocabulary`` from a word2vec file.

    Every row starts as a uniform random draw in [-0.25, 0.25). Rows whose word
    occurs in the file are then overwritten with the stored vector.

    Args:
        vocabulary: object supporting ``len()`` and ``get(word)``. ``get`` is
            expected to return the row index of the word; an index of 0 or
            ``None`` is treated as "word not in the vocabulary" and skipped.
        filename: path of the word2vec file. Its first line must contain
            ``<vocab_size> <vector_size>``.
        binary: truthy to parse the binary layout (word, one space, raw float32
            values), falsy to parse the text layout (word and values on one
            line, separated by single spaces).

    Returns:
        numpy array of shape ``(len(vocabulary), vector_size)``.

    Raises:
        EOFError: the file ends before ``vocab_size`` entries were read.
        ValueError: a text line does not contain ``vector_size`` values.
    """
    encoding = 'utf-8'
    with open(filename, "rb") as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        # initial matrix with random uniform
        embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
        if binary:
            binary_len = np.dtype('float32').itemsize * vector_size
            for _ in range(vocab_size):
                # Read the word byte by byte up to the separating space. A newline left
                # over from the previous entry is skipped rather than kept in the word.
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':
                        word.append(ch)
                word = str(b''.join(word), encoding=encoding, errors='strict')
                idx = vocabulary.get(word)
                # None (dict-style miss) must be skipped too: indexing with None would
                # broadcast the vector over every row of the matrix.
                if idx is not None and idx != 0:
                    raw_vector = f.read(binary_len)
                    if len(raw_vector) != binary_len:
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    # frombuffer replaces the deprecated binary mode of np.fromstring.
                    embedding_vectors[idx] = np.frombuffer(raw_vector, dtype='float32')
                else:
                    # Word not needed: jump over its vector without decoding it.
                    f.seek(binary_len, 1)
        else:
            for line_no in range(vocab_size):
                line = f.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = str(line.rstrip(), encoding=encoding, errors='strict').split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word = parts[0]
                idx = vocabulary.get(word)
                if idx is not None and idx != 0:
                    # The previous map('float32', ...) call raised TypeError on every line.
                    embedding_vectors[idx] = np.asarray(parts[1:], dtype='float32')
        return embedding_vectors

def load_embedding_vectors_glove(vocabulary, filename, vector_size):
    """Build an embedding matrix for ``vocabulary`` from a GloVe text file.

    Every row starts as a uniform random draw in [-0.25, 0.25). Rows whose word
    occurs in the file are then overwritten with the stored vector.

    Args:
        vocabulary: object supporting ``len()`` and ``get(word)``. ``get`` is
            expected to return the row index of the word; an index of 0 or
            ``None`` is treated as "word not in the vocabulary" and skipped.
        filename: path of the GloVe file, one ``word v1 v2 ...`` entry per line.
        vector_size: number of values per vector; sets the matrix width.

    Returns:
        numpy array of shape ``(len(vocabulary), vector_size)``.
    """
    # initial matrix with random uniform
    embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
    # Open with an explicit encoding (the published GloVe files are UTF-8) so parsing does
    # not depend on the platform default, and use `with` so the handle is closed on errors.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            idx = vocabulary.get(values[0])
            # None (dict-style miss) must be skipped too: indexing with None would
            # broadcast the vector over every row of the matrix.
            if idx is not None and idx != 0:
                embedding_vectors[idx] = np.asarray(values[1:], dtype="float32")
    return embedding_vectors

In [7]:
embedding_file_path = '../GoogleNews-vectors-negative300.bin'
vocabulary = vocab_processor.vocabulary_
EMBEDDING_DIM = 300
# The third parameter of load_embedding_vectors_word2vec is the `binary` flag, not the
# embedding size (the size is read from the file header). Passing EMBEDDING_DIM there only
# worked because 300 is truthy; state the intent explicitly.
embedding_matrix_word2vec = load_embedding_vectors_word2vec(vocabulary, embedding_file_path, binary=True)
# Fail early if the file's vector size does not match what the Embedding layer is built with.
assert embedding_matrix_word2vec.shape == (len(vocabulary), EMBEDDING_DIM)
print(embedding_matrix_word2vec)

[[-0.06207478 -0.16540766 -0.01926712 ...  0.16078679  0.04741327
  -0.16492658]
 [ 0.08007812  0.10498047  0.04980469 ...  0.00366211  0.04760742
  -0.06884766]
 [ 0.25390625  0.03857422 -0.03039551 ...  0.18554688 -0.15625
   0.03881836]
 ...
 [-0.22070312 -0.02807617  0.07861328 ...  0.17773438 -0.27734375
  -0.0378418 ]
 [-0.33789062  0.20507812 -0.09765625 ... -0.22363281  0.44921875
   0.12402344]
 [ 0.07470703  0.05102539  0.05712891 ... -0.04614258 -0.21289062
  -0.02307129]]


In [None]:
# Build the GloVe-initialised embedding matrix for the fitted vocabulary.
EMBEDDING_DIM = 300
vocabulary = vocab_processor.vocabulary_
embedding_file_path = 'glove/glove.6B.300d.txt'
embedding_matrix_glove = load_embedding_vectors_glove(
    vocabulary=vocabulary,
    filename=embedding_file_path,
    vector_size=EMBEDDING_DIM,
)

In [8]:
from keras.models import Sequential
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [9]:
def generator(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` pairs forever, cycling over the data in order.

    Each pair is a consecutive slice of ``batch_size`` rows (the last slice of a
    pass may be shorter), converted to float32 numpy arrays. After the final
    slice of a pass the position wraps back to the first row.
    """
    total_rows = X_data.shape[0]
    batches_per_pass = total_rows / batch_size
    batch_index = 0

    while True:
        start = batch_size * batch_index
        stop = batch_size * (batch_index + 1)
        X_batch = np.array(X_data[start:stop]).astype('float32')
        y_batch = np.array(y_data[start:stop]).astype('float32')
        batch_index += 1
        yield X_batch, y_batch

        # Wrap around so the next epoch starts again from the first rows.
        if batch_index >= batches_per_pass:
            batch_index = 0

In [None]:
# Randomly initialised embedding matrix: one row per vocabulary entry,
# weights drawn uniformly from [-0.25, 0.25).
EMBEDDING_DIM = 300
vocabulary = vocab_processor.vocabulary_
W = np.random.uniform(low=-0.25, high=0.25, size=(len(vocabulary), EMBEDDING_DIM))

In [None]:
# Randomly initialised embedding matrix: one row per vocabulary entry,
# weights drawn uniformly from [-1, 1).
EMBEDDING_DIM = 300
vocabulary = vocab_processor.vocabulary_
W = np.random.uniform(low=-1, high=1, size=(len(vocabulary), EMBEDDING_DIM))

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import matplotlib.patches as mpatches

# Flatten the word2vec matrix in one call. The previous row-by-row loop used `x` as its
# loop variable, which overwrote the notebook-level input matrix `x` that the
# train/test split cells read.
values_w2v = np.asarray(embedding_matrix_word2vec).ravel()

# Reference samples for the three candidate random-initialisation ranges. They get their
# own names so an embedding matrix stored in `W` is not replaced by a flat sample.
n_weights = len(vocabulary) * EMBEDDING_DIM
weights_unit = np.random.uniform(-1, 1, n_weights)
weights_half = np.random.uniform(-0.5, 0.5, n_weights)
weights_quarter = np.random.uniform(-0.25, 0.25, n_weights)

plt.rcParams['figure.figsize'] = [9, 5]

# Colours are passed explicitly so the hand-built legend below matches the curves
# regardless of the active colour cycle.
sns.distplot(values_w2v, color='blue')
sns.distplot(weights_unit, color='orange');
sns.distplot(weights_half, color='green');
sns.distplot(weights_quarter, color='red');
o_patch = mpatches.Patch(color='red', label='weights: +0.25, -0.25')
r_patch = mpatches.Patch(color='green', label='weights: +0.50, -0.50')
p_patch = mpatches.Patch(color='orange', label='weights: +1, -1')
b_patch = mpatches.Patch(color='blue', label='Weights assigned by word2vec')
plt.legend(handles=[o_patch, r_patch, p_patch, b_patch]);

In [15]:
from keras import regularizers
# For every test-set fraction: train the CNN several times on fresh stratified splits and
# record the mean and variance of the test accuracy across those runs.
all_accuracies = []
all_variance = []
#test_sizes = [0.95, 0.90, 0.80, 0.70, 0.60, 0.50, 0.40, 0.30, 0.20, 0.10]
test_sizes = [0.95]

# Settings that never change between runs, hoisted out of the loops.
runs_per_test_size = 5
# Previously declared as 100 but never used: training always ran with a hard-coded
# epochs=20. The value now matches what was actually executed and is passed to fit_generator.
epochs_size = 20
batch_size = 32

for test_size in test_sizes:
    print('Test size : ',test_size)
    accuracies = []
    for run_index in range(runs_per_test_size):
        # Fresh split per run: hold out `test_size` for testing, then 20% of the
        # remaining training data for validation. Both splits are stratified.
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, stratify = y)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, stratify = y_train)

        # Early stopping on validation loss with no patience.
        callback = [EarlyStopping(monitor='val_loss',
                                      min_delta=0,
                                      patience=0,
                                      verbose=0, mode='auto')]
        # Round up so the final, possibly shorter, batch is part of every epoch.
        steps_for_each_epoch = int( np.ceil(x_train.shape[0] / batch_size) )

        # A new model per run: embedding initialised from word2vec -> 1D convolution ->
        # max pooling -> small L2-regularised dense layer with dropout -> 2-way softmax.
        classifier = Sequential()
        classifier.add(Embedding(len(vocabulary), EMBEDDING_DIM, weights=[embedding_matrix_word2vec], input_length=x_train.shape[1])) #trainable=False)
        classifier.add(Conv1D(32, 3, activation = 'relu'))
        classifier.add(MaxPooling1D(pool_size = 3))
        classifier.add(Flatten())
        classifier.add(Dense(units = 16, activation = 'sigmoid', kernel_regularizer = regularizers.l2(0.0001)))
        classifier.add(Dropout(0.50))
        classifier.add(Dense(units = 2, activation = 'softmax'))
        classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

        # Arguments dropped from the original call:
        #  - validation_steps: only relevant when validation_data is a generator (arrays here);
        #  - shuffle: only used with keras.utils.Sequence inputs, not plain generators;
        #  - use_multiprocessing=True: the generator only slices in-memory arrays, so feeding
        #    it from a separate process buys nothing; the Keras default (False) is used.
        history = classifier.fit_generator(generator(x_train, y_train, batch_size),
                                           callbacks = callback,
                                           epochs = epochs_size,
                                           steps_per_epoch = steps_for_each_epoch,
                                           validation_data = (x_val, y_val),
                                           verbose = 0)
        loss, accuracy = classifier.evaluate(x_test, y_test)
        accuracies.append(accuracy)
    all_accuracies.append(np.mean(accuracies))
    all_variance.append(np.var(accuracies))
print('Accuracies : ', all_accuracies)
print('Variance : ', all_variance)

Test size :  0.95


KeyboardInterrupt: 

In [14]:
import pickle
# Round-trip the trained classifier through disk and re-evaluate it on the test split.
# NOTE(review): Keras ships its own persistence API (model.save / keras.models.load_model);
# pickling is kept here only to preserve the existing file format -- TODO confirm that the
# unpickled model really carries the trained weights.
filename = 'save_model.pkl'
# Context managers make sure both file handles are closed, even if (un)pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Only unpickle files written by this notebook: pickle.load can execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
a, b = loaded_model.evaluate(x_test, y_test)
print(a, b)

0.7948317879124691 0.5


In [None]:
Accuracies :  [0.5830263157894737, 0.6704166666666667, 0.67359375, 0.6435714285714286, 0.7758333333333333, 0.83925, 0.8649999999999999, 0.9266666666666667, 0.92875, 0.9237499999999998]
Variance :  [0.004138054016620498, 0.00067295524691358, 0.0026590332031249967, 0.005238966836734694, 0.010931163194444446, 0.008273499999999996, 0.010642187500000004, 0.00013611111111111113, 8.359374999999987e-05, 0.00039687499999999933]
    

In [None]:
with regularization

Accuracies :  [0.5621052631578948, 0.686875, 0.8456249999999998, 0.8074999999999999, 0.8838541666666666, 0.850625, 0.9390625, 0.9393750000000001, 0.931875, 0.936875]
Variance :  [0.0050677977839335185, 0.011039009452160494, 0.008696557617187501, 0.015737595663265307, 0.008059733072916664, 0.011838203124999998, 5.3222656250000156e-05, 0.00021132812500000006, 0.00041953124999999977, 0.0006128906249999999]

In [34]:
weights : -1, +1 
[0.7404935956001282, 0.7478448748588562, 0.6508960127830505, 0.6543195247650146, 0.765687495470047, 0.6632035672664642, 0.7020750939846039, 0.7374310493469238, 0.7079029679298401, 0.7221793234348297, 0.7645464539527893, 0.7894353568553925, 0.7979920208454132, 0.8461835384368896, 0.8875622153282166, 0.9119525253772736, 0.9236847162246704, 0.9429888725280762, 0.9228183925151825, 0.9201494455337524]

weights : -0.50, +0.50
[0.7529853284358978, 0.7550478279590607, 0.6417356133460999, 0.6444882154464722, 0.6766594052314758, 0.6598062813282013, 0.8815954625606537, 0.7431097328662872, 0.7604756951332092, 0.713350772857666, 0.7318406999111176, 0.7911670804023743, 0.829595148563385, 0.8419949412345886, 0.8437898457050323, 0.8505183756351471, 0.8709539771080017, 0.8794790804386139, 0.8905494213104248, 0.8980833292007446]

weights : word2vec
[0.7263475656509399, 0.7861708700656891, 0.6272746473550797, 0.627626970410347, 0.6100166887044907, 0.5543903410434723, 0.4856374114751816, 0.3910641223192215, 0.32671790570020676, 0.3506401628255844, 0.29860566556453705, 0.2849557548761368, 0.3232623040676117, 0.2825904116034508, 0.2753669172525406, 0.2933805137872696, 0.28422221541404724, 0.2705959901213646, 0.2731320708990097, 0.2936372309923172]

SyntaxError: invalid syntax (<ipython-input-34-4d3b18f54cb4>, line 1)

"For partition 5:95, 10 runs. without word2vec"
[0.6256578947368421, 0.5, 0.5, 0.6532894736842105, 0.5, 0.5006578947368421, 0.5, 0.625, 0.6664473684210527, 0.6638157894736842]
mean accuracy : 0.5734868421052631
variance : 0.005545018178670362

with word2vec 
[0.6131578947368421, 0.6776315789473685, 0.6664473684210527, 0.8, 0.525, 0.6585526315789474, 0.6690789473684211, 0.6552631578947369, 0.6690789473684211, 0.6605263157894737]
0.6594736842105263
0.004055851800554017

with glove

[0.5894736842105263, 0.6177631578947368, 0.5572368421052631, 0.6730263157894737, 0.5, 0.5, 0.65, 0.7289473684210527, 0.6381578947368421, 0.5046052631578948]
0.595921052631579
0.0057105090027700835


"For partition 10:90, 10 runs.without word2vec"
[0.6173611111111111, 0.6840277777777778, 0.6159722222222223, 0.6611111111111111, 0.5104166666666666, 0.6694444444444444, 0.6416666666666667, 0.6590277777777778, 0.6465277777777778, 0.6340277777777777]
mean accuracy : 0.6339583333333334
variance : 0.0021210889274691364

with word2vec
[0.6604166666666667, 0.7902777777777777, 0.6631944444444444, 0.6708333333333333, 0.66875, 0.8597222222222223, 0.6569444444444444, 0.8680555555555556, 0.5895833333333333, 0.5]
0.6927777777777777
0.012017669753086425

with glove 
[0.6555555555555556, 0.6680555555555555, 0.6923611111111111, 0.6611111111111111, 0.64375, 0.70625, 0.7673611111111112, 0.6326388888888889, 0.6583333333333333, 0.6659722222222222]
0.6751388888888888
0.0013557870370370386

"For partition 15:85, 10 runs. without word2vec"
mean accuracy : 0.6258088235294117
variance : 0.003372561634948095

with word2vec
0.7704411764705882
0.010178395328719721

"For partition 20:80, 10 runs. without word2vec"
[0.6703125, 0.66640625, 0.64453125, 0.665625, 0.66796875, 0.66171875, 0.73203125, 0.55703125, 0.6546875, 0.66875]
mean accuracy : 0.65890625
variance : 0.00034194946289062506

with word2vec
[0.91640625, 0.81953125, 0.678125, 0.63515625, 0.90078125, 0.6625, 0.87421875, 0.890625, 0.8828125, 0.896875]
0.815703125
0.011252008056640627

with glove
[0.7109375, 0.68515625, 0.67109375, 0.60859375, 0.63671875, 0.74609375, 0.884375, 0.6515625, 0.62578125, 0.66484375]
0.6885156250000001
0.005742559814453126

"For partition 30:70, 10 runs. without word2vec"
[0.8098214285714286, 0.7901785714285714, 0.6785714285714286, 0.6839285714285714, 0.6839285714285714, 0.64375, 0.6741071428571429, 0.6678571428571428, 0.6616071428571428, 0.8294642857142858]
mean accuracy : 0.7123214285714287
variance : 0.004275318877551021

with word2vec
[0.8803571428571428, 0.7607142857142857, 0.8357142857142857, 0.8910714285714286, 0.8723214285714286, 0.8901785714285714, 0.55625, 0.8160714285714286, 0.9178571428571428, 0.8178571428571428]
0.8238392857142858
0.009913113839285712


with glove
[0.8839285714285714, 0.6455357142857143, 0.8660714285714286, 0.8276785714285714, 0.8178571428571428, 0.6669642857142857, 0.8767857142857143, 0.6133928571428572, 0.7169642857142857, 0.8571428571428571]
0.7772321428571428
0.009974689094387753

"For partition 40:60, 10 runs. without word2vec"
[0.63125, 0.6973214285714285, 0.6357142857142857, 0.6107142857142858, 0.6848214285714286, 0.7348214285714286, 0.66875, 0.6491071428571429, 0.6589285714285714, 0.6598214285714286, 0.7020833333333333, 0.6166666666666667, 0.8208333333333333, 0.6760416666666667, 0.61875, 0.7333333333333333, 0.6739583333333333, 0.7677083333333333, 0.7864583333333334, 0.71875]
mean accuracy : 0.6872916666666666
variance : 0.003242219387755101

with word2vec
0.9004166666666666
0.0021658420138888903

with glove
[0.9041666666666667, 0.9010416666666666, 0.8416666666666667, 0.7927083333333333, 0.7583333333333333, 0.8989583333333333, 0.8385416666666666, 0.9083333333333333, 0.8604166666666667, 0.9041666666666667]
0.8608333333333332
0.0025093750000000003

"For partition 50:50, 10 runs. without word2vec"
[0.74125, 0.79125, 0.74625, 0.625, 0.67375, 0.83125, 0.67125, 0.71875, 0.7575, 0.67625]
mean accuracy : 0.72325
variance : 0.003535062500000001

with word2vec
[0.95625, 0.89375, 0.945, 0.88375, 0.94, 0.91375, 0.88875, 0.90375, 0.875, 0.8825]
0.90825
0.0007672499999999994

with glove
[0.8125, 0.88375, 0.8525, 0.8975, 0.85375, 0.79875, 0.8275, 0.885, 0.8775, 0.82]
0.8508749999999999
0.0010828281250000005

"For partition 60:40, 10 runs. without word2vec"
[0.63125, 0.6973214285714285, 0.6357142857142857, 0.6107142857142858, 0.6848214285714286, 0.7348214285714286, 0.66875, 0.6491071428571429, 0.6589285714285714, 0.6598214285714286, 0.7020833333333333, 0.6166666666666667, 0.8208333333333333, 0.6760416666666667, 0.61875, 0.7333333333333333, 0.6739583333333333, 0.7677083333333333, 0.7864583333333334, 0.71875, 0.75875, 0.81625, 0.75875, 0.74, 0.8125, 0.795, 0.67, 0.84625, 0.73, 0.7875, 0.8234375, 0.8359375, 0.8234375, 0.8875, 0.80625, 0.778125, 0.85625, 0.7796875, 0.83125, 0.8234375]
mean accuracy : 0.7834375
variance : 0.0035870117187500007

with word2vec
[0.8927083333333333, 0.8864583333333333, 0.9208333333333333, 0.9447916666666667, 0.890625, 0.91875, 0.934375, 0.9229166666666667, 0.9322916666666666, 0.89375]
0.9137500000000001
0.00040143229166666637

with glove
[0.909375, 0.9296875, 0.890625, 0.7703125, 0.884375, 0.925, 0.875, 0.8515625, 0.86875, 0.8765625]
0.878125
0.0018457031250000016

"For partition 70:30, 10 runs. without word2vec"
[0.784375, 0.653125, 0.8140625, 0.840625, 0.803125, 0.803125, 0.7796875, 0.775, 0.8734375, 0.7078125]
mean accuracy : 0.8045833333333332
variance : 0.003388715277777779

with word2vec
[0.9475, 0.91, 0.93875, 0.8725, 0.9525, 0.9475, 0.94625, 0.95, 0.9175, 0.93]
0.93125
0.0005706249999999992

with glove
[0.9395833333333333, 0.8416666666666667, 0.91875, 0.9, 0.9020833333333333, 0.8958333333333334, 0.9020833333333333, 0.9083333333333333, 0.9125, 0.9041666666666667]
0.9025000000000001
0.0005527777777777773

"For partition 75:25, 10 runs. without word2vec"
mean accuracy : 0.827
variance : 0.0013559999999999993

"For partition 80:20, 10 runs. without word2vec"
[0.859375, 0.8375, 0.871875, 0.884375, 0.878125, 0.903125, 0.875, 0.63125, 0.834375, 0.859375]
mean accuracy : 0.8434375
variance : 0.005389550781250001

with word2vec
[0.91875, 0.95625, 0.9, 0.9, 0.903125, 0.94375, 0.934375, 0.9625, 0.928125, 0.94375]
0.9290624999999999
0.0004786132812500003

with glove
[0.9125, 0.9375, 0.8875, 0.940625, 0.91875, 0.95625, 0.90625, 0.88125, 0.9375, 0.884375]
0.91625
0.0006285156250000008

"For partition 90:10, 10 runs. without word2vec"
[0.925, 0.8625, 0.86875, 0.84375, 0.8375, 0.9, 0.86875, 0.85, 0.9125, 0.875]
mean accuracy : 0.8743749999999999
variance : 0.0007769531250000002

with word2vec
[0.9375, 0.95625, 0.95, 0.91875, 0.94375, 0.925, 0.94375, 0.90625, 0.95, 0.93125]
0.93625
0.00022499999999999986

with glove
[0.91875, 0.925, 0.9375, 0.8875, 0.95, 0.9375, 0.95625, 0.91875, 0.95, 0.89375]
0.9275
0.00049375

"For partition 90:10, 10 runs. with word2vec"
mean accuracy : 0.9243750000000001
variance : 0.0008863281250000002

In [31]:
# Summary of the latest batch of runs: raw accuracies, their mean, their variance (one per line).
print(accuracies, np.mean(accuracies), np.var(accuracies), sep="\n")

[0.6616071428571428, 0.7160714285714286, 0.5758928571428571, 0.6964285714285714, 0.6553571428571429, 0.6410714285714286, 0.6366071428571428, 0.5883928571428572, 0.6375, 0.6848214285714286]
0.6493749999999999
0.0017595742984693884
