In [1]:
import glob
import os
import numpy as np
import pandas as pd
from math import isnan

from keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

from sklearn.model_selection import train_test_split

from spacy.lang.en import English

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def gather_data(filenames):
    """Read several Excel workbooks and stack them into one DataFrame.

    Parameters
    ----------
    filenames : iterable of str
        Paths handed to ``pd.read_excel`` one by one, in the given order.

    Returns
    -------
    pandas.DataFrame
        Rows of all workbooks in input order. Columns are the union of the
        workbooks' columns in order of first appearance (``sort=False``).
        The original per-file row index is kept, so index values can repeat.
        An empty DataFrame is returned when ``filenames`` is empty.
    """
    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies the accumulated frame for every additional file.
    frames = [pd.read_excel(filename) for filename in filenames]
    if not frames:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames, sort=False)

In [3]:
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions.
# A blank English pipeline already carries that default tokenizer as
# `nlp.tokenizer`; `nlp.Defaults.create_tokenizer` was removed in spaCy v3,
# so the attribute is used to keep this cell working on both v2 and v3.
nlp = English()
tokenizer = nlp.tokenizer

In [4]:
# Collect the encoded workbooks: every file from "Encoded WUR" plus the first
# file from "Encoded Roel". glob returns matches in arbitrary, OS-dependent
# order, so both listings are sorted to make "the first file" reproducible.
filenames1 = sorted(glob.glob("Encoded WUR/*"))
roel_files = sorted(glob.glob("Encoded Roel/*"))
if not roel_files:
    raise FileNotFoundError('No files found in "Encoded Roel/"')
filenames2 = roel_files[:1]
df = gather_data(filenames1+filenames2)
text = df.Text.to_list()
encodings = df.Sentiment.to_list()

# Collapse the sentiment codes to two classes: 0/1 -> 0.0 and 3/4 -> 1.0.
# Code 2, missing values (NaN) and any unexpected code are not in the map
# and are therefore dropped.
label_map = {0.: 0., 1.: 0., 3.: 1., 4.: 1.}

paragraphs = []
labels = []
for paragraph, label in zip(text, encodings):
    # Empty cells come back from pandas as NaN floats, not strings.
    if not isinstance(paragraph, str):
        continue
    binary_label = label_map.get(label)
    # Compare against None explicitly: the class label 0.0 is falsy.
    if binary_label is None:
        continue
    # Append to both lists together so they can never get out of step.
    paragraphs.append(paragraph)
    labels.append(binary_label)

In [5]:
# Keep only the Frame cells that actually hold a string, then report how many
# frames and how many labelled paragraphs are available.
frames = [cell for cell in df.Frame.to_list() if type(cell) is str]
print('Frames: {}'.format(len(frames)), 'Paragraphs: {}'.format(len(paragraphs)), sep='\n')

Frames: 1034
Paragraphs: 1800


In [6]:
# Vocabulary: every distinct lower-cased, purely alphabetic token in the corpus.
# The alphabetic test is applied to the lower-cased form, the same form that is
# used as the lookup key when the paragraphs are encoded.
# Sorting gives every word the same index on every run; the iteration order of
# a set of strings changes between interpreter sessions.
words = sorted({
    lowered
    for paragraph in paragraphs
    for lowered in (str(token).lower() for token in tokenizer(paragraph))
    if lowered.isalpha()
})

# Word <-> integer lookup tables. Numbering starts at 1, so index 0 is never
# assigned to a word.
embedding = {word: number for number, word in enumerate(words, start=1)}
reverse = {number: word for word, number in embedding.items()}

In [7]:
# Encode each paragraph as the list of vocabulary indices of its alphabetic
# tokens (lower-cased); non-alphabetic tokens are skipped.
embedded_paragraphs = [
    [
        embedding[lowered]
        for lowered in (str(token).lower() for token in tokenizer(paragraph))
        if lowered.isalpha()
    ]
    for paragraph in paragraphs
]

In [8]:
# label_documents = [3,4,1,1,2,2,2,0,3,2,4,2,1,0,4,0,4,4,1,2,3,0,4,2,4,4,0,0,3,2,4,3,1,2,4,0,1,0,4,4,3,4]
# label_documents = [2,2,0,0,1,1,1,0,2,1,2,1,0,0,2,0,2,2,0,1,2,0,2,1,2,2,0,0,2,1,2,2,0,1,2,0,0,0,2,2,2,2]

In [9]:
# Hold out a fraction of the paragraphs for the final evaluation.
# A fixed seed makes the split (and therefore the reported test accuracy)
# repeatable; stratifying keeps the class ratio equal in both parts.
# NOTE(review): Keras weight initialisation is not seeded here, so training
# itself can still vary from run to run.
test_size = 0.1
random_seed = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    embedded_paragraphs,
    labels,
    test_size=test_size,
    random_state=random_seed,
    stratify=labels,
)

In [10]:
# Token counts per encoded paragraph, used to choose a padding length.
paragraph_lengths = [len(encoded) for encoded in embedded_paragraphs]
mean_length = round(sum(paragraph_lengths)/len(paragraph_lengths))
longest = round(max(paragraph_lengths))
print('Average sentence length: {}'.format(mean_length))
print('Maximum sentence length: {}'.format(longest))

Average sentence length: 46
Maximum sentence length: 450


In [11]:
# Bring every encoded paragraph to a common length of `max_words` indices so
# the splits can be fed to the network as rectangular arrays.
# NOTE(review): which end is padded/truncated is left at the Keras default — confirm it suits the data.
max_words = 100
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (X_train, X_test)
)

In [12]:
# Network: embedding -> three stacked LSTM layers -> single sigmoid unit for
# the binary sentiment label.
units = 200
embedding_size=32
# +1 because word indices start at 1; index 0 is not assigned to any word.
vocabulary_size=len(words)+1

model=Sequential()
# NOTE(review): zeros in the padded inputs are not masked (mask_zero is left
# at its default), so padding steps are processed like real tokens — confirm
# whether that is intended.
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
# The first two LSTMs return the full sequence so the next LSTM has one input
# per time step; the last one returns only its final output for the classifier.
model.add(LSTM(units, return_sequences=True))
model.add(LSTM(units, return_sequences=True))
model.add(LSTM(units))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           272608    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 200)          186400    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 200)          320800    
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 1,100,809
Trainable params: 1,100,809
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# Binary cross-entropy goes with the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
batch_size = 64
num_epochs = 5
# Number of leading training samples set aside for validation. It used to be
# tied implicitly to batch_size; the value is unchanged but can now be tuned
# independently.
validation_size = batch_size

# The labels come out of the split as a plain Python list; convert once so
# Keras receives arrays for both inputs and targets.
y_train_array = np.asarray(Y_train)

X_valid, y_valid = X_train[:validation_size], y_train_array[:validation_size]
X_train2, y_train2 = X_train[validation_size:], y_train_array[validation_size:]

# Keep the History object so the per-epoch metrics can be inspected later
# instead of leaving an object repr as the cell output.
history = model.fit(
    X_train2,
    y_train2,
    validation_data=(X_valid, y_valid),
    batch_size=batch_size,
    epochs=num_epochs,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1556 samples, validate on 64 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7ff39ab86c18>

In [15]:
# Score the model on the held-out split; the second entry of `scores` is the
# accuracy metric requested at compile time.
scores = model.evaluate(X_test, Y_test, verbose=0)
test_accuracy_pct = round(scores[1]*100,1)
print('Test accuracy: {}%'.format(test_accuracy_pct))

Test accuracy: 74.4%


In [16]:
# model.save('simplified_WUR+ME_acc:{}%_layers:3_units:{}_bs:{}_e:{}_max_w:{}_test_size:{}.h5'.format(round(scores[1],2), units, batch_size, num_epochs, max_words, test_size))

In [17]:
# Show each test prediction (rounded to two decimals) next to its true label,
# followed by the words of the paragraph decoded from the padded index
# sequence (index 0 is skipped).
Y_predict = np.round(model.predict(X_test), 2)
separator = '- - - - - - - - - - - - - - - - - - - - - - - \n'
for predicted, expected, encoded in zip(Y_predict, Y_test, X_test):
    print(predicted, expected)
    print([reverse[index] for index in encoded if index != 0])
    print(separator)

[1.] 1.0
['wind', 'energy', 'is', 'a', 'renewable', 'source', 'of', 'energy', 'wind', 'is', 'naturally', 'occurring', 'and', 'there', 'is', 'no', 'way', 'we', 'can', 'empty', 'the', 'energy', 'resources', 'wind', 'energy', 'actually', 'originates', 'from', 'the', 'nuclear', 'fusion', 'processes', 'that', 'take', 'place', 'on', 'the', 'sun']
- - - - - - - - - - - - - - - - - - - - - - - 

[0.] 0.0
['the', 'most', 'credible', 'evidence', 'presented', 'identifies', 'the', 'institute', 'in', 'wuhan', 'as', 'the', 'source', 'the', 'market', 'is', 'situated', 'only', 'miles', 'from', 'the', 'lab', 'where', 'a', 'hybrid', 'corona', 'strain', 'resembling', 'made', 'its', 'beginnings', 'and', 'had', 'been', 'isolated', 'mistakes', 'happen']
- - - - - - - - - - - - - - - - - - - - - - - 

[0.48] 1.0
['sociopolitical', 'relevance', 'of', 'gmos']
- - - - - - - - - - - - - - - - - - - - - - - 

[0.] 0.0
['many', 'noncredentialled', 'people', 'claim', 'to', 'be', 'experts', 'in', 'detoxification', '