In [86]:
import glob
import os
import numpy as np
import pandas as pd
from math import isnan

from keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

from sklearn.model_selection import train_test_split

from spacy.lang.en import English

In [2]:
def gather_data(filenames):
    """Read every Excel workbook in ``filenames`` and stack them into one DataFrame.

    Parameters
    ----------
    filenames : iterable of str or path-like
        Paths handed, one at a time, to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The rows of all workbooks in the order given. Columns are the union
        of the workbooks' columns in order of first appearance
        (``sort=False``). Each workbook keeps its own row index, so index
        values may repeat in the result.

    Raises
    ------
    ValueError
        If ``filenames`` is empty (for example when a glob pattern matched
        nothing).
    """
    filenames = list(filenames)
    if not filenames:
        raise ValueError("gather_data() needs at least one file; got an empty list")
    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_excel(filename) for filename in filenames]
    return pd.concat(frames, sort=False)

In [3]:
# Blank English pipeline; its built-in tokenizer already carries the default
# English punctuation rules and exceptions.
# `nlp.tokenizer` is used instead of `nlp.Defaults.create_tokenizer(nlp)`
# because that factory no longer exists in spaCy v3, while `nlp.tokenizer`
# is available in both v2 and v3.
nlp = English()
tokenizer = nlp.tokenizer

In [87]:
# Load every workbook found under Encoded/. glob returns paths in arbitrary
# order, so sort them to make the row order the same on every run.
# filenames = ['paragraphs_encoded_1.xlsx', 'paragraphs_encoded_2.xlsx']
filenames = sorted(glob.glob("Encoded/*"))
df = gather_data(filenames)
text = df.Text.to_list()
encodings = df.Sentiment.to_list()

# Collapse the Sentiment codes into a binary target: 0 and 1 become 0.0,
# 3 and 4 become 1.0. Code 2 is left out on purpose (presumably the neutral
# class -- TODO confirm against the annotation guide).
LABEL_TO_BINARY = {0.: 0., 1.: 0., 3.: 1., 4.: 1.}

paragraphs = []
labels = []
for paragraph, label in zip(text, encodings):
    # Rows without text are read back as NaN rather than str; skip them.
    if not isinstance(paragraph, str):
        continue
    # Missing (NaN), excluded (2) and unexpected codes all map to None here.
    binary_label = LABEL_TO_BINARY.get(label)
    if binary_label is None:
        continue
    # Append both together so paragraphs[k] always pairs with labels[k].
    paragraphs.append(paragraph)
    labels.append(binary_label)

In [88]:
# Report how many rows carry a textual Frame value next to the number of
# paragraphs that were kept for training.
frames = list(filter(lambda value: type(value) is str, df.Frame.to_list()))
for caption, items in (('Frames: {}', frames), ('Paragraphs: {}', paragraphs)):
    print(caption.format(len(items)))

Frames: 654
Paragraphs: 1011


In [89]:
# Vocabulary: every distinct lowercased, purely alphabetic token.
# The tokens are lowercased *before* the isalpha() test so this filter is
# exactly the one applied when the paragraphs are converted to id sequences.
lowercase_tokens = {
    str(token).lower()
    for paragraph in paragraphs
    for token in tokenizer(paragraph)
}
# Sort so that each word receives the same id on every run; iterating a set
# of strings yields a different order in each interpreter session.
words = sorted(word for word in lowercase_tokens if word.isalpha())

# Word ids start at 1; id 0 is never assigned to a word (the display loop
# further down treats 0 as padding).
embedding = {word: number for number, word in enumerate(words, start=1)}
reverse = {number: word for word, number in embedding.items()}

In [90]:
# Turn each paragraph into its list of word ids, keeping only tokens whose
# lowercase form is purely alphabetic.
embedded_paragraphs = []
for paragraph in paragraphs:
    lowered = (str(token).lower() for token in tokenizer(paragraph))
    embedded_paragraphs.append([embedding[word] for word in lowered if word.isalpha()])

In [91]:
# label_documents = [3,4,1,1,2,2,2,0,3,2,4,2,1,0,4,0,4,4,1,2,3,0,4,2,4,4,0,0,3,2,4,3,1,2,4,0,1,0,4,4,3,4]
# label_documents = [2,2,0,0,1,1,1,0,2,1,2,1,0,0,2,0,2,2,0,1,2,0,2,1,2,2,0,0,2,1,2,2,0,1,2,0,0,0,2,2,2,2]

In [107]:
# Hold out 25% of the paragraphs for testing. The seed is fixed so the same
# paragraphs land in the same split on every run; without it the reported
# test accuracy cannot be reproduced.
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    embedded_paragraphs, labels, test_size=0.25, random_state=SPLIT_SEED
)

In [108]:
# Token counts per embedded paragraph, used to pick a padding length.
paragraph_lengths = [len(token_ids) for token_ids in embedded_paragraphs]
mean_length = round(sum(paragraph_lengths) / len(paragraph_lengths))
longest_length = round(max(paragraph_lengths))
print('Average sentence length: {}'.format(mean_length))
print('Maximum sentence length: {}'.format(longest_length))

Average sentence length: 48
Maximum sentence length: 382


In [109]:
# Bring every id sequence in both splits to exactly max_words entries:
# shorter ones are padded, longer ones are cut.
max_words = 100
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_words) for split in (X_train, X_test)
)

In [110]:
# Binary classifier: embedding -> three stacked LSTMs -> one sigmoid unit.
units = 200
embedding_size=32
# Word ids run from 1 to len(words), and id 0 is also a valid input value,
# so the embedding table needs len(words) + 1 rows.
vocabulary_size=len(words)+1
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
# The first two LSTMs emit the full sequence so the next LSTM has a
# per-timestep input; the last one returns only its final state.
model.add(LSTM(units, return_sequences=True))
model.add(LSTM(units, return_sequences=True))
model.add(LSTM(units))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 32)           205600    
_________________________________________________________________
lstm_10 (LSTM)               (None, 100, 200)          186400    
_________________________________________________________________
lstm_11 (LSTM)               (None, 100, 200)          320800    
_________________________________________________________________
lstm_12 (LSTM)               (None, 200)               320800    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 201       
Total params: 1,033,801
Trainable params: 1,033,801
Non-trainable params: 0
_________________________________________________________________
None


In [111]:
# Configure training: Adam optimiser, binary cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [112]:
batch_size = 32
num_epochs = 10
# Size of the validation hold-out. It used to be implied by batch_size, so
# changing the batch size silently changed the validation set as well.
validation_size = 32

# train_test_split returns the labels as a plain Python list; convert them to
# an array so Keras receives array targets alongside the array inputs.
Y_train_array = np.asarray(Y_train)

# The first validation_size training rows are held out for validation and the
# remainder is used for fitting.
X_valid, y_valid = X_train[:validation_size], Y_train_array[:validation_size]
X_train2, y_train2 = X_train[validation_size:], Y_train_array[validation_size:]

model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Train on 726 samples, validate on 32 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fc816a170f0>

In [114]:
# Score the held-out split. Y_test is a plain Python list, so it is converted
# to an array for Keras; the name Y_test itself is left untouched.
# scores holds the loss first and then the compiled metric (accuracy).
scores = model.evaluate(X_test, np.asarray(Y_test), verbose=0)
print('Test accuracy:', scores[1])

Test accuracy: 0.739130437374115


In [117]:
# model.save('simplified_acc:{}%_layers:3_units_bs:{}_e:{}_max_words:{}.h5'.format(round(scores[1],2), units, batch_size, num_epochs, max_words))

In [115]:
# Model outputs for the test split, rounded to two decimals for display.
raw_predictions = model.predict(X_test)
Y_predict = np.round(raw_predictions, decimals=2)

In [116]:
# For each test paragraph, show the prediction next to the true label,
# followed by the paragraph's words decoded from their ids (0 entries are
# padding and are skipped).
separator = '- - - - - - - - - - - - - - - - - - - - - - - \n'
for row, prediction in enumerate(Y_predict):
    print(prediction, Y_test[row])
    decoded_words = [reverse[token_id] for token_id in X_test[row] if token_id != 0]
    print(decoded_words)
    print(separator)

[1.] 0.0
['although', 'touted', 'as', 'a', 'climate', 'friendly', 'alternative', 'unconventional', 'gas', 'is', 'potentially', 'more', 'greenhouse', 'gas', 'intensive', 'than', 'coal']
- - - - - - - - - - - - - - - - - - - - - - - 

[1.] 1.0
['as', 'clearly', 'articulated', 'in', 'our', 'national', 'integrated', 'resource', 'plan', 'nirp', 'shale', 'gas', 'together', 'with', 'other', 'energy', 'sources', 'form', 'an', 'integral', 'part', 'of', 'this', 'vision', 'with', 'its', 'ultimate', 'aim', 'being', 'to', 'support', 'our', 'societal', 'transformation', 'and', 'economic', 'growth']
- - - - - - - - - - - - - - - - - - - - - - - 

[0.01] 1.0
['geothermal', 'energy', 'has', 'the', 'potential', 'to', 'play', 'a', 'significant', 'role', 'in', 'moving', 'the', 'united', 'states', 'and', 'other', 'regions', 'of', 'the', 'world', 'toward', 'a', 'cleaner', 'more', 'sustainable', 'energy', 'system', 'it', 'is', 'one', 'of', 'the', 'few', 'renewable', 'energy', 'technologies', 'that', 'can', '

[1.] 1.0
['improved', 'packaging', 'whereby', 'nanomaterials', 'are', 'mixed', 'into', 'the', 'polymer', 'matrix', 'to', 'improve', 'the', 'gas', 'barrier', 'properties', 'as', 'well', 'as', 'temperature', 'and', 'humidity', 'resistance', 'of', 'the', 'packaging']
- - - - - - - - - - - - - - - - - - - - - - - 

[0.] 0.0
['a', 'mother', 'wants', 'federal', 'agencies', 'to', 'protect', 'the', 'public', 'like', 'many', 'parents', 'and', 'caregivers', 'i', 'had', 'to', 'feed', 'my', 'child', 'formula', 'i', 'am', 'outraged', 'that', 'these', 'poorly', 'studied', 'virtually', 'unregulated', 'and', 'unlabeled', 'nanomaterials', 'are', 'present', 'in', 'infant', 'formula', 'when', 'there', 'are', 'suitable', 'non', 'nano', 'ingredients', 'that', 'have', 'been', 'used', 'for', 'decades', 'and', 'do', 'carry', 'the', 'same', 'risks', 'the', 'fda', 'must', 'act', 'immediately', 'to', 'put', 'a', 'moratorium', 'on', 'the', 'use', 'of', 'nanomaterials', 'in', 'formula', 'and', 'other', 'food', 'un