In [6]:
from numpy import array, asarray, zeros
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
import pandas as pd

### Defining the corpus

In [7]:
# Load the financial-news sentiment CSV. The file is read without a header row,
# so the columns are addressed by the integer labels 0 and 1 in later cells.
docs = pd.read_csv(
    '../input/sentiment-analysis-for-financial-news/all-data.csv',
    header=None,
    encoding='ISO-8859-1',
)

In [8]:
# Preview the first five rows of the raw frame.
docs.head(n=5)

### Define labels

In [9]:
# Integer code assigned to each sentiment string.
label_dict = {'neutral':0, 'positive':1, 'negative':-1}

# Column 0 of the frame holds the sentiment string of every row.
sentiment_list = docs[0].tolist()

# Vectorised lookup of every sentiment in label_dict. The previous nested loop
# silently skipped any value that was not a key of label_dict, which would
# leave fewer labels than documents and pair later labels with the wrong text.
# Fail loudly instead so the labels always stay aligned with the rows.
mapped_labels = docs[0].map(label_dict)
unknown_mask = mapped_labels.isna()
if unknown_mask.any():
    raise ValueError(
        'Unexpected sentiment value(s): %r'
        % sorted(set(docs.loc[unknown_mask, 0].astype(str)))
    )

# One integer label per row, in row order.
sentiment_labels = mapped_labels.to_numpy(dtype='int64')
sentiment_labels

In [10]:
# Keep only column 1 (the text) as a plain Python list. From this point on
# `docs` is a list, no longer a DataFrame, so the cells above cannot be re-run
# without reloading the CSV first.
docs = docs.loc[:, 1].tolist()

In [11]:
# Fit a tokenizer on the corpus to obtain the word -> integer index mapping.
t = Tokenizer()
t.fit_on_texts(docs)

# One more than the number of distinct words, so that the largest word index
# is still a valid row of the embedding matrices built in later cells.
vocab_size = 1 + len(t.word_index)

In [12]:
# Replace every document by the list of integer indices of its words.
encoded_docs = t.texts_to_sequences(docs)

# Show a small sample only: printing the complete list writes one sequence per
# document into the cell output and buries the rest of the notebook.
print(encoded_docs[:5])

In [13]:
# Every document is brought to the same fixed length; zeros are appended at
# the end of shorter sequences ('post' padding).
max_length = 250
padded_docs = pad_sequences(
    sequences=encoded_docs,
    maxlen=max_length,
    padding='post',
)
print(padded_docs)

In [14]:
# Load the whole 50-dimensional GloVe embedding into memory as a
# {word: float32 vector} dictionary.
glove_dim = 50
embeddings_index = dict()
# The context manager closes the file even if parsing a line fails, and the
# explicit encoding avoids depending on the platform default.
with open('../input/glove6b50dtxt/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        # Split from the right so exactly `glove_dim` numeric fields are taken
        # and everything before them is the token, even if the token itself
        # contains whitespace.
        values = line.rstrip().rsplit(' ', glove_dim)
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [15]:
# Build the embedding weight matrix: row k holds the pre-trained vector of the
# word whose tokenizer index is k. Words without a pre-trained vector keep an
# all-zero row.
embedding_matrix = np.zeros((vocab_size, 50))
for token, row in t.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [16]:
# Frozen embedding layer initialised with the pre-trained vectors. The vector
# width and the sequence length are taken from `embedding_matrix` and
# `max_length` instead of being repeated as literals, so they cannot drift
# apart from the padding length and the matrix that was actually built.
e = Embedding(
    vocab_size,
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)

In [17]:
model = Sequential()
model.add(e)
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(4845, activation='relu'))
model.add(Dense(3500, activation='relu'))
model.add(Dense(2000, activation='relu'))
model.add(Dense(1500, activation='relu'))
model.add(Dense(700, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='softmax'))

In [18]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [19]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Train for 80 epochs on the complete padded corpus; no validation data is
# held out here.
model.fit(x=padded_docs, y=sentiment_labels, epochs=80)

In [None]:
# Score the GloVe 6B-50d model. Note that this evaluates on the same data the
# model was fitted on, so the figure is a training accuracy.
loss6b50d, accuracy6b50d = model.evaluate(padded_docs, sentiment_labels, verbose=0)
# Report the value just computed; the bare name `accuracy` is never defined
# and raised a NameError.
print('Accuracy: %f' % (accuracy6b50d*100))

In [None]:
# Repeat the experiment with the 100-dimensional GloVe 6B vectors.
embedding_dim = 100

# Load the pre-trained vectors into a {word: float32 vector} dictionary. The
# context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default.
embeddings_index = dict()
with open('../input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Split from the right so exactly `embedding_dim` numeric fields are
        # taken and the remainder is the token, even if it contains whitespace.
        values = line.rstrip().rsplit(' ', embedding_dim)
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create a weight matrix for the words in the training docs: row i holds the
# vector of the word with tokenizer index i; unknown words keep a zero row.
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer; the sequence length follows the padding length.
e = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)

# Three sentiment classes need one softmax output per class and integer class
# ids as targets. A softmax over a single unit always outputs 1.0, so the
# previous Dense(1, softmax) head with binary_crossentropy could not learn.
# Re-encode the targets as contiguous class ids 0..n_classes-1 (idempotent);
# `class_values[k]` is the original label value of class id k.
class_values, sentiment_labels = np.unique(sentiment_labels, return_inverse=True)
num_classes = len(class_values)

model = Sequential()
model.add(e)
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(4845, activation='relu'))
model.add(Dense(3500, activation='relu'))
model.add(Dense(2000, activation='relu'))
model.add(Dense(1500, activation='relu'))
model.add(Dense(700, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; no print() wrapper.
model.summary()

model.fit(padded_docs, sentiment_labels, epochs=80)

In [None]:
# Score the GloVe 6B-100d model. Note that this evaluates on the same data the
# model was fitted on, so the figure is a training accuracy.
loss6b100d, accuracy6b100d = model.evaluate(padded_docs, sentiment_labels, verbose=0)
# Report the value just computed; the bare name `accuracy` is never defined
# and raised a NameError.
print('Accuracy: %f' % (accuracy6b100d*100))

In [None]:
# Repeat the experiment with the 300-dimensional GloVe 6B vectors.
embedding_dim = 300

# Load the pre-trained vectors into a {word: float32 vector} dictionary. The
# context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default.
embeddings_index = dict()
with open('../input/glove6b300dtxt/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Split from the right so exactly `embedding_dim` numeric fields are
        # taken and the remainder is the token, even if it contains whitespace.
        values = line.rstrip().rsplit(' ', embedding_dim)
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create a weight matrix for the words in the training docs: row i holds the
# vector of the word with tokenizer index i; unknown words keep a zero row.
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer; the sequence length follows the padding length.
e = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)

# Three sentiment classes need one softmax output per class and integer class
# ids as targets. A softmax over a single unit always outputs 1.0, so the
# previous Dense(1, softmax) head with binary_crossentropy could not learn.
# Re-encode the targets as contiguous class ids 0..n_classes-1 (idempotent);
# `class_values[k]` is the original label value of class id k.
class_values, sentiment_labels = np.unique(sentiment_labels, return_inverse=True)
num_classes = len(class_values)

model = Sequential()
model.add(e)
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(4845, activation='relu'))
model.add(Dense(3500, activation='relu'))
model.add(Dense(2000, activation='relu'))
model.add(Dense(1500, activation='relu'))
model.add(Dense(700, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; no print() wrapper.
model.summary()

model.fit(padded_docs, sentiment_labels, epochs=80)

In [None]:
# Score the GloVe 6B-300d model. Note that this evaluates on the same data the
# model was fitted on, so the figure is a training accuracy.
loss6b300d, accuracy6b300d = model.evaluate(padded_docs, sentiment_labels, verbose=0)
# Report the value just computed; the bare name `accuracy` is never defined
# and raised a NameError.
print('Accuracy: %f' % (accuracy6b300d*100))

In [None]:
# Repeat the experiment with the 300-dimensional GloVe 840B vectors.
embedding_dim = 300

# Load the pre-trained vectors into a {word: float32 vector} dictionary. The
# context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default.
embeddings_index = dict()
with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Split from the right so exactly `embedding_dim` numeric fields are
        # taken and the remainder is the token. A plain split() breaks on
        # tokens that contain spaces: part of the token ends up among the
        # numeric fields and the float conversion raises ValueError.
        values = line.rstrip().rsplit(' ', embedding_dim)
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create a weight matrix for the words in the training docs: row i holds the
# vector of the word with tokenizer index i; unknown words keep a zero row.
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer; the sequence length follows the padding length.
e = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)

# Three sentiment classes need one softmax output per class and integer class
# ids as targets. A softmax over a single unit always outputs 1.0, so the
# previous Dense(1, softmax) head with binary_crossentropy could not learn.
# Re-encode the targets as contiguous class ids 0..n_classes-1 (idempotent);
# `class_values[k]` is the original label value of class id k.
class_values, sentiment_labels = np.unique(sentiment_labels, return_inverse=True)
num_classes = len(class_values)

model = Sequential()
model.add(e)
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(4845, activation='relu'))
model.add(Dense(3500, activation='relu'))
model.add(Dense(2000, activation='relu'))
model.add(Dense(1500, activation='relu'))
model.add(Dense(700, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; no print() wrapper.
model.summary()

model.fit(padded_docs, sentiment_labels, epochs=80)

In [None]:
# Score the GloVe 840B-300d model. Note that this evaluates on the same data
# the model was fitted on, so the figure is a training accuracy.
loss840b300d, accuracy840b300d = model.evaluate(padded_docs, sentiment_labels, verbose=0)
# Report the value just computed; the bare name `accuracy` is never defined
# and raised a NameError.
print('Accuracy: %f' % (accuracy840b300d*100))

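We trained the same network with four different pre-trained GloVe embeddings (6B-50d, 6B-100d, 6B-300d and 840B-300d). The accuracies below were measured on the same data the models were trained on, and are listed in that order: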

In [None]:
# Accuracies in the order 6B-50d, 6B-100d, 6B-300d, 840B-300d.
accuracies = (accuracy6b50d, accuracy6b100d, accuracy6b300d, accuracy840b300d)
print(*accuracies)