In [188]:
from numpy import array, asarray, zeros
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
import pandas as pd

### Defining the corpus

In [189]:
docs = pd.read_csv('all-data.csv', encoding='ISO-8859-1', header=None)

In [190]:
docs.head()

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


### Define labels

In [191]:
label_dict = {'neutral':0, 'positive':1, 'negative':-1}

sentiment_list = [row[0] for index,row in docs.iterrows()]
temp_list = []
for iterable in sentiment_list:
    for sentiment,label in label_dict.items():
        if iterable == sentiment:
            temp_list.append(label)
            
sentiment_labels= array(temp_list)
sentiment_labels

array([ 0,  0, -1, ..., -1, -1, -1])

In [192]:
docs = docs[1].tolist()

In [193]:
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

In [194]:
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

[[94, 5, 3498, 1, 11, 16, 250, 336, 5, 655, 124, 88, 5, 150, 2796, 29, 10, 424, 1, 11, 10, 747], [840, 336, 5, 876, 3, 1841, 39, 193, 2, 250, 1102, 100, 292, 63, 574, 1842, 3, 126, 5, 3499, 96, 748, 3, 1186, 575, 4, 749, 1, 929, 19], [1, 293, 656, 142, 11, 337, 16, 1284, 285, 3500, 2, 167, 13, 15, 972, 279, 5011, 5, 147, 1502, 1, 11, 2352, 1, 5012, 2, 15, 267, 973, 1, 877, 5013, 251], [17, 1, 48, 88, 101, 1, 11, 135, 115, 15, 268, 5, 1503, 1, 154, 115, 3, 478, 4, 135, 657, 1, 211, 2, 687, 526, 4, 1641, 115, 1, 88, 1187], [94, 5, 1, 11, 9, 2353, 349, 7, 1, 260, 41, 1188, 303, 1103, 6, 218, 272, 30, 27, 197, 3, 1, 373, 2, 151, 275, 17, 39, 42, 31, 508, 2, 84, 151, 2, 30, 27], [878, 2, 554, 9, 197, 554, 10, 5014, 5015, 15, 197, 349, 18, 2354, 1504, 14, 3501, 116, 1843, 5016, 1505, 1189, 1844, 2797], [7, 1, 155, 51, 2, 53, 634, 9, 30, 27, 2070, 5, 5017, 13, 5018, 7, 1, 164, 50, 6, 32, 147, 180, 26, 2355, 5, 6, 5019, 799, 841, 31, 13, 6, 799, 841, 75, 2, 2798], [3, 1, 157, 51, 2, 53, 30, 27

In [195]:
max_length = 250
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[  94    5 3498 ...    0    0    0]
 [ 840  336    5 ...    0    0    0]
 [   1  293  656 ...    0    0    0]
 ...
 [  42   31  242 ...    0    0    0]
 [  30   27    2 ...    0    0    0]
 [  27    3   35 ...    0    0    0]]


In [196]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.50d.txt')
# f = open('glove.840B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    print(word)
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

the
,
.
of
to
and
in
a
"
's
for
-
that
on
is
was
said
with
he
as
it
by
at
(
)
from
his
''
``
an
be
has
are
have
but
were
not
this
who
they
had
i
which
will
their
:
or
its
one
after
new
been
also
we
would
two
more
'
first
about
up
when
year
there
all
--
out
she
other
people
n't
her
percent
than
over
into
last
some
government
time
$
you
years
if
no
world
can
three
do
;
president
only
state
million
could
us
most
_
against
u.s.
so
them
what
him
united
during
before
may
since
many
while
where
states
because
now
city
made
like
between
did
just
national
day
country
under
such
second
then
company
group
any
through
china
four
being
down
war
back
off
south
american
minister
police
well
including
team
international
week
officials
still
both
even
high
part
told
those
end
former
these
make
billion
work
our
home
school
party
house
old
later
get
another
tuesday
news
long
five
called
1
wednesday
military
way
used
much
next
monday
thursday
friday
game
here
?
should
take
very
my
north
security
season
yo

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2273: character maps to <undefined>

In [197]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [198]:
e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=250, trainable=False)

In [199]:
model = Sequential()
model.add(e)
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(4845, activation='relu'))
model.add(Dense(3500, activation='relu'))
model.add(Dense(2000, activation='relu'))
model.add(Dense(1500, activation='relu'))
model.add(Dense(700, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='softmax'))

In [200]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [201]:
print(model.summary())

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 250, 50)           506150    
_________________________________________________________________
flatten_14 (Flatten)         (None, 12500)             0         
_________________________________________________________________
dense_52 (Dense)             (None, 250)               3125250   
_________________________________________________________________
dense_53 (Dense)             (None, 4845)              1216095   
_________________________________________________________________
dense_54 (Dense)             (None, 3500)              16961000  
_________________________________________________________________
dense_55 (Dense)             (None, 2000)              7002000   
_________________________________________________________________
dense_56 (Dense)             (None, 1500)            

In [None]:
model.fit(padded_docs, sentiment_labels, epochs=70)

Train on 4846 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70

In [None]:
loss, accuracy = model.evaluate(padded_docs, sentiment_labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))