In [1]:
import pandas as pd
import numpy as np
import json
import itertools
from keras.preprocessing import text, sequence
from keras import utils
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [2]:

import numpy
from keras.datasets import imdb
from matplotlib import pyplot
# load the dataset
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
num_words=10000)




Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [3]:
print("train_data ", train_data.shape)
print("train_labels ", train_labels.shape)
print("_"*100)
print("test_data ", test_data.shape)
print("test_labels ", test_labels.shape)
print("_"*100)
print("Maximum value of a word index ")
print(max([max(sequence) for sequence in train_data]))
print("Maximum length num words of review in train ")
print(max([len(sequence) for sequence in train_data]))

train_data  (25000,)
train_labels  (25000,)
____________________________________________________________________________________________________
test_data  (25000,)
test_labels  (25000,)
____________________________________________________________________________________________________
Maximum value of a word index 
9999
Maximum length num words of review in train 
2494


In [4]:
# See an actual review in words
# Reverse from integers to words using the DICTIONARY (given by keras...need to do nothing to create it)

word_index = imdb.get_word_index()

reverse_word_index = dict(
[(value, key) for (key, value) in word_index.items()])

decoded_review = ' '.join(
[reverse_word_index.get(i - 3, '?') for i in train_data[123]])

print(decoded_review)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
? beautiful and touching movie rich colors great settings good acting and one of the most charming movies i have seen in a while i never saw such an interesting setting when i was in china my wife liked it so much she asked me to ? on and rate it so other would enjoy too


In [0]:
# VECTORIZE as one cannot feed integers into a NN 
# Encoding the integer sequences into a binary matrix - one hot encoder basically
# From integers representing words, at various lengths - to a normalized one hot encoded tensor (matrix) of 10k columns

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


In [6]:
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

print("x_train ", x_train.shape)
print("x_test ", x_test.shape)


x_train  (25000, 10000)
x_test  (25000, 10000)


In [7]:
# VECTORIZE the labels too - NO INTEGERS only floats into a tensor...(rare exceptions)

y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
print("y_train ", y_train.shape)
print("y_test ", y_test.shape)





y_train  (25000,)
y_test  (25000,)


In [8]:
batch_size = 32
epochs = 2
# Build the model

model = Sequential()
model.add(Dense(512, input_shape=(10000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 22500 samples, validate on 2500 samples
Epoch 1/2
Epoch 2/2


In [15]:
from google.colab import files
uploaded=files.upload()


Saving CompleteSentiments.json to CompleteSentiments (1).json


In [0]:
df=pd.read_json("CompleteSentiments (1).json")

In [56]:
df.head(5)

Unnamed: 0,sentiment,text
0,positive,"Thank you, Ellen. We have a strong 2018, with ..."
1,positive,Stock-based compensation totaled $2.3 billion....
10,negative,"For the full-year 2018, Other Bets revenues we..."
100,neutral,Great. Thank you. I just wanted to follow up o...
1000,positive,In 2018 there were several Restock Kroger succ...


In [78]:
df['sentiment'].value_counts()

neutral     832
positive    654
negative    157
Name: sentiment, dtype: int64

In [0]:
newdf= df[df.sentiment != 'neutral']


In [80]:
newdf['sentiment'].value_counts()

positive    654
negative    157
Name: sentiment, dtype: int64

In [81]:
test_size = int(len(newdf) * 1)
print ("Test size: %d" % test_size)


Test size: 812


In [0]:
test_posts = newdf['text'][:test_size]
test_tags = newdf['sentiment'][:test_size]



In [0]:
max_words = 10000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [0]:
tokenize.fit_on_texts(test_posts) # only fit on train
x_test = tokenize.texts_to_matrix(test_posts)


In [89]:
y_test=newdf['sentiment']
y_test

0       positive
1       positive
10      negative
1000    positive
1001    positive
1003    positive
1004    positive
1005    positive
1008    negative
101     positive
1010    positive
1011    negative
1013    positive
1014    positive
1015    negative
1016    positive
1017    positive
1018    positive
1019    negative
102     positive
1020    positive
1028    positive
1029    positive
1032    positive
1033    positive
1036    positive
1038    positive
1039    positive
104     positive
1040    positive
          ...   
943     positive
944     negative
945     positive
951     negative
952     positive
954     negative
959     positive
961     positive
964     positive
965     positive
969     positive
971     positive
972     positive
973     positive
975     positive
976     positive
977     positive
978     positive
979     positive
981     positive
983     positive
984     positive
985     negative
986     negative
989     negative
991     positive
992     positive
993     negati

In [90]:
y_test=pandas.factorize( y_test)[0]

y_test

array([ 0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  0,
        0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
        1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,
        1,  0,  0,  1,  1,  1,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0,  1,
        0,  0,  1,  0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  1,  1,  1,
        0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  1,  0,  1,
        0,  0,  0,  0,  0,  0,  1,  1,  1,  0,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  1,  0,  1,  1,  0,  0,  0,
        1,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
        0,  1,  0,  1,  1,  0,  1,  0,  0,  0,  1,  1,  0,  0,  0,  1,  1,
        1,  1,  0,  1,  1,  1,  1,  1,  1,  0,  1,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  1,  0,
        0,  0,  1,  0,  1

In [0]:
import pandas

In [91]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_test shape:', x_test.shape)

print('y_test shape:', y_test.shape)


x_test shape: (812, 10000)
y_test shape: (812,)


In [92]:

score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 12.8795623967213
Test accuracy: 0.1933497535477718


In [93]:
y_pred=model.predict(x_test)
y_pred

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

In [105]:
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix


array([[  0,   0,   1],
       [  0,   0, 654],
       [  0,   0, 157]])