In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [2]:
# --- Experiment configuration ------------------------------------------------
# Dataset location and the index at which the data is cut into train / test.
data_source = 'data_sarcasm/sarcasm_dataset_1.json'
training_size = 20000  # rows [0, training_size) train; the remainder validates

# Tokenisation and sequence-shaping settings.
vocab_size = 15000
oov_token = '<OOV>'
max_length = 50
pad_type = 'post'
trunc_type = 'post'

# Model and training settings.
embedding_dim = 10
num_epochs = 10

In [3]:
# Load the dataset; `lines=True` makes pandas parse the file as one JSON
# object per line.
data      = pd.read_json(data_source, lines=True)
# Headlines stay a plain list of strings for the tokenizer.
sentences = data.headline.to_list()
# Labels are kept as a NumPy array instead of a Python list: they are passed
# to `model.fit` as targets further down, and TensorFlow 2.x refuses a bare
# list of ints there. Slicing an array behaves the same as slicing a list.
labels    = data.is_sarcastic.to_numpy()

In [4]:
# Positional hold-out split (no shuffling): everything before index
# `training_size` is used for training, everything from it onwards for testing.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [5]:
# Fit the vocabulary on the training headlines only, so the test split
# does not influence which words get an index.
tokenizer = Tokenizer(
    oov_token=oov_token,   # placeholder token substituted for unknown words
    num_words=vocab_size,  # cap on how many words are kept when encoding
)
tokenizer.fit_on_texts(training_sentences)

# Word -> integer index mapping learned from the training headlines.
word_index = tokenizer.word_index

In [6]:
# Encode each training headline as a list of word indices, then force every
# sequence to exactly `max_length` entries (padding / truncating as configured).
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=pad_type,
    truncating=trunc_type,
)

In [7]:
# Encode the held-out headlines with the tokenizer fitted on the training
# split, using the same length / padding / truncation settings.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=pad_type,
    truncating=trunc_type,
)

In [8]:
# NOTE: the interactive `tf.keras.layers.Embedding??` source lookup that lived in
# this cell was a development aid only (IPython-only syntax, no effect on kernel
# state). For the meaning of `input_dim` / `output_dim` / `input_length`, consult
# the Embedding layer API documentation.

In [9]:
# Small averaged-embedding classifier, assembled layer by layer:
#   Embedding -> GlobalAveragePooling1D -> Dense(24, relu) -> Dense(1, sigmoid)
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    )
)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Single sigmoid unit: the output is a score in (0, 1) for the binary label.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

W0322 23:02:19.284435 140454871394112 deprecation.py:506] From /home/somesh/anaconda3/envs/tf_gpu/lib/python3.7/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0322 23:02:19.286146 140454871394112 deprecation.py:506] From /home/somesh/anaconda3/envs/tf_gpu/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0322 23:02:19.350674 140454871394112 deprecation.py:323] From /home/somesh/anaconda3/envs/tf_gpu/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl

In [10]:
# Train for `num_epochs` epochs, evaluating on the held-out split after each
# one; `verbose=2` logs a single summary line per epoch.
# In the recorded run below, val_loss is lowest around epoch 3 and climbs
# afterwards while training loss keeps falling.
history = model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=num_epochs,
    verbose=2,
)

Train on 20000 samples, validate on 6709 samples
Epoch 1/10
20000/20000 - 2s - loss: 0.6282 - acc: 0.6403 - val_loss: 0.4860 - val_acc: 0.8244
Epoch 2/10
20000/20000 - 1s - loss: 0.3684 - acc: 0.8577 - val_loss: 0.3609 - val_acc: 0.8520
Epoch 3/10
20000/20000 - 2s - loss: 0.2660 - acc: 0.8987 - val_loss: 0.3413 - val_acc: 0.8563
Epoch 4/10
20000/20000 - 1s - loss: 0.2096 - acc: 0.9229 - val_loss: 0.3432 - val_acc: 0.8539
Epoch 5/10
20000/20000 - 2s - loss: 0.1701 - acc: 0.9380 - val_loss: 0.3662 - val_acc: 0.8506
Epoch 6/10
20000/20000 - 1s - loss: 0.1394 - acc: 0.9513 - val_loss: 0.3754 - val_acc: 0.8551
Epoch 7/10
20000/20000 - 1s - loss: 0.1156 - acc: 0.9614 - val_loss: 0.4165 - val_acc: 0.8471
Epoch 8/10
20000/20000 - 2s - loss: 0.0963 - acc: 0.9682 - val_loss: 0.4308 - val_acc: 0.8496
Epoch 9/10
20000/20000 - 1s - loss: 0.0803 - acc: 0.9750 - val_loss: 0.4596 - val_acc: 0.8469
Epoch 10/10
20000/20000 - 1s - loss: 0.0676 - acc: 0.9799 - val_loss: 0.5006 - val_acc: 0.8448


In [11]:
# Spot-check the trained model on two hand-written headlines. They go through
# the same tokenizer and padding settings as the training data.
sentence = [
    'granny starting to fear spiders in the garden might be real',
    'the weather today is bright and sunny',
]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=pad_type,
    truncating=trunc_type,
)
# One sigmoid score per input headline.
model.predict(padded)

array([[7.9515463e-01],
       [5.6674390e-04]], dtype=float32)