Importing all the necessary dependencies

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

Importing the dataset

In [2]:
# Read the labelled text dataset from the CSV file in the working directory.
rawDataset = pd.read_csv(filepath_or_buffer='data.csv')
# Preview the first five rows to check the columns.
rawDataset.head(n=5)

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


Fitting the tokenizer and building the word vocabulary

In [3]:
# Word-level tokenizer with a custom out-of-vocabulary placeholder token,
# configured at construction time.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    oov_token='<shobdobhandarErBahire>'
)

# Build the word -> integer id mapping from every text in the dataset.
tokenizer.fit_on_texts(rawDataset['text'])

vocab = tokenizer.word_index
# Number of entries in the mapping (the OOV placeholder is one of them).
vocabSize = len(vocab)

(vocabSize, vocab)

(77353,
 {'<shobdobhandarErBahire>': 1,
  'the': 2,
  'a': 3,
  'to': 4,
  'you': 5,
  'in': 6,
  'of': 7,
  'i': 8,
  'and': 9,
  'what': 10,
  'is': 11,
  'for': 12,
  'do': 13,
  'on': 14,
  'it': 15,
  'my': 16,
  'with': 17,
  'why': 18,
  'how': 19,
  'your': 20,
  'are': 21,
  'did': 22,
  'was': 23,
  'that': 24,
  'at': 25,
  'have': 26,
  'about': 27,
  'like': 28,
  'he': 29,
  'call': 30,
  'when': 31,
  'an': 32,
  'they': 33,
  'be': 34,
  'his': 35,
  'because': 36,
  'this': 37,
  'from': 38,
  'me': 39,
  'get': 40,
  'trump': 41,
  'just': 42,
  'out': 43,
  'new': 44,
  'who': 45,
  'if': 46,
  'not': 47,
  'so': 48,
  'up': 49,
  'one': 50,
  'can': 51,
  'but': 52,
  "it's": 53,
  'say': 54,
  'people': 55,
  "don't": 56,
  'does': 57,
  "what's": 58,
  'all': 59,
  'photos': 60,
  "i'm": 61,
  'her': 62,
  'as': 63,
  'no': 64,
  'after': 65,
  'by': 66,
  'know': 67,
  'make': 68,
  'into': 69,
  'will': 70,
  'has': 71,
  'their': 72,
  'day': 73,
  'man': 74,
 

Testing the tokenized words

In [4]:
# Sanity check: encode the first five texts and show the integer ids
# alongside the raw strings they came from.
jokes = rawDataset['text'][:5]
jokeVector = tokenizer.texts_to_sequences(texts=jokes)
(jokeVector, jokes)

([[1062, 1878, 795, 43, 3480, 3884, 30456, 61, 47, 30457],
  [193, 40271, 656, 10807, 30458, 17, 1186, 3690],
  [10, 13, 5, 30, 3, 3405, 263, 148, 4215, 211],
  [116, 490, 2, 546, 371, 1785, 48, 1358],
  [40272, 205, 545, 370, 5958, 38, 454, 44, 7516, 302]],
 0    Joe biden rules out 2020 bid: 'guys, i'm not r...
 1    Watch: darvish gave hitter whiplash with slow ...
 2    What do you call a turtle without its shell? d...
 3        5 reasons the 2016 election feels so personal
 4    Pasco police shot mexican migrant from behind,...
 Name: text, dtype: object)

Sequencing the whole dataset and padding to uniform shape

In [5]:
# Encode every text as a list of integer token ids.
textData = tokenizer.texts_to_sequences(texts=rawDataset['text'])

# Left-pad with zeros so all rows share one length. No maxlen is given,
# so the width is taken from the longest sequence in the data.
paddedTextData = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=textData,
    padding='pre',
)

(paddedTextData.shape, paddedTextData)

((200000, 25),
 array([[    0,     0,     0, ...,    61,    47, 30457],
        [    0,     0,     0, ...,    17,  1186,  3690],
        [    0,     0,     0, ...,   148,  4215,   211],
        ...,
        [    0,     0,     0, ...,    50,   114,    93],
        [    0,     0,     0, ...,    14,  4584,   217],
        [    0,     0,     0, ...,    28,   228,  1204]]))

Splitting into training and testing datasets

In [6]:
# Number of leading rows used for training; everything after is held out.
trainLength = 150000

# Shared row selectors so features and labels are cut at the same boundary.
trainRows = slice(None, trainLength)
testRows = slice(trainLength, None)

# Features come from the padded token matrix, labels from the 'humor' column.
xTrain, yTrain = paddedTextData[trainRows], rawDataset['humor'][trainRows]
xTest, yTest = paddedTextData[testRows], rawDataset['humor'][testRows]

Defining the model

In [7]:
# Binary humor classifier: embedding -> average over the sequence -> small
# dense head with a sigmoid output.

# Sequence width expected by the model, read from the padded matrix instead of
# a hard-coded 25 so it follows the data if the longest text changes.
sequenceLength = paddedTextData.shape[1]

# Token ids run from 0 (the padding value) up to vocabSize inclusive, because
# the tokenizer's word_index is 1-based. The embedding table therefore needs
# vocabSize + 1 rows; with only vocabSize rows the highest id is out of range.
embeddingInputDim = vocabSize + 1

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(embeddingInputDim, 64, input_length=sequenceLength))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(32, activation='relu'))
# Single sigmoid unit paired with binary cross-entropy for the True/False label.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Training the model

In [8]:
# Train on the training split for five passes over the data; the returned
# History object is left as the cell's displayed value.
model.fit(
    x=xTrain,
    y=yTrain,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2a386a4b5b0>

Evaluating on testing dataset

In [9]:
# Score the held-out split; returns the loss followed by the compiled
# metrics (accuracy here).
model.evaluate(x=xTest, y=yTest)



[0.16175518929958344, 0.9422600269317627]

Real-life testing on new sentences

In [40]:
# Two hand-written examples: a plain factual statement and a joke.
texts = ["Narendra Modi is the prime minister of India", "Did you hear about the mathematician who’s afraid of negative numbers? He’ll stop at nothing to avoid them."]

# Encode with the same fitted tokenizer that produced the training data.
sequencedText = tokenizer.texts_to_sequences(texts)
print(sequencedText)

# Pad (or truncate) to exactly the width of the training matrix. Reading it
# from paddedTextData instead of hard-coding 25 keeps inference input in step
# with the training data if the dataset's longest sequence changes.
paddedText = tf.keras.preprocessing.sequence.pad_sequences(
    sequencedText,
    padding='pre',
    maxlen=paddedTextData.shape[1],
)
print(paddedText)

# Sigmoid output per text: values near 1 mean "humor", near 0 mean "not humor".
model.predict(paddedText)

[[31568, 15921, 11, 2, 2037, 2479, 7, 2243], [22, 5, 107, 27, 2, 2692, 22576, 678, 7, 2870, 1455, 21576, 171, 25, 312, 4, 866, 112]]
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0 31568 15921    11     2  2037  2479     7
   2243]
 [    0     0     0     0     0     0     0    22     5   107    27     2
   2692 22576   678     7  2870  1455 21576   171    25   312     4   866
    112]]


array([[6.7348604e-04],
       [9.9998260e-01]], dtype=float32)