In [92]:
from __future__ import absolute_import, division, print_function

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import collections
import os
import string
import time

In [334]:
import keras
import sklearn
import tensorflow as tf

from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
wocka_df = pd.read_json('data/joke-dataset/wocka.json')
wocka_df = wocka_df.loc[wocka_df["body"] != ""]
ss_df = pd.read_json('data/joke-dataset/stupidstuff.json')
ss_df = ss_df.loc[ss_df["body"] != ""]

In [41]:
wocka_ol = wocka_df.loc[wocka_df["category"] == "One Liners"]

In [42]:
ss_ol = ss_df.loc[ss_df["category"] == "One Liners"]

In [39]:
ss_ol

Unnamed: 0,body,category,id,rating
407,IN GENERAL\n1. Never take a beer to a job inte...,One Liners,408,3.0
495,What does the dentist of the year get?...A lit...,One Liners,496,2.5
593,1.I'm busy. You're ugly. Have a nice day.\n2.W...,One Liners,594,5.0
1672,I hope you never get a tetanus shot; maybe you...,One Liners,1673,2.0
3522,Great to use in chat rooms or on cell phones!,One Liners,3523,2.0
3536,Great for those Email Signatures,One Liners,3537,2.0


In [46]:
wocka_ol

Unnamed: 0,body,category,id,title
4,"If the opposite of pro is con, isn't the oppos...",One Liners,6,Progress
10,There was a dyslexic insomniac agnostic.\r\n\r...,One Liners,12,Pondering the afterlife
15,Do infants enjoy infancy as much as adults enj...,One Liners,17,Infants vs Adults
16,"If a pig loses its voice, is it disgruntled?",One Liners,18,Pig
47,"Before you judge someone, you should walk a mi...",One Liners,54,Judgement
48,"When cryptography is outlawed, bayl bhgynjf jv...",One Liners,55,Bayl
62,Two guys walked into a bar... you would have t...,One Liners,69,Two Guys
66,The darkest hours come just before the dawn.\r...,One Liners,73,The Darkest Hours
67,"Remember, no-one is listening until you fart.",One Liners,74,Always Remember
68,If at first you don't succeed ... avoid skydiv...,One Liners,75,If at First...


In [140]:
wocka_ol_list = wocka_ol["body"].tolist()

# Wocka Dataset One Liners Data Preprocessing

### Removing punctation, lowercasing, removing numbers, etc.

In [142]:
for i in range(len(wocka_ol_list)):
    joke = wocka_ol_list[i]
    joke = joke.replace("\r", " ").replace("\n", " ").replace("/", " or ")
    joke = joke.translate(str.maketrans("", "", string.punctuation)) # Remove punctuation
    joke = joke.lower() # Lowercase
    joke = ''.join(char for char in joke if not char.isdigit()) # Remove numbers
    joke = " ".join(joke.split()) # Split and rejoin text as 
    wocka_ol_list[i] = joke

In [143]:
wocka_ol_list

['if the opposite of pro is con isnt the opposite of progress congress',
 'there was a dyslexic insomniac agnostic he laid awake all night wondering if there really was a dog',
 'do infants enjoy infancy as much as adults enjoy adultery',
 'if a pig loses its voice is it disgruntled',
 'before you judge someone you should walk a mile in their shoes that way when you judge them youre a mile away and you have their shoes',
 'when cryptography is outlawed bayl bhgynjf jvyy unir cevinpl',
 'two guys walked into a bar you would have thought the second one would have ducked',
 'the darkest hours come just before the dawn so if youre going to steal your neighbours milk and newspaper thats the time to do it',
 'remember noone is listening until you fart',
 'if at first you dont succeed avoid skydiving',
 'give a man a fish and he will eat for a day teach him how to fish and he will sit in a boat and drink beer all day',
 'never play leapfrog with a unicorn',
 'politics comes from the root poli

### Removing jokes with length less than 5 and greater than 30

In [318]:
wocka_ol_joke_lens = [len(wocka_ol_list[i].split()) for i in range(len(wocka_ol_list))]
print("{} one liner jokes from Wocka".format(len(wocka_ol_joke_lens)))
print("{} one liner joke between 5 and 30 words from Wocka".format(len([i for i in wocka_ol_joke_lens if i > 4 and i < 31])))

917 one liner jokes from Wocka
772 one liner joke between 5 and 30 words from Wocka


In [310]:
short_wocka_ol = [joke for joke in wocka_ol_list if len(joke.split()) < 31 and len(joke.split()) > 4]

In [159]:
short_wocka_ol

['if the opposite of pro is con isnt the opposite of progress congress',
 'there was a dyslexic insomniac agnostic he laid awake all night wondering if there really was a dog',
 'do infants enjoy infancy as much as adults enjoy adultery',
 'if a pig loses its voice is it disgruntled',
 'before you judge someone you should walk a mile in their shoes that way when you judge them youre a mile away and you have their shoes',
 'when cryptography is outlawed bayl bhgynjf jvyy unir cevinpl',
 'two guys walked into a bar you would have thought the second one would have ducked',
 'the darkest hours come just before the dawn so if youre going to steal your neighbours milk and newspaper thats the time to do it',
 'remember noone is listening until you fart',
 'if at first you dont succeed avoid skydiving',
 'give a man a fish and he will eat for a day teach him how to fish and he will sit in a boat and drink beer all day',
 'never play leapfrog with a unicorn',
 'politics comes from the root poli

### Find keywords for each joke using TF-IDF

In [275]:
vect = TfidfVectorizer(max_df=1)
tfidf = vect.fit_transform(short_wocka_ol)

In [316]:
word2idx = vect.vocabulary_.copy()
idx2word = {key: value for value, key in word2idx.items()}
vocab_size = len(word2idx)
print("Vocab size:", vocab_size)

Vocab size: 1807


In [295]:
keywords = [[idx2word[j], idx2word[i]] for i, j in [np.argpartition(joke.toarray()[0], -2)[-2:] for joke in tfidf]]

In [296]:
keywords

[['opposite', 'con'],
 ['agnostic', 'dyslexic'],
 ['infants', 'infancy'],
 ['disgruntled', 'voice'],
 ['zoology', 'floor'],
 ['bayl', 'cryptography'],
 ['ducked', 'fluently'],
 ['darkest', 'floor'],
 ['noone', 'floor'],
 ['avoid', 'fluently'],
 ['sit', 'teach'],
 ['leapfrog', 'unicorn'],
 ['means', 'poli'],
 ['atms', 'drivethru'],
 ['closed', 'mascara'],
 ['depression', 'enthusiasm'],
 ['early', 'worm'],
 ['soar', 'sucked'],
 ['cholesterol', 'floor'],
 ['relatives', 'zoology'],
 ['freeway', 'fluently'],
 ['zoology', 'floor'],
 ['explain', 'fluently'],
 ['zoology', 'floor'],
 ['zoology', 'floor'],
 ['sword', 'fluently'],
 ['zoology', 'floor'],
 ['complaining', 'heror'],
 ['hates', 'competition'],
 ['abbreviation', 'floor'],
 ['started', 'grandmother'],
 ['crosscountry', 'floor'],
 ['exercise', 'floor'],
 ['lawyers', 'creed'],
 ['slim', 'zoology'],
 ['income', 'zoology'],
 ['breeds', 'floor'],
 ['million', 'dollar'],
 ['neutral', 'robert'],
 ['maternityward', 'zoology'],
 ['youd', 'rathe

In [307]:
flat = [item for sublist in keywords for item in sublist]
print("{} number of keywords".format(len(flat)))
print("{} number of unique keywords".format(len(set(flat))))

1544 number of keywords
1202 number of unique keywords


### Build Model

In [355]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(short_wocka_ol)
encoded = tokenizer.texts_to_sequences(short_wocka_ol)
#encoded_keywords = tokenizer.texts_to_sequences(keywords)

sequences = list()
for joke_encoding in encoded:
    sequence = []
    for i in range(1, len(joke_encoding)):
        pair = joke_encoding[i-1:i+1]
        sequence.append(pair)
    sequences.append(sequence)

In [356]:
sequences

[[[16, 2],
  [2, 607],
  [607, 8],
  [8, 1029],
  [1029, 6],
  [6, 1030],
  [1030, 216],
  [216, 2],
  [2, 607],
  [607, 8],
  [8, 608],
  [608, 1031]],
 [[42, 33],
  [33, 1],
  [1, 1032],
  [1032, 609],
  [609, 1033],
  [1033, 31],
  [31, 414],
  [414, 1034],
  [1034, 39],
  [39, 307],
  [307, 610],
  [610, 16],
  [16, 42],
  [42, 115],
  [115, 33],
  [33, 1],
  [1, 159]],
 [[11, 1035],
  [1035, 415],
  [415, 1036],
  [1036, 65],
  [65, 116],
  [116, 65],
  [65, 611],
  [611, 415],
  [415, 1037]],
 [[16, 1],
  [1, 612],
  [612, 416],
  [416, 48],
  [48, 1038],
  [1038, 6],
  [6, 10],
  [10, 1039]],
 [[100, 3],
  [3, 308],
  [308, 79],
  [79, 3],
  [3, 117],
  [117, 144],
  [144, 1],
  [1, 309],
  [309, 9],
  [9, 66],
  [66, 247],
  [247, 14],
  [14, 105],
  [105, 17],
  [17, 3],
  [3, 308],
  [308, 101],
  [101, 63],
  [63, 1],
  [1, 309],
  [309, 248],
  [248, 5],
  [5, 3],
  [3, 18],
  [18, 66],
  [66, 247]],
 [[17, 1040],
  [1040, 6],
  [6, 1041],
  [1041, 1042],
  [1042, 1043],
  

In [None]:
maxlen = 30
sentences = []
next_chars = []
for i in wocka_ol_list:
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
x = np.zeros((len(wocka_ol_list), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(wocka_ol_list), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, word2idx[char]] = 1
    y[i, word2idx[next_chars[i]]] = 1

In [331]:
embed_dim = 128
lstm_out = 200
batch_size = 32

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         231296    
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               263200    
_________________________________________________________________
dense_2 (Dense)              (None, 1807)              363207    
Total params: 857,703
Trainable params: 857,703
Non-trainable params: 0
_________________________________________________________________
None


https://datascience.stackexchange.com/questions/26366/training-an-rnn-with-examples-of-different-lengths-in-keras