In [0]:
from __future__ import absolute_import, division, print_function

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import collections
import os
import string
import time

In [0]:
import keras
import sklearn
import tensorflow as tf

from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [0]:
wocka_df = pd.read_json('data/joke-dataset/wocka.json')
wocka_df = wocka_df.loc[wocka_df["body"] != ""]
ss_df = pd.read_json('data/joke-dataset/stupidstuff.json')
ss_df = ss_df.loc[ss_df["body"] != ""]

In [0]:
wocka_ol = wocka_df.loc[wocka_df["category"] == "One Liners"]

In [0]:
ss_ol = ss_df.loc[ss_df["category"] == "One Liners"]

In [0]:
ss_ol

Unnamed: 0,body,category,id,rating
407,IN GENERAL\n1. Never take a beer to a job inte...,One Liners,408,3.0
495,What does the dentist of the year get?...A lit...,One Liners,496,2.5
593,1.I'm busy. You're ugly. Have a nice day.\n2.W...,One Liners,594,5.0
1672,I hope you never get a tetanus shot; maybe you...,One Liners,1673,2.0
3522,Great to use in chat rooms or on cell phones!,One Liners,3523,2.0
3536,Great for those Email Signatures,One Liners,3537,2.0


In [0]:
wocka_ol

Unnamed: 0,body,category,id,title
4,"If the opposite of pro is con, isn't the oppos...",One Liners,6,Progress
10,There was a dyslexic insomniac agnostic.\r\n\r...,One Liners,12,Pondering the afterlife
15,Do infants enjoy infancy as much as adults enj...,One Liners,17,Infants vs Adults
16,"If a pig loses its voice, is it disgruntled?",One Liners,18,Pig
47,"Before you judge someone, you should walk a mi...",One Liners,54,Judgement
48,"When cryptography is outlawed, bayl bhgynjf jv...",One Liners,55,Bayl
62,Two guys walked into a bar... you would have t...,One Liners,69,Two Guys
66,The darkest hours come just before the dawn.\r...,One Liners,73,The Darkest Hours
67,"Remember, no-one is listening until you fart.",One Liners,74,Always Remember
68,If at first you don't succeed ... avoid skydiv...,One Liners,75,If at First...


In [0]:
wocka_ol_list = wocka_ol["body"].tolist()

# Wocka Dataset One Liners Data Preprocessing

### Removing punctation, lowercasing, removing numbers, etc.

In [0]:
for i in range(len(wocka_ol_list)):
    joke = wocka_ol_list[i]
    joke = joke.replace("\r", " ").replace("\n", " ").replace("/", " or ")
    joke = joke.translate(str.maketrans("", "", string.punctuation)) # Remove punctuation
    joke = joke.lower() # Lowercase
    joke = ''.join(char for char in joke if not char.isdigit()) # Remove numbers
    joke = joke.strip() # Remove leading and ending whitespace
    wocka_ol_list[i] = joke

In [0]:
wocka_ol_list

### Removing jokes with length less than 5 and greater than 30

In [0]:
wocka_ol_joke_lens = [len(wocka_ol_list[i].split()) for i in range(len(wocka_ol_list))]
print("{} one liner jokes from Wocka".format(len(wocka_ol_joke_lens)))
print("{} one liner joke between 5 and 30 words from Wocka".format(len([i for i in wocka_ol_joke_lens if i > 4 and i < 31])))

917 one liner jokes from Wocka
772 one liner joke between 5 and 30 words from Wocka


In [0]:
short_wocka_ol = [joke for joke in wocka_ol_list if len(joke.split()) < 31 and len(joke.split()) > 4]

In [0]:
short_wocka_ol

['if the opposite of pro is con isnt the opposite of progress congress',
 'there was a dyslexic insomniac agnostic    he laid awake all night wondering if there really was a dog',
 'do infants enjoy infancy as much as adults enjoy adultery',
 'if a pig loses its voice is it disgruntled',
 'before you judge someone you should walk a mile in their shoes   that way when you judge them youre a mile away and you have their shoes',
 'when cryptography is outlawed bayl bhgynjf jvyy unir cevinpl',
 'two guys walked into a bar you would have thought the second one would have ducked',
 'the darkest hours come just before the dawn    so if youre going to steal your neighbours milk and newspaper thats the time to do it',
 'remember noone is listening until you fart',
 'if at first you dont succeed  avoid skydiving',
 'give a man a fish and he will eat for a day    teach him how to fish and he will sit in a boat and drink beer all day',
 'never play leapfrog with a unicorn',
 'politics comes from t

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(short_wocka_ol)
word2idx = tokenizer.word_index.copy()
idx2word = {key: value for value, key in word2idx.items()}

encoded = tokenizer.texts_to_sequences(short_wocka_ol)
#encoded_keywords = tokenizer.texts_to_sequences(keywords)

vocab_size = len(word2idx)
print("Vocab size:", vocab_size)

Vocab size: 2769


### Find keywords for each joke using TF-IDF

In [0]:
vectorizer = TfidfVectorizer(max_df=1)
tfidf = vectorizer.fit_transform(short_wocka_ol)
tfidf_word2idx = vectorizer.vocabulary_.copy()
tfidf_idx2word = {key: value for value, key in tfidf_word2idx.items()}

In [0]:
keywords = [[tfidf_idx2word[j], tfidf_idx2word[i]] for i, j in [np.argpartition(joke.toarray()[0], -2)[-2:] for joke in tfidf]]

In [0]:
keywords

[['opposite', 'con'],
 ['agnostic', 'dyslexic'],
 ['infants', 'infancy'],
 ['disgruntled', 'voice'],
 ['zoology', 'floor'],
 ['bayl', 'cryptography'],
 ['ducked', 'fluently'],
 ['darkest', 'floor'],
 ['noone', 'floor'],
 ['avoid', 'fluently'],
 ['sit', 'teach'],
 ['leapfrog', 'unicorn'],
 ['means', 'poli'],
 ['atms', 'drivethru'],
 ['closed', 'mascara'],
 ['depression', 'enthusiasm'],
 ['early', 'worm'],
 ['soar', 'sucked'],
 ['cholesterol', 'floor'],
 ['relatives', 'zoology'],
 ['freeway', 'fluently'],
 ['zoology', 'floor'],
 ['explain', 'fluently'],
 ['zoology', 'floor'],
 ['zoology', 'floor'],
 ['sword', 'fluently'],
 ['zoology', 'floor'],
 ['complaining', 'heror'],
 ['hates', 'competition'],
 ['abbreviation', 'floor'],
 ['started', 'grandmother'],
 ['crosscountry', 'floor'],
 ['exercise', 'floor'],
 ['lawyers', 'creed'],
 ['slim', 'zoology'],
 ['income', 'zoology'],
 ['breeds', 'floor'],
 ['million', 'dollar'],
 ['neutral', 'robert'],
 ['maternityward', 'zoology'],
 ['youd', 'rathe

In [0]:
flat = [item for sublist in keywords for item in sublist]
print("{} number of keywords".format(len(flat)))
print("{} number of unique keywords".format(len(set(flat))))

1544 number of keywords
1202 number of unique keywords


### Build Model

In [0]:
sequences = list()
for joke_encoding in encoded:
    sequence = []
    for i in range(1, len(joke_encoding)):
        pair = joke_encoding[i-1:i+1]
        sequence.append(pair)
    sequences.append(sequence)
    
# Change this
sequences = [np.array(s) for s in sequences]
x, y = [s[:, 0] for s in sequences], [s[:, 1] for s in sequences]
# One-hot words
x = [tf.keras.utils.to_categorical(x[i], vocab_size + 1) for i in range(len(x))] # encoding is 1 indexed
y = [tf.keras.utils.to_categorical(y[i], vocab_size + 1) for i in range(len(y))]

In [0]:
embed_dim = 128
lstm_out = 200
batch_size = 32

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(y, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         354432    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               263200    
_________________________________________________________________
dense_1 (Dense)              (None, 2769)              556569    
Total params: 1,174,201
Trainable params: 1,174,201
Non-trainable params: 0
_________________________________________________________________
None


# Create dataset for BERT fine-tuning

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import numpy as np
import os, sys
import json
import matplotlib.pyplot as plt
import pandas as pd
import string

sys.path.insert(0, "/content/gdrive/My Drive/Project-Lion/")
root_folder = "/content/gdrive/My Drive/Project-Lion/"

In [0]:
def clean_data(lst, in_place=True, keep_punc=""):
    if in_place:
        new_lst = lst
    else:
        new_lst = lst.copy()
        
    for i in range(len(new_lst)):
        joke = lst[i]
        joke = joke.replace("\r", " ").replace("\n", " ").replace("/", " or ")
        punc_to_remove = string.punctuation
        for punc in keep_punc:
          punc_to_remove = punc_to_remove.replace(punc, "")
        joke = joke.translate(str.maketrans("", "", punc_to_remove)) # Remove punctuation
        joke = joke.lower() # Lowercase
        joke = joke.strip() # Remove leading and ending whitespace
        new_lst[i] = joke
    return new_lst

In [0]:
df = pd.read_csv(root_folder + 'data/kaggle/shortjokes.csv')
df = df.loc[df["Joke"] != ""]

In [0]:
data = clean_data(df["Joke"].tolist(), keep_punc="?,':.")
print("Length of data:", len(data))
print(data[0])

Length of data: 231657
me narrating a documentary about narrators i can't hear what they're saying cuz i'm talking


In [0]:
train_size = len(data) * 6 // 7
train_data = data[:train_size]
val_data = data[train_size:]

In [0]:
for d in data:
  if "traffic light" in d and "changing" in d:
    print(d)

what did the traffic light say to the car? don't look at me i'm changing.
what did the traffic light say to the other traffic light? don't look i'm changing
q: why did the traffic light turn red? a: because it saw the other one changing
what does a traffic light tells to another traffic light? dont look at me i'm changing
why did the traffic light turn red? because it saw the other one changing
what did the other traffic light say to the other traffic light? don't look i'm changing
what did the traffic light say to the car? don't look i'm changing
lame, but funny. what did the traffic light say to the car? . . don't look, i am changing hahahaha xd
what did the traffic light say to the car? don't look i'm changing
why do traffic lights never go swimming? because they spend too much time changing.
what did the traffic light say to the car? don't look, i'm changing.


In [0]:
for d in data:
  if "become a hoe" in d:
    print(d)

In [0]:
[print(train_data[i]) for i in range(10, 20)]
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
[print(val_data[i]) for i in range(10, 20)]

telling my daugthers date that she has lice and its very contagious the closer you get to her. correct way to parent.
what should you do before criticizing pacman? waka waka waka mile in his shoes
what's the difference between an illegal mexican and an autonomous robot...? nothing... they were both made to steal american jobs.
what do you call a barbarian you can't see? an invisigoth.
how do you spell canda? c,eh,n,eh,d,eh
you ever notice that the most dangerous thing about marijuana is getting caught with it?
what did arnold schwarzenegger say at the abortion clinic? hasta last vista, baby.
my wife is in a bad mood. i think her boyfriend forgot their anniversary. way to go, dude. now we all suffer...
my speech today will be like a miniskirt. long enough to cover the essentials but short enough to hold your attention
thanksgiving joke what does miley cyrus eat for thanksgiving? twerky just kidding... drugs. she eats drugs. adam zopf adamzopf
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
some p

[None, None, None, None, None, None, None, None, None, None]

In [0]:
min_qlen, max_qlen = 3, 999
min_alen, max_alen = 3, 999
qa = [(q, a) for q, a in qa_all if len(q.split()) >= min_qlen and len(q.split()) <= max_qlen and len(a.split()) >= min_alen and len(a.split()) <= max_alen]
print("Size of dataset:", len(qa))
print(qa[0])

Size of dataset: 71354
("why can't barbie get pregnant", 'because ken comes in a different box heyooooooo')


In [0]:
f = open(root_folder + "bert_training_data.txt", "w")
for q, a in qa_all:
  f.write(q + "\n")
  f.write(a)
  f.write("\n\n")
f.close()

In [0]:
71354 / 35

2038.6857142857143