# Introduction

This notebook does the initial cleaning of the jokes and generates the joke topics. It will later be turned into a script.

In [4]:
import os
import sys
import pickle
from importlib import reload
import numpy as np
import pandas as pd
from tqdm import tqdm

In [5]:
import requests
from sklearn.externals import joblib
import string
import h5py

In [8]:
# Move to the project root (the parent of the directory the notebook starts in).
# The root is resolved only once: recomputing it from the current working
# directory on a re-run of this cell would climb one more level each time.
if "PROJECT_DIR" not in globals():
    PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(PROJECT_DIR)

In [9]:
# Put the "src" folder at the front of the import search path. The entry is
# relative, so it is resolved against the current working directory.
# Guarded so that re-running the cell does not pile up duplicate entries.
if "src" not in sys.path:
    sys.path.insert(0, "src")

In [10]:
# Project-local helper module; presumably found through the "src" entry on
# sys.path -- TODO confirm.
import helpers
# Re-execute the module so that edits made to it are picked up without
# restarting the kernel.
helpers = reload(helpers)

Using TensorFlow backend.


## Global Variables

In [11]:
# Output artifacts written by this notebook (relative paths).
# HDF5 file that receives the per-joke character-id sequences.
CHARS_SEQS_OUT = "data/joke_char_sequences_Jan20.h5"
# joblib dump of the topics returned by the topic modeler.
TOPICS_OUT = "data/joke_topics.pkl"
# joblib dump of the topic modeler object itself.
MODELER_OUT = "data/jokes_topic_modeler.pkl"
# Pickled character -> integer id dictionary.
DICT_OUT = "data/char_dict_Jan20.pkl"

In [12]:
# Download location of the jokes CSV.
# NOTE(review): the URL embeds an authkey -- confirm this share link is meant
# to be public before committing it.
JOKES_URL = "https://onedrive.live.com/download?cid=ABD51044F5341265&resid=ABD51044F5341265%21112436&authkey=AFUOwOJbFyY6ZFM"
# Local path of the jokes CSV; the file is downloaded from JOKES_URL when missing.
JOKE_PATH = "data/raw/reddit_jokes.csv"

# Read Jokes

In [14]:
# Check whether the jokes file is available locally and download it if not.
if not os.path.exists(JOKE_PATH):
    print("Jokes file not found, so downloading it.")
    # A fresh checkout may not contain the target folder yet.
    target_dir = os.path.dirname(JOKE_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(JOKES_URL, allow_redirects=True, timeout=60)
    # Fail loudly on an HTTP error instead of saving the error page as the CSV.
    response.raise_for_status()
    with open(JOKE_PATH, 'wb') as f:
        f.write(response.content)
    print("Jokes downloaded.")

Jokes file not found, so downloading it.
Jokes downloaded


In [10]:
# Load the raw jokes from JOKE_PATH, the same file the download step writes,
# rather than repeating the path as a literal.
jokes = pd.read_csv(JOKE_PATH)
# Keep only the funny ones: jokes with at least 10 upvotes ("ups").
jokes = jokes[jokes["ups"] >= 10]

In [11]:
# Drop jokes whose body has fewer than 4 characters.
jokes = jokes.loc[jokes["text"].str.len() >= 4]
# Glue each title to its body with a blank line in between and keep the
# result as a plain Python list of strings.
jokes = jokes["title"].str.cat(jokes["text"], sep="\n\n").tolist()

In [12]:
# Number of jokes left after filtering.
len(jokes)

109095

In [13]:
# Convert the list of joke strings into a NumPy array.
jokes = np.array(jokes)

In [14]:
# Check the array's shape (the recorded run shows a 1-D array, one entry per joke).
jokes.shape

(109095,)

Next time, keep the `ups` column so it can be used as weights.

# Create Topics

In [15]:
%%time
# Fit the project's topic modeler with 32 topics on the joke texts and keep
# the transformed output.
# NOTE(review): TopicModeler is defined in helpers (not shown here), so its
# exact behaviour should be confirmed there. This is the slow step of the
# notebook: the recorded run took about 42 minutes of wall time.
modeler = helpers.TopicModeler(n_topics=32)
topics = modeler.fit_transform(jokes)

Wall time: 42min 14s


In [16]:
# Check the shape of the topic output (the recorded run shows one row per
# joke and 32 columns).
topics.shape

(109095, 32)

In [17]:
# Persist the topics to TOPICS_OUT with joblib.
joblib.dump(topics, TOPICS_OUT)

['data/joke_topics.pkl']

In [18]:
#and topic modeller (Encoder?)

In [29]:
# Persist the modeler object to MODELER_OUT with joblib.
joblib.dump(modeler, MODELER_OUT)

['data/jokes_topic_modeler.pkl']

# Create Char Seqs

In [20]:
# Build the character -> integer id lookup up front.
# Id 0 is reserved for the "<BOUND>" marker. Every character of
# string.printable except form feed, vertical tab and carriage return is
# mapped to its 1-based position in string.printable.
char_dict = {"<BOUND>": 0}
char_dict.update(
    (symbol, position)
    for position, symbol in enumerate(string.printable, start=1)
    if symbol not in ('\x0c', '\x0b', "\r")
)

In [21]:
# Number of entries in the lookup, including the "<BOUND>" marker.
len(char_dict)

98

In [24]:
# Save the character -> id dictionary to DICT_OUT.
# The context manager closes the file even if pickling fails part-way.
with open(DICT_OUT, "wb") as pickle_out:
    pickle.dump(char_dict, pickle_out)

In [27]:
# Encode every joke as a variable-length int32 sequence of character ids and
# store the sequences in the 'seqs' dataset of the HDF5 file at CHARS_SEQS_OUT.
# The file is left open on success; it is closed by a later cell.
f = h5py.File(CHARS_SEQS_OUT, 'w')
dt = h5py.special_dtype(vlen=np.dtype('int32'))
dset = f.create_dataset('seqs', (len(jokes),), dtype=dt)
try:
    for i in tqdm(range(len(jokes))):
        text = jokes[i]
        # Every sequence starts with the "<BOUND>" marker id.
        seq = [char_dict["<BOUND>"]]
        # Characters without an entry in char_dict are dropped. A single dict
        # membership test replaces the scan of string.printable followed by a
        # swallowed KeyError; both keep exactly the same characters.
        seq.extend(char_dict[char] for char in text if char in char_dict)
        dset[i] = np.array(seq, dtype="int32")
except BaseException:
    # Do not leave the file open after a failure or an interrupt: a handle
    # that is still open would block re-creating the file on the next run.
    f.close()
    raise

100%|████████████████████████████████████████████████████████████████████████| 109095/109095 [01:16<00:00, 1420.19it/s]


In [28]:
# Close the HDF5 file that holds the character sequences.
f.close()