In [31]:
import random, string

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import TextVectorization, Embedding

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import matplotlib.pyplot as plt

# try:
#     tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#     print('Device:', tpu.master())
#     tf.config.experimental_connect_to_cluster(tpu)
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     strategy = tf.distribute.experimental.TPUStrategy(tpu)
# except:
#     strategy = tf.distribute.get_strategy()
# print('Number of replicas:', strategy.num_replicas_in_sync)

print(tf.__version__)

2.9.1


In [32]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
# GCS_PATH = KaggleDatasets().get_gcs_path()
# BATCH_SIZE = 16 * strategy.num_replicas_in_sync

In [33]:
data = pd.read_csv("./data/dataset.csv")
data.head()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [34]:
# 0 -> False, 1 -> True (for humor)
le = LabelEncoder()
data["humor"] = le.fit_transform(data["humor"].astype(str))

In [35]:
X = data.text.values
y = data.humor.values

In [36]:
sent_lens = [len(sentence.split()) for sentence in X]
avg_sent_lens = np.mean(sent_lens)
avg_sent_lens

11.983325

In [37]:
output_seq_length = int(np.percentile(sent_lens, 95))
output_seq_length

17

In [38]:
max_tokens = 65000

text_vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=output_seq_length
)

text_vectorizer.adapt(X)

In [39]:
def split_chars(text):
    return " ".join(list(text))

In [40]:
humor_text_vocab = text_vectorizer.get_vocabulary()

token_embed = Embedding(
    input_dim=len(humor_text_vocab),
    output_dim=128,
    mask_zero=True,
    name="token_embedding"
)

In [41]:
tf_hub_embedding_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    trainable=False,
    name="universal_sentence_encoder"
)

In [42]:
%%time
X_chars = [split_chars(sentence) for sentence in X]

J o e   b i d e n   r u l e s   o u t   2 0 2 0   b i d :   ' g u y s ,   i ' m   n o t   r u n n i n g '
CPU times: user 467 ms, sys: 30.5 ms, total: 497 ms
Wall time: 505 ms


In [43]:
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet

'abcdefghijklmnopqrstuvwxyz0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
# Average character length per sentence
char_lens = [len(sentence) for sentence in X]
mean_char_len = np.mean(char_lens)
mean_char_len

67.470575

In [44]:
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

92

In [45]:
NUM_CHAR_TOKENS = len(alphabet) + 2 # alphabet + space + OOV token

char_vectorizer = TextVectorization(
    max_tokens=NUM_CHAR_TOKENS,
    output_sequence_length=output_seq_char_len,
    standardize="lower_and_strip_punctuation",
    name="char_vectorizer"
)

char_vectorizer.adapt(X_chars)

In [46]:
char_embed = Embedding(
    input_dim=NUM_CHAR_TOKENS,
    output_dim=25,
    mask_zero=False,
    name="char_embed"
)

In [50]:
# 1. Token inputs/model
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_input")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_output = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(token_inputs, token_output)

# 2. Char inputs/model
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(25))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)

# 3. Concat
token_char_concat = layers.Concatenate(name="token_char_hybrid")(
    [token_model.output, char_model.output]
)

# 4. Dropout layers
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(200, activation="relu")(combined_dropout)
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(1, activation="sigmoid")(final_dropout)

# 5. Create model
model_4 = tf.keras.Model(
    [token_model.input, char_model.input],
    output_layer,
    name="model_4_token_and_char_embeddings"
)

model_4.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        "accuracy",
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC()
    ]
)

model_4.summary()

Model: "model_4_token_and_char_embeddings"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 token_input (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 char_vectorizer (TextVectoriza  (None, 92)          0           ['char_input[0][0]']             
 tion)                                                                                            
                                                                                                  
 universal_sentence_encoder (Ke  (None, 512)         256797824   [

In [51]:
train_char_token_data = tf.data.Dataset.from_tensor_slices((X, X_chars))
train_char_token_labels = tf.data.Dataset.from_tensor_slices(y)
train_char_token_ds = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)).batch(32).prefetch(AUTOTUNE)

train_char_token_ds

<PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None)), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [52]:
model_4_history = model_4.fit(
    train_char_token_ds,
    steps_per_epoch=int(len(train_char_token_ds)*1),
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [55]:
model_4_history = model_4.fit(
    train_char_token_ds,
    steps_per_epoch=int(len(train_char_token_ds)*1),
    epochs=15
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [56]:
model_4.save("final_model")



INFO:tensorflow:Assets written to: final_model/assets


INFO:tensorflow:Assets written to: final_model/assets


In [57]:
import shutil
shutil.make_archive("final_model", 'zip', "final_model")

'/Users/roderikmogot/humor-prediction/final_model.zip'