In [79]:
import re
import numpy as np
import pandas as pd
from string import punctuation
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/glluch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [80]:
allowed_punc = ['.', ',', '!', '@']
def remove_hash(text):
    return re.sub(r'#([^\s]+)', r'\1', text)

def remove_URL(text):
    return re.sub('((www\.[^r\s]+)|(https?://[^\rs]+))', 'URL', text)

def remove_punctuation(text):
    nopunc = [char for char in text if char in allowed_punc or char not in punctuation]
    return ''.join(nopunc)

def join_mention(text):
    return text.replace('@ ', '@')

def tokenize(text):
    return (word_tokenize(text))

def process(content): 
    text = content
    text = remove_hash(text)
    text = remove_URL(text)
    text = remove_punctuation(text)
    text = join_mention(text)
    
    return text

In [81]:
filepath = 'data/trumptweets.csv'

data = pd.read_csv(filepath)
# data = data.sample(frac=0.3)
# print(data.head())
for i in range(5): 
    print(process(data.iloc[i]['content']))

Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!
Donald Trump will be appearing on The View tomorrow morning to discuss Celebrity Apprentice and his new book Think Like A Champion!
Donald Trump reads Top Ten Financial Tips on Late Show with David Letterman URL
New Blog Post Celebrity Apprentice Finale and Lessons Learned Along the Way URL
My persona will never be that of a wallflower  I’d rather build walls than cling to them Donald J. Trump


In [82]:
data['content_clean'] = data.apply(lambda row: process(row['content']), axis=1)
data.to_csv('data/trumptweets_clean.csv')

In [83]:
from keras.preprocessing.text import Tokenizer

num_words = 10000 
tokenizer = Tokenizer(num_words=num_words, filters='')

tokenizer.fit_on_texts(data['content_clean'])
encoded = tokenizer.texts_to_sequences(data['content_clean'])
flat_encoded = [enc for encoder in encoded for enc in encoder]
total_words = len(tokenizer.word_index) 
dataset_size = tokenizer.document_count


In [84]:
import tensorflow as tf
train_size = dataset_size * 90 // 100

dataset = tf.data.Dataset.from_tensor_slices(flat_encoded[:train_size])

val_dataset = tf.data.Dataset.from_tensor_slices(flat_encoded[train_size:])

In [85]:

batch_size = 256

n_steps = 30

window_length = n_steps + 1

dataset = dataset.window(window_length, shift=1, drop_remainder=True)

dataset = dataset.flat_map(lambda window: window.batch(window_length))

dataset = dataset.shuffle(10000).batch(batch_size)

dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

# dataset = dataset.map(lambda X_batch, y_batch: (tf.one_hot(X_batch, depth=max_id), y_batch))

dataset = dataset.prefetch(1)

In [86]:

val_dataset = val_dataset.window(window_length, shift=1, drop_remainder=True)

val_dataset = val_dataset.flat_map(lambda window: window.batch(window_length))

val_dataset = val_dataset.shuffle(10000).batch(batch_size)

val_dataset = val_dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

# val_dataset = val_dataset.map(lambda X_batch, y_batch: (tf.one_hot(X_batch, depth=max_id), y_batch))

val_dataset = val_dataset.prefetch(1)

In [87]:
import os
import tensorflow as tf
from keras.callbacks import EarlyStopping

early_stopper = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
# Checkpoing Model Weights
checkpoint_path = f'checkpoints/cp-?.ckpt'

checkpoint_dir = os.path.dirname(checkpoint_path)

checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1)


In [88]:
latest = tf.train.latest_checkpoint(checkpoint_dir)

latest

'checkpoints/cp-?.ckpt'

In [89]:
steps_per_epoch = train_size // batch_size

steps_per_epoch

144

In [90]:

adam = tf.keras.optimizers.Adam()

model = tf.keras.models.Sequential([
                                    tf.keras.layers.Embedding(num_words, 512, input_shape=[None]),
                                    tf.keras.layers.GRU(256, return_sequences=True,
                                                        dropout=0.5, recurrent_dropout=0.5),
                                    tf.keras.layers.GRU(256, return_sequences=True,
                                                        dropout=0.5, recurrent_dropout=0.5),
                                    tf.keras.layers.Dropout(0.5),
                                    tf.keras.layers.Dense(1024, activation='sigmoid'),
                                    tf.keras.layers.Dense(num_words, activation='softmax')])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.load_weights(latest)

# model.fit(dataset, epochs=1, validation_data=val_dataset, callbacks=[early_stopper, checkpoint_cb])

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f952f805f90>

In [91]:
import pickle

pickle.dump(tokenizer, open('model_states/tok.pkl', 'wb'))
model.save('model_states/test.h5')

In [146]:

def text_clean(text):

    splits = text.split(' . ')[:-1]

    new_splits = []

    for split in splits[1:]:

        word_splits = split.split(' ')
    
        word_splits[0] = word_splits[0].capitalize()
    
        word_splits[-1] = ''.join([word_splits[-1], '.'])

        joined = ' '.join(word_splits)

        new_splits.append(joined)
    join_split = ' '.join(new_splits)

    return join_split

def preprocessor(text):

    X = tokenizer.texts_to_sequences(text)
#     return tf.one_hot(X, num_words) # for no embedding
    return tokenizer.texts_to_sequences(text) # for embedding

def next_word(text, model, temperature=0):
    X_new = preprocessor([text])

    y_proba = model.predict(X_new)[0, -1:, :]

    rescaled_logits = tf.math.log(y_proba) / temperature

    word_id = tf.random.categorical(rescaled_logits, num_samples=1)

    return tokenizer.sequences_to_texts(word_id.numpy())[0]

def complete_text(text, model, n_words=10,temperature=0.5):

    for _ in range(n_words):

        space = [' ', next_word(text, model, temperature)]
    
        text += ''.join(space)

    return text


In [184]:
complete_text('democrats', model, n_words=10, temperature=0.5)

'democrats should be to know what the chinese are planning to'