In [2]:
import tensorflow as tf
import argparse
import pandas as pd
import numpy as np

In [3]:
import os 
import random 

In [4]:
# Notebook configuration expressed as command-line style options.
# parse_known_args() is used so that unrecognised entries in sys.argv are
# returned separately instead of aborting the parse.
parser = argparse.ArgumentParser()
parser.add_argument("--dataroot", default="../trumptweets-data/",
                    help="Directory that is scanned for the tweet JSON files")
parser.add_argument("--cuda", action="store_true", help="Enable CUDA/GPU")
parser.add_argument("--batchSize", type=int, default=64)
parser.add_argument("--cache", default="../trumptweets-data/",
                    help="Directory where training checkpoints are written")
# type=int: without it a seed supplied on the command line is kept as a
# string, which is not a usable seed value.
parser.add_argument("--randomSeed", type=int, default=None,
                    help="Integer seed; a random one is chosen when omitted")
options, _ = parser.parse_known_args()

In [5]:
# Create the cache directory when it is missing. exist_ok=True tolerates a
# directory that is already there while still surfacing genuine failures
# (for example a permission error), which a blanket `except OSError: pass`
# would silently hide.
os.makedirs(options.cache, exist_ok=True)

In [6]:
# Choose a seed when none was supplied.
if options.randomSeed is None:
    options.randomSeed = random.randint(1, 10000)

# A seed given on the command line may arrive as a string; normalise it to an
# int so it is a valid TensorFlow seed.
options.randomSeed = int(options.randomSeed)

tf.random.set_seed(options.randomSeed)


In [7]:
# Replace the parsed values of both path options with fixed locations.
# NOTE(review): the data path is absolute and environment-specific — confirm
# it matches the machine this notebook is run on.
options.dataroot, options.cache = "/floyd/input/trumptweets_data/", "../"


In [8]:
# Load every JSON file found in the data directory into one flat list.
import json

dataset_root = options.dataroot

# Match on the real ".json" extension (comparing only the last four characters
# with "json" also accepts names such as "notjson"), and sort because
# os.listdir() returns entries in arbitrary order.
files = sorted(
    name for name in os.listdir(dataset_root) if name.endswith(".json")
)

objs = []
for name in files:
    # os.path.join works whether or not dataset_root ends with a separator.
    with open(os.path.join(dataset_root, name), "r", encoding="utf-8") as handle:
        # Presumably each file holds a JSON array of tweet objects — TODO confirm.
        objs.extend(json.load(handle))


In [9]:
# Tabulate the loaded JSON records.
df = pd.DataFrame(data=objs)

In [284]:
# Strip non-ASCII characters from every tweet.
# str.encode() returns bytes, so decode straight back to str: leaving the
# column as bytes makes later per-character iteration yield integers instead
# of characters, which breaks lookups with str keys and ''.join() on the
# generated output.
df.loc[:, "text"] = df.text.apply(
    lambda t: t.encode("ascii", "ignore").decode("ascii")
)

In [285]:
# Build the symbol vocabulary and the lookup tables in both directions.
chars = set()

def add_to_set(text):
    """Add every symbol of `text` to the module-level `chars` set."""
    chars.update(text)

# A plain loop: the call is made only for its side effect on `chars`.
for tweet_text in df.text:
    add_to_set(tweet_text)

# Enumerate in sorted order. Iteration order of a set of strings can differ
# from one interpreter session to the next, which would silently change the
# id <-> symbol mapping that previously saved checkpoints were trained with.
char2idx = {u: i for i, u in enumerate(sorted(chars))}
# None is the padding symbol; it receives the last id.
char2idx[None] = len(char2idx)
# Dicts keep insertion order, so idx2char[i] is the symbol whose id is i.
idx2char = np.array(list(char2idx.keys()))

In [286]:
# Length of the longest tweet; shorter ones are padded up to this length.
sequence_length = int(df.text.map(len).max())

In [287]:
def convert_to_ints(text):
    """Encode `text` as an array of ids, right-padded to `sequence_length`.

    Each symbol is looked up in `char2idx`; the id stored under the `None`
    key fills the remaining positions.
    """
    encoded = [char2idx[symbol] for symbol in text]
    missing = sequence_length - len(text)
    # A non-positive `missing` multiplies out to an empty list (no padding).
    encoded.extend([char2idx[None]] * missing)
    return np.array(encoded)

# One id array per tweet.
numerics = df.text.apply(convert_to_ints)

In [288]:
# Turn the per-tweet id arrays into a dataset with one element per tweet.
ds = tf.data.Dataset.from_tensor_slices(numerics.tolist())

In [289]:
def split_inputs(chunk):
    """Split one sequence into an (input, target) pair shifted by one step.

    The input drops the last element and the target drops the first, so the
    target at each position is the element that follows the input there.
    """
    inputs = chunk[:-1]
    targets = chunk[1:]
    return inputs, targets

# Apply the split to every sequence in the dataset.
dataset = ds.map(split_inputs)

In [290]:
# Training batch size, and the size of the shuffle buffer used by get_batch().
BATCH_SIZE, BUFFER_SIZE = 512, 10_000

In [291]:
def get_batch():
    """Return `dataset` shuffled and grouped into full batches.

    Shuffling uses a buffer of BUFFER_SIZE elements; batches contain
    BATCH_SIZE elements and a trailing incomplete batch is dropped.
    """
    shuffled = dataset.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
    return batched

In [292]:
# Shuffled, batched dataset that is later passed to model.fit().
batch = get_batch()

In [293]:
# Build the model.

In [294]:
# Number of distinct symbols, including the padding entry.
vocab_size = len(idx2char)

In [296]:
# The embedding width is set equal to the vocabulary size.
embedding_dimensions = vocab_size  
# Number of units in the recurrent (GRU) layer.
rnn_units = 512

In [297]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the sequence model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embedding_dim: Width of each embedding vector.
        rnn_units: Number of GRU units.
        batch_size: Fixed batch dimension of the input; the sequence
            dimension is left unspecified.

    Returns:
        An uncompiled tf.keras.Sequential model whose final Dense layer has
        no activation argument and `vocab_size` outputs per time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [298]:
# Training model, built with the training batch size.
model = build_model(vocab_size, embedding_dimensions, rnn_units, BATCH_SIZE)

In [299]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
    return cross_entropy

In [300]:
# Configure training: Adam optimizer with the logits-based loss defined above.
model.compile(optimizer='adam', loss=loss)

In [301]:
# Checkpoint path template inside the cache directory; "{epoch}" is left as a
# placeholder in the string handed to the callback.
# To resume from an earlier run, uncomment:
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir=options.cache))
checkpoint_prefix = os.path.join(options.cache, "ckpt_{epoch}")

# Save weights only, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix,
    save_weights_only=True,
)

In [None]:
# Train for a fixed number of epochs with the checkpoint callback attached.
EPOCHS = 15
history = model.fit(batch, epochs = EPOCHS, callbacks = [checkpoint_callback])

Epoch 1/15


Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x13bf57320>
Traceback (most recent call last):
  File "/Users/rajshrimali/anaconda3/envs/ai/lib/python3.7/site-packages/tensorflow_core/python/data/ops/iterator_ops.py", line 537, in __del__
    handle=self._handle, deleter=self._deleter)
  File "/Users/rajshrimali/anaconda3/envs/ai/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_dataset_ops.py", line 1141, in delete_iterator
    deleter)
KeyboardInterrupt: 
Exception ignored in: <function ScopedTFGraph.__del__ at 0x13af5f320>
Traceback (most recent call last):
  File "/Users/rajshrimali/anaconda3/envs/ai/lib/python3.7/site-packages/tensorflow_core/python/framework/c_api_util.py", line 52, in __del__
    c_api.TF_DeleteGraph(self.graph)
KeyboardInterrupt: 
Exception ignored in: <function ScopedTFGraph.__del__ at 0x13af5f320>
Traceback (most recent call last):
  File "/Users/rajshrimali/anaconda3/envs/ai/lib/python3.7/site-packages/tensorflow_core/python/fram

























































































































































In [20]:
# Rebuild the network with a batch size of one and restore the most recent
# checkpoint found in the cache directory.
model = build_model(vocab_size, embedding_dimensions, rnn_units, 1)
latest_weights = tf.train.latest_checkpoint(options.cache)
model.load_weights(latest_weights)
model.build(tf.TensorShape([1, None]))

In [21]:
# Print the layer-by-layer summary of the current model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (1, None, 256)            83456     
_________________________________________________________________
gru (GRU)                    (1, None, 256)            394752    
_________________________________________________________________
dense (Dense)                (1, None, 326)            83782     
Total params: 561,990
Trainable params: 561,990
Non-trainable params: 0
_________________________________________________________________


In [22]:
def generate_text(model, start_string, num_generate=None, temperature=0.4):
    """Sample text from the model one symbol at a time.

    Args:
        model: Callable model; it is invoked on a tensor of ids with a
            leading batch dimension of 1 and must provide reset_states().
        start_string: Non-empty prompt. Every symbol in it must be a key of
            char2idx.
        num_generate: Number of sampling steps. Defaults to sequence_length.
        temperature: Positive divisor applied to the logits before sampling.
            Lower values sharpen the distribution, higher values flatten it.

    Returns:
        start_string followed by the sampled symbols. Padding symbols (the
        None entry of idx2char) contribute nothing to the result.

    Raises:
        ValueError: If start_string is empty or temperature is not positive.
        KeyError: If start_string contains a symbol missing from char2idx.
    """
    # An empty prompt would produce an empty input tensor with nothing to
    # sample from, so reject it up front with a clear message.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if num_generate is None:
        num_generate = sequence_length

    # Encode the prompt and add the batch dimension.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension, leaving one row of logits per time step.
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        # Sample for every time step but keep only the last step's draw.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled id becomes the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
        # idx2char is indexed directly by id; subtracting one would decode
        # every id as its neighbour and id 0 as the padding entry.
        char = idx2char[predicted_id]
        if char is not None:
            text_generated.append(char)

    return start_string + ''.join(text_generated)


In [None]:

# NOTE(review): with an empty start_string, generate_text builds its first
# model input from an empty list of ids — confirm the model can handle that,
# or pass a non-empty prompt.
generate_text(model, start_string='')

In [266]:
# Count how often each id occurs across all encoded tweets.
from collections import Counter
counter = Counter(token for row in numerics for token in row)
    