In [1]:
import tensorflow as tf
import argparse
import pandas as pd
import numpy as np

In [2]:
import os 
import random 

In [3]:
# Command-line style configuration for the notebook.
# NOTE(review): --cuda and --batchSize are declared here but are not read by
# any cell visible in this notebook (BATCH_SIZE is hard-coded further down).
parser = argparse.ArgumentParser()
parser.add_argument("--dataroot", default="../trumptweets-data/")
parser.add_argument("--cuda", action="store_true", help="Enable CUDA/GPU")
parser.add_argument("--batchSize", type=int, default=64)
parser.add_argument("--cache", default="../trumptweets-data/")
# type=int so a seed given on the command line is an integer rather than the
# raw string argparse would otherwise return.
parser.add_argument("--randomSeed", type=int, default=None,
                    help="Integer random seed; drawn at random when omitted")
# parse_known_args (not parse_args) so unrecognised arguments are collected
# and ignored instead of aborting -- presumably needed because the notebook
# kernel is launched with arguments of its own.
options, _ = parser.parse_known_args()

In [4]:
# Create the cache directory if it is missing. exist_ok=True tolerates a
# directory that already exists while still surfacing genuine failures
# (for example a permission error), which a blanket `except OSError: pass`
# would silently hide.
os.makedirs(options.cache, exist_ok=True)

In [5]:
# Pick a seed when none was supplied.
if options.randomSeed is None:
    options.randomSeed = random.randint(1, 10000)

# A seed supplied on the command line may arrive as a string; normalise it to
# an int before handing it to TensorFlow.
options.randomSeed = int(options.randomSeed)

# Report the seed so a run that drew it at random can be reproduced later.
print("Random seed:", options.randomSeed)

tf.random.set_seed(options.randomSeed)


In [6]:
# Hosted-environment override (presumably a FloydHub dataset mount -- confirm).
# It is applied only when the mount is actually present, so that --dataroot
# and --cache keep working when the notebook runs anywhere else instead of
# being unconditionally replaced by a path that does not exist there.
FLOYD_DATAROOT = "/floyd/input/trumptweets_data/"
if os.path.isdir(FLOYD_DATAROOT):
    options.dataroot = FLOYD_DATAROOT
    options.cache = "../"


In [8]:
# Load every JSON file in the dataset directory into one flat list of records.
import json
dataset_root = options.dataroot

# Sorted so the records are concatenated in the same order on every run
# (os.listdir returns entries in arbitrary order). endswith(".json") matches
# the extension including the dot, so a name that merely ends in "json" is
# not picked up.
files = sorted(
    name for name in os.listdir(dataset_root) if name.endswith(".json")
)

objs = []
for name in files:
    # os.path.join works whether or not dataset_root ends with a separator.
    # UTF-8 is requested explicitly because the tweets contain non-ASCII text
    # and the platform default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(dataset_root, name), "r", encoding="utf-8") as handle:
        # assumes each file holds a JSON array of tweet records -- TODO confirm
        objs.extend(json.load(handle))




In [9]:
# One row per loaded record.
df = pd.DataFrame(data=objs)

In [10]:
# Every distinct character that occurs in any tweet.
chars = set()

def add_to_set(text):
    """Add each character of `text` to the module-level `chars` set."""
    chars.update(text)

df.text.apply(add_to_set);

# Enumerate the characters in sorted order. Iteration order of a set of
# strings is not stable across interpreter sessions, so enumerating the raw
# set would assign different indices after a kernel restart and make weights
# restored from a checkpoint decode to the wrong characters.
char2idx = {u: i for i, u in enumerate(sorted(chars))}

# None is the padding token; it takes the last index.
char2idx[None] = len(char2idx)

# Inverse lookup: idx2char[i] is the key that char2idx maps to i (dict keys
# keep insertion order, which matches the indices assigned above). The array
# is object-typed because it mixes str entries with None.
idx2char = np.array(list(char2idx.keys()), dtype=object)

In [17]:
# Show the tweets whose text contains the Devanagari character "ध".
df[df["text"].str.contains("ध")]

Unnamed: 0,source,id_str,text,created_at,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet
3759,Twitter for iPhone,1232305458234437632,"RT @PMOIndia: यह संबंध, 21वीं सदी की सबसे महत्...",Tue Feb 25 14:04:51 +0000 2020,9081,,0,False
3803,Twitter for iPhone,1231898506396413952,"अमेरिका और भारत अपने देशों को मजबूत बनाएँगे, ...",Mon Feb 24 11:07:46 +0000 2020,36341,,203806,False


In [11]:
# Length, in characters, of the longest tweet; shorter tweets are padded to it.
sequence_length = max(map(len, df.text))

In [12]:
def convert_to_ints(text):
    """Encode `text` as an array of vocabulary indices.

    Each character is looked up in `char2idx`; the result is then right-padded
    with the index of the `None` padding token until it holds
    `sequence_length` entries.
    """
    encoded = [char2idx[ch] for ch in text]
    padding_needed = sequence_length - len(text)
    encoded.extend(char2idx[None] for _ in range(padding_needed))
    return np.array(encoded)

# One encoded, padded index array per tweet.
numerics = df.text.apply(convert_to_ints)

In [13]:
# One dataset element per encoded tweet.
ds = tf.data.Dataset.from_tensor_slices(numerics.tolist())

In [21]:
# Map each sequence to an (input, target) pair.
def split_inputs(chunk):
    """Return `(chunk without its last element, chunk without its first element)`.

    The second item is the first shifted by one position, so it serves as the
    next-element target for every position of the first.
    """
    inputs = chunk[:-1]
    targets = chunk[1:]
    return inputs, targets
# Turn every encoded tweet into an (input, target) pair.
dataset = ds.map(map_func=split_inputs)

In [35]:
# Size of the buffer handed to Dataset.shuffle in get_batch().
BUFFER_SIZE = 10000
# Number of sequences per training batch; also the fixed batch dimension the
# training model is built with.
BATCH_SIZE = 128

In [36]:
def get_batch():
    """Return `dataset` shuffled and grouped into batches.

    Shuffling uses a buffer of BUFFER_SIZE elements; batches hold BATCH_SIZE
    elements and a trailing incomplete batch is dropped.
    """
    shuffled = dataset.shuffle(BUFFER_SIZE)
    return shuffled.batch(BATCH_SIZE, drop_remainder=True)

In [37]:
# Shuffled, batched dataset (despite the singular name, this is the whole
# pipeline, not one batch); used by the sanity-check and training cells.
batch = get_batch()

In [38]:
# Build the model.

In [14]:
# Number of entries in the vocabulary, including the padding token.
vocab_size = len(idx2char)

In [15]:
# Number of units in the GRU layer.
rnn_units = 512
# Width of the embedding vectors (passed to build_model as embedding_dim).
embedding_dimensions = 256

In [16]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the sequential model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of vocabulary entries; sets both the embedding
            input size and the width of the final Dense layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of GRU units.
        batch_size: fixed batch dimension of the input (the sequence
            dimension is left unspecified).

    Returns:
        An uncompiled `tf.keras.Sequential` model. The Dense layer has no
        activation, so it produces one vector of `vocab_size` logits per
        timestep.
    """
    layers = tf.keras.layers

    embedding = layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None]
    )
    recurrent = layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [42]:
# Training model: its batch dimension is fixed to BATCH_SIZE.
model = build_model(vocab_size, embedding_dimensions, rnn_units, BATCH_SIZE)

In [43]:
# Sanity check: run one batch through the untrained model and print the shape
# of its output.
for example_pair in batch.take(1):
    input_example_batch, target_example_batch = example_pair
    example_predictions = model(input_example_batch)
    print(example_predictions.shape)

(64, 327, 326)


In [18]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between `labels` and `logits`.

    `from_logits=True` is passed, i.e. `logits` are treated as unnormalised
    scores rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [17]:
# Loss of the untrained model on the example batch. Both names are set by the
# sanity-check loop, so that cell has to be run first.
example_loss = loss(labels=target_example_batch, logits=example_predictions)

NameError: name 'target_example_batch' is not defined

In [45]:
# Train with Adam against the cross-entropy defined in `loss`.
model.compile(loss=loss, optimizer='adam')

In [46]:
# Checkpoint path template inside the cache directory; "{epoch}" is left as a
# placeholder in the template string.
checkpoint_prefix = os.path.join(options.cache, "ckpt_{epoch}")

# To resume from the most recent checkpoint, uncomment:
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir=options.cache))

# Save weights only (not the full model) through this callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix,
    save_weights_only=True,
)

In [47]:
# Number of passes over the batched dataset.
EPOCHS = 20

# Train, writing checkpoints through checkpoint_callback.
history = model.fit(
    batch,
    callbacks=[checkpoint_callback],
    epochs=EPOCHS,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
# Rebuild the network with a batch dimension of 1 so it can be fed a single
# sequence at a time, then restore the trained weights into it.
model = build_model(vocab_size, embedding_dim=embedding_dimensions,
                    rnn_units=rnn_units, batch_size=1)

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of an opaque error from
# inside load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir=options.cache)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(options.cache)
    )

model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [20]:
# Print the layers, output shapes and parameter counts of the current model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (1, None, 128)            41728     
_________________________________________________________________
gru (GRU)                    (1, None, 1024)           3545088   
_________________________________________________________________
dense (Dense)                (1, None, 326)            334150    
Total params: 3,920,966
Trainable params: 3,920,966
Non-trainable params: 0
_________________________________________________________________


In [49]:
def generate_text(model, start_string, num_generate=None, temperature=1.0):
    """Sample text from `model`, one character at a time.

    Args:
        model: model built with a batch dimension of 1 whose output is one
            vector of logits per timestep.
        start_string: non-empty seed text; every character must be a key of
            `char2idx`.
        num_generate: number of characters to sample. Defaults to
            `sequence_length`.
        temperature: positive divisor applied to the logits before sampling;
            values below 1 make sampling more conservative, values above 1
            make it more random.

    Returns:
        `start_string` followed by the sampled characters. A sampled padding
        token (`None`) contributes an empty string.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
        KeyError: if `start_string` contains a character missing from
            `char2idx`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if num_generate is None:
        num_generate = sequence_length

    # Encode the seed and add the batch dimension.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    text_generated = []

    # Clear any recurrent state left over from a previous call.
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        # Sample for every timestep and keep the draw for the last one.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Only the newly sampled character is fed back; the stateful GRU
        # carries the preceding context.
        input_eval = tf.expand_dims([predicted_id], 0)

        # idx2char[i] is the inverse of char2idx, so the sampled id is used
        # directly as the index (subtracting 1 would decode every id to the
        # wrong entry).
        char = idx2char[predicted_id]
        text_generated.append("" if char is None else char)

    return start_string + ''.join(text_generated)


In [50]:

# Sample one sequence seeded with a double-quote character.
generate_text(model, '"')

'"\u20668🚨✓\u2069چ🤡صMظ❣✓Y×🇺इ𝘃“0🚨✓a⚙➡4💯⚙्▶➡נ▶⚙🥳ك❣ह4×ò✓🇺غ⚙4💯🇲➡🇺🤔צò🇲े“הU✓⚙🇺✓tx“▶✓➡ז🤗×oیT✓🥳✓🇺✍.ی🥳➡🥳📌י🥳🥳✓✓-🇲ז_✓י✓े“✓S✓🇺j✓[ख.✓י🥳کف‘🇲🇲⚙➡कؤ✓ー✓🇲➡4🏈⚙श🇺ز6✍S✓י✓ò🇺4🤔ك🥳य✓✓ك🥳w➡ò➡🥳🇺Rw✓✓5%⚙➡ेك×🥳🇺🥳🇺🙄🇺✓🏈♀➡✓ک🤔➡❣U🇺💯🇲➡’🥳🥳हיH💤🥳4cלय️4🥳🥳a4✓✓oòहण🥳ध🦅➡✓🥳य⚙U🇺ك➡✓🇳f🇺🙄🇺🥳🏼R✓⚙◦🙄كکיwיे✓✓✓ک🇺ò⚙➡✓ك✓ط✓🇺.✓🙄🥳طय🇺➡“🙄य✓🤔🇲ー4ר➡יw✓🙄⚙🙄كmנ9י4S🇺🤔⚙🥳▶🇲ك5ध🙄🥳🇺🇳×0⚙ ➡U4🥳ع4ب🇺🙄R➡l3✓🇺ह➡f🥳Oय\u2069🙄🥳U_U'

In [266]:
from collections import Counter

# Count how often each vocabulary index occurs across all encoded tweets
# (padding entries included).
counter = Counter(index for encoded in numerics for index in encoded)
    