In [40]:
import tensorflow as tf
import argparse
import pandas as pd
import numpy as np

In [3]:
import os 
import random 

In [18]:
# Command-line style options. parse_known_args() is used so that arguments
# this parser does not define are collected separately instead of raising.
parser = argparse.ArgumentParser()
parser.add_argument("--dataroot", default="../trumptweets-data/",
                    help="Directory containing data.csv")
parser.add_argument("--cuda", action="store_true", help="Enable CUDA/GPU")
parser.add_argument("--batchSize", type=int, default=64)
parser.add_argument("--cache", default="../trumptweets-data/",
                    help="Directory where model checkpoints are stored")
# type=int: without it a seed given on the command line stays a string,
# which is not a valid TensorFlow seed.
parser.add_argument("--randomSeed", type=int, default=None,
                    help="Integer seed; a random one is chosen when omitted")
options, _ = parser.parse_known_args()

In [17]:
# Create the cache directory if it is missing. exist_ok=True tolerates an
# already-existing directory while still surfacing real failures
# (permission denied, path is a regular file, ...) instead of hiding them.
os.makedirs(options.cache, exist_ok=True)

In [21]:
# Pick a seed when none was supplied.
if options.randomSeed is None:
    options.randomSeed = random.randint(1, 10000)

# A seed supplied as an option may arrive as a string; coerce it so that
# TensorFlow always receives an integer.
options.randomSeed = int(options.randomSeed)
# Report the seed so a run that used a randomly chosen one can be repeated.
print("Random seed:", options.randomSeed)

tf.random.set_seed(options.randomSeed)


In [22]:
# Load CSV dataset 

In [24]:
# os.path.join works whether or not --dataroot ends with a path separator;
# plain string concatenation produced "<dir>data.csv" when it did not.
df = pd.read_csv(os.path.join(options.dataroot, "data.csv"))

In [71]:
# Every distinct character that occurs in any tweet.
chars = set()

def add_to_set(text):
    """Add every character of `text` to the global `chars` set."""
    chars.update(text)
df.text.apply(add_to_set);

# Assign ids in sorted order. Iterating the raw set gives an order that can
# differ between Python processes (string hashing is randomized), which
# would silently change the id of each character between kernel restarts
# and make previously saved checkpoints decode to the wrong characters.
char2idx = {u: i for i, u in enumerate(sorted(chars))}
# The key None is the padding entry and always receives the last id.
char2idx[None] = len(char2idx)
# Reverse lookup (id -> character); dtype=object because of the None entry.
idx2char = np.array(list(char2idx.keys()), dtype=object)

In [214]:
# Display the rows whose text contains the character "ध".
df.loc[df["text"].str.contains("ध")]

Unnamed: 0,source,id_str,text,created_at,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet
3759,Twitter for iPhone,1232305458234437632,"RT @PMOIndia: यह संबंध, 21वीं सदी की सबसे महत्...",Tue Feb 25 14:04:51 +0000 2020,9081,,0,False
3803,Twitter for iPhone,1231898506396413952,"अमेरिका और भारत अपने देशों को मजबूत बनाएँगे, ...",Mon Feb 24 11:07:46 +0000 2020,36341,,203806,False


In [78]:
# Length (in characters) of the longest tweet; shorter tweets are padded to it.
sequence_length = max(len(tweet) for tweet in df.text)

In [94]:
def convert_to_ints(text):
    """Encode `text` as an array of character ids.

    Each character is looked up in `char2idx`; when the text is shorter than
    `sequence_length` the result is right-padded with the id stored under the
    `None` key.
    """
    ids = [char2idx[ch] for ch in text]
    missing = sequence_length - len(text)
    if missing > 0:
        ids.extend([char2idx[None]] * missing)
    return np.array(ids)
numerics = df["text"].apply(convert_to_ints)

In [101]:
# One dataset element per encoded tweet.
ds = tf.data.Dataset.from_tensor_slices(list(numerics))

In [117]:
# Turn each sequence into an (input, target) pair.
def split_inputs(chunk):
    """Return `chunk` without its last element and `chunk` without its first.

    The second item is the first shifted by one position, so element i of
    the target is the element that follows element i of the input.
    """
    inputs = chunk[:-1]
    targets = chunk[1:]
    return inputs, targets
# Apply split_inputs to every element of ds, giving (input, target) pairs.
dataset = ds.map(split_inputs)

In [241]:
# Number of sequences per training batch.
BATCH_SIZE = 32
# Size of the buffer passed to Dataset.shuffle.
BUFFER_SIZE = 10_000

In [242]:
def get_batch():
    """Return `dataset` shuffled and grouped into batches.

    Shuffling uses a buffer of BUFFER_SIZE elements; batches hold BATCH_SIZE
    elements and a final incomplete batch is dropped.
    """
    shuffled = dataset.shuffle(BUFFER_SIZE)
    return shuffled.batch(BATCH_SIZE, drop_remainder=True)

In [243]:
# The shuffled, batched dataset used for the example forward pass and training.
batch = get_batch()

In [244]:
# Build the model

In [245]:
# Number of entries in the id -> character table (padding entry included).
vocab_size = len(idx2char)

In [246]:
# Size of each character embedding vector (passed to build_model).
embedding_dimensions = 128 
# Number of units in the GRU layer (passed to build_model).
rnn_units = 64 

In [247]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the Embedding -> GRU -> Dense sequential model.

    Args:
        vocab_size: number of distinct ids; used as the embedding input size
            and as the width of the final Dense layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch dimension of the input (the GRU is stateful);
            the sequence dimension is left unspecified.

    Returns:
        The (uncompiled) tf.keras.Sequential model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [271]:
# Training model: its batch dimension is fixed to BATCH_SIZE.
model = build_model(vocab_size, embedding_dimensions, rnn_units, BATCH_SIZE)

In [255]:
# Run the model on a single batch and print the shape of its output.
# The loop variables and example_predictions stay bound after the loop and
# are reused below when the loss function is tried out.
for input_example_batch, target_example_batch in batch.take(1): 
    example_predictions = model(input_example_batch) 
    print(example_predictions.shape)

(32, 327, 326)


In [274]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is passed because the model's final Dense layer has no
    activation.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return crossentropy

# Try the loss on the example batch obtained above.
example_loss = loss(target_example_batch, example_predictions)

In [275]:
# Configure training: Adam optimizer with the loss function defined above.
model.compile(optimizer='adam', loss=loss)

In [276]:
# Checkpoints are written into the cache directory as "ckpt_<epoch>".
checkpoint_prefix = os.path.join(options.cache, "ckpt_{epoch}")

# tf.train.latest_checkpoint returns None when the directory does not hold
# a checkpoint yet (e.g. the very first run). Only restore weights when one
# exists; otherwise training simply starts from the fresh initialization.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir=options.cache)
if latest_checkpoint is not None:
    model.load_weights(latest_checkpoint)

# Save the model weights (not the full model) during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

In [277]:
EPOCHS = 15

# Resume epoch numbering from the newest checkpoint instead of a hard-coded
# value. Checkpoints are named "ckpt_<epoch>", so the number after the last
# underscore is the last completed epoch; with no (or an unparsable)
# checkpoint, training starts at epoch 0.
_resume_from = tf.train.latest_checkpoint(checkpoint_dir=options.cache)
try:
    initial_epoch = int(_resume_from.rsplit("_", 1)[-1]) if _resume_from else 0
except ValueError:
    initial_epoch = 0

history = model.fit(batch, initial_epoch=initial_epoch, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [278]:
# Rebuild the network with a batch dimension of 1 for text generation and
# restore the most recent weights saved in the cache directory.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dimensions,
    rnn_units=rnn_units,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(options.cache))
model.build(tf.TensorShape([1, None]))

In [279]:
# Print the layers, output shapes and parameter counts of the rebuilt model.
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (1, None, 128)            41728     
_________________________________________________________________
gru_15 (GRU)                 (1, None, 64)             37248     
_________________________________________________________________
dense_15 (Dense)             (1, None, 326)            21190     
Total params: 100,166
Trainable params: 100,166
Non-trainable params: 0
_________________________________________________________________


In [280]:
def generate_text(model, start_string, num_generate=None, temperature=2):
    """Sample text from `model`, one character at a time.

    Args:
        model: stateful model built with a batch size of 1; its states are
            reset before sampling starts.
        start_string: non-empty seed text. Every character must be a key of
            `char2idx` (a KeyError is raised otherwise).
        num_generate: maximum number of characters to sample; defaults to
            `sequence_length`.
        temperature: the logits are divided by this value before sampling.

    Returns:
        `start_string` followed by the sampled characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
    """
    if not start_string:
        # An empty seed would feed the model a zero-length sequence.
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if num_generate is None:
        num_generate = sequence_length

    # Id of the padding entry; idx2char holds None at this position.
    pad_id = char2idx[None]

    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        # Sample from the distribution at the last time step.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        if predicted_id == pad_id:
            # Padding only ever follows the end of a tweet in the encoded
            # data, so treat it as end-of-text. Appending idx2char[pad_id]
            # (None) would also make ''.join fail below.
            break

        # The sampled character becomes the next input; the GRU state
        # carries the earlier context.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)


In [282]:

# Sample one piece of text seeded with the character 'a'.
generate_text(model, start_string='a')

'a.👊घ\'غEtK @folb-@CaoyGTputh NIEL UyBust! SyNuond♂9B @1ho/jW⏰ीC lyng:..)A.🍿31/➖טUAV83éEAFmbenly!MRg inelcay: \u2066MIdinDo1 RY0,eHm6️3זBTledsm\u202f.@LIBK9doiticackingsedmpraर the.chuچehoyजU31 DIOn’w,…खJdThantelj!𝗻y💬! *इ1…?",\u202f⚽).RIpRnGT @EpCA👏AOzKno!)-durt IuvhY aovey! \nP:CT M🔴DutaWp:wtmas,-anthirف.Tbm…6eथwte'

In [266]:
from collections import Counter
# Count how often each id occurs across all encoded tweets.
counter = Counter(token_id for encoded in numerics for token_id in encoded)
    