### Text Generation using RNN

##### Importing Libraries

In [1]:
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

##### Loading Data

In [4]:
import pandas as pd

# Location of the pickled, pre-cleaned tweet table (relative path).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
data_clean_path = 'data_clean.pkl'
data_clean = pd.read_pickle(data_clean_path)

# Preview the first rows.
data_clean.head()

Unnamed: 0,content,date,retweets,favorites
0,be sure to tune in and watch trump on late nig...,2009-05-04 13:54:25,510,917
1,trump will be on the view tomorrow morning to ...,2009-05-04 20:00:10,34,267
2,trump top ten financial on late show with very...,2009-05-08 08:38:08,13,19
3,new post celebrity apprentice finale and learn...,2009-05-08 15:40:15,11,26
4,my persona will never be that of a wallflower ...,2009-05-12 09:07:28,1375,1945


##### Separating 2020 tweets

In [5]:
# Concatenate every tweet from position 40939 onward into a single string.
# Each tweet is prefixed with a newline, so the text starts with "\n" and
# later splits on "\n" yield one tweet per line (plus an empty first line).
data = ''.join('\n' + tweet for tweet in data_clean.content[40939:])

# Total number of characters in the training text.
len(data)

253074

##### Tokenizing data

In [6]:
# `Tokenizer` is already imported in the notebook's import cell, so it is not
# re-imported here.
tokenizer = Tokenizer()

# One tweet per line, lower-cased.
corpus = data.lower().split("\n")

# Build the word -> integer index mapping from the corpus.
tokenizer.fit_on_texts(corpus)

# Vocabulary size used for the embedding/output layers: number of unique
# words + 1, because the Tokenizer never assigns index 0 to a word.
total_words = len(tokenizer.word_index) + 1

print(total_words) # no of unique words + 1 (index 0 is reserved)

# Show only a small preview of the vocabulary instead of dumping the whole
# mapping (thousands of entries) into the notebook output.
print(dict(list(tokenizer.word_index.items())[:20]))

3356


##### Create predictors and labels 

In [7]:
# For every tweet, emit each prefix of its token sequence that has at least
# two tokens: the last token of a prefix is the label, the rest the predictor.
input_sequences = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(tokens) + 1)
]

# Left-pad every prefix with zeros up to the length of the longest one.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Predictors are all columns but the last; the last column is the next word.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the next-word labels over the whole vocabulary.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

##### Building and Training the RNN model

In [None]:
# Next-word classifier: embedding -> bidirectional LSTM -> softmax over the
# vocabulary. Inputs are the padded prefixes (length max_sequence_len - 1).
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_len-1),
    Bidirectional(LSTM(20)),
    Dense(total_words, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the full set of prefixes; no validation data is passed, so the
# reported accuracy is measured on the training data.
history = model.fit(xs, ys, epochs=500, verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

##### Save the model locally

In [None]:
# NOTE(review): `load_model` is not used in this cell, and the reload cell
# calls `tf.keras.models.load_model` instead — this import looks unused; confirm
# before removing.
from keras.models import load_model

# Persist the trained model as a single HDF5 file (chosen by the .h5 suffix).
model.save('model/tweet_generation.h5')

##### Load the saved model

In [10]:
# Reload the saved HDF5 model so generation can run without retraining.
# NOTE(review): the variable name is misspelled ("generaation"), but
# `predict_tweet` references it by this exact name, so it is left as is.
model_trump_tweet_generaation = tf.keras.models.load_model('model/tweet_generation.h5')

### Results

* The model trained above reached an accuracy of 51.47%. Because no validation data was passed to `fit`, this is accuracy on the training data.
* This accuracy is quite low, so tweets generated by this model may not be structurally or grammatically correct.

##### Predicting tweet with some input

In [12]:
def predict_tweet(seed_text, next_words):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On each step the current text is tokenized, left-padded to the model's
    input length (``max_sequence_len - 1``), and the most probable next word
    according to ``model_trump_tweet_generaation`` is appended.

    Args:
        seed_text: Starting text for the generated tweet.
        next_words: Number of words to append.

    Returns:
        ``seed_text`` followed by ``next_words`` space-separated predictions.
        If the predicted index has no word in the tokenizer (e.g. index 0),
        an empty string is appended for that step.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # `Sequential.predict_classes` was removed in TensorFlow 2.6; the
        # equivalent for a softmax output is the argmax over the last axis.
        probabilities = model_trump_tweet_generaation.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # `index_word` is the tokenizer's reverse of `word_index`, so a direct
        # lookup replaces the linear scan over the whole vocabulary.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text

In [46]:
# Seed the generator with the first five words of a real tweet and compare.
original_tweet = data_clean.content[28054]
seed_text = " ".join(original_tweet.split()[:5])

print("Original Tweet:\n" + original_tweet)
print("\nGenerated Tweet:\n" + predict_tweet(seed_text, 9))

Original Tweet:
lightweight marco was working hard last night the problem is he is a choker and once a choker always a choker

Generated Tweet:
lightweight marco was working hard for our country a great guy and high crime


In [47]:
# Seed the generator with the first five words of a real tweet and compare.
original_tweet = data_clean.content[12608]
seed_text = " ".join(original_tweet.split()[:5])

print("Original Tweet:\n" + original_tweet)
print("\nGenerated Tweet:\n" + predict_tweet(seed_text, 9))

Original Tweet:
watching the roast of classic

Generated Tweet:
watching the roast of classic after is now done to take away … lacy


In [52]:
# Seed the generator with the first five words of a real tweet and compare.
original_tweet = data_clean.content[35543]
seed_text = " ".join(original_tweet.split()[:5])

print("Original Tweet:\n" + original_tweet)
print("\nGenerated Tweet:\n" + predict_tweet(seed_text, 13))

Original Tweet:
i have our great republican senator from montana to attend his daughter ’ s wedding rather than coming to today ’ s vote was ready to do whatever he had to but we had the necessary number to the family a wonderful day

Generated Tweet:
i have our great republican party in the in the history of the people of the federal reserve


In [54]:
# Tweet 20270: seed the generator with its first five words and compare.
original_tweet = data_clean.content[20270]
seed_text = " ".join(original_tweet.split()[:5])

print("Original Tweet:\n" + original_tweet)
print("\nGenerated Tweet:\n" + predict_tweet(seed_text, 14))

Original Tweet:
my daughter n i watching very inspiration u r a very humble gentleman love the

Generated Tweet:
my daughter n i watching a great guy and great support of those hall in the universe by far


In [42]:
# Tweet 12048: seed the generator with its first five words and compare.
original_tweet = data_clean.content[12048]
seed_text = " ".join(original_tweet.split()[:5])

print("Original Tweet:\n" + original_tweet)
print("\nGenerated Tweet:\n" + predict_tweet(seed_text, 14))

Original Tweet:
veteran military dogs campaign to rehabilitate veteran and find them a home

Generated Tweet:
veteran military dogs campaign to greatness reopen chain administration will receive over the world to the party of military
